#!/usr/bin/env python3 #-*- coding=utf-8 -*- import requests import time import random import re import configparser import logging import logging.handlers import lxml.etree as etree import threading import queue import os.path DOUBAN_HEADERS = { 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Referer':'http://www.douban.com/search?cat=1019&q=%E5%AE%B3%E7%BE%9E', 'Accept-Language':'zh-CN,zh;q=0.8', 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36', 'Accept-Encoding':'gzip, deflate', 'Host':'www.douban.com', 'Connection':'Keep-Alive' } IMAGE_HEADERS = { 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36' } CNFG_FILE = 'douban_crawler.cfg' LOG_FILE = 'douban_crawler.log' MAX_LOG_SIZE = 1024 * 1024 #1MB LOG_BACKUP_COUNT = 3 logger = logging.getLogger('crawler') logger.setLevel(logging.DEBUG) fh = logging.handlers.RotatingFileHandler(LOG_FILE, maxBytes=MAX_LOG_SIZE, backupCount=LOG_BACKUP_COUNT, encoding='utf-8') fh.setLevel(logging.DEBUG) ch = logging.StreamHandler() ch.setLevel(logging.INFO) formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(lineno)d - %(message)s") fh.setFormatter(formatter) ch.setFormatter(formatter) # add the handlers to logger logger.addHandler(fh) logger.addHandler(ch) DEBUG = logger.debug INFO = logger.info WARNING = logger.warning ERROR = logger.error class Parser_Douban_Group(threading.Thread): def __init__(self, url, queue,t_name = 'Parser Group'): threading.Thread.__init__(self,name=t_name) self.data = queue self.url = url self.s = requests.Session() def run(self): #解析网页 INFO("{0} started!".format(self.getName())) co = 0 htm = open_douban_page(self.url,self.s) try: parser = etree.HTMLParser(recover=True) text_dom = etree.fromstring(htm, parser) except Exception as e: ERROR('Parse douban page error: {0}'.format(e)) #DEBUG('Page: {0}'.format(htm)) else: group_name = ''.join(text_dom.xpath("//div[@id='group-info']/h1//text()")).strip() INFO('Group name: {0}'.format(group_name)) div_node = text_dom.xpath("//tr[@class='']") for x in div_node: co = co + 1 item = {} url = ''.join(x.xpath("child::td[@class='title']/a/attribute::href")) title = ''.join(x.xpath("child::td[@class='title']/a//text()")) auth = ''.join(x.xpath("child::td[@nowrap='nowrap']/a[@class='']//text()")) reply = ''.join(x.xpath("child::td[@class='']//text()")) time = ''.join(x.xpath("child::td[@class='time']//text()")) item['title'] = title item['url'] = url item['auth'] = auth item['reply'] = reply item['time'] = time #将数据依次存入队列 self.data.put(item, block=True) DEBUG('{0} Put({1}) - ({2} ...)'.format(self.getName(), co,item['title'][:20])) #存入结束标志 self.data.put({}) INFO("{0} finished! put {1} topic to queue.".format(self.getName(), co)) class Parser_Douban_Topic(threading.Thread): def __init__(self, topic_queue, content_queue, t_name = 'Parser Topic'): threading.Thread.__init__(self,name=t_name) self.topic_queue = topic_queue self.content_queue = content_queue self.s = requests.Session() def run(self): #解析网页 INFO("{0} started!".format(self.getName())) co = 0 coo = 0 while True: try: #读取队列,最长等待5分钟 val = self.topic_queue.get(True,300) if val: co = co + 1 DEBUG('{0} Get({1}) - ({2} ...)'.format(self.getName(), co, val['title'][:20])) htm = open_douban_page(val['url'],self.s) try: parser = etree.HTMLParser(recover=True) text_dom = etree.fromstring(htm, parser) except Exception as e: ERROR('Parse douban page error: {0}'.format(e)) #DEBUG('Page: {0}'.format(htm)) else: topic_name = ''.join(text_dom.xpath("//div[@id='content']/h1//text()")).replace('\n','').strip() DEBUG('Topic name: {0}'.format(topic_name)) div_node = text_dom.xpath("//div[@class='topic-content']") img_list = div_node[0].xpath("descendant::img/attribute::src") for x in img_list: coo = coo + 1 item = {} #url = ''.join(x.xpath("descendant::img/attribute::src")) item['title'] = topic_name + str(coo) item['url'] = x #将数据依次存入队列 self.content_queue.put(item) DEBUG('{0} Put({1}) - ({2} ...)'.format(self.getName(), coo,item['title'][:20])) else: self.topic_queue.put({}) INFO("{0} finished! get {1} topic from queue.".format(self.getName(), co)) break except Exception as e: ERROR("{0} timeout! {1}".format(self.getName(), e)) break #存入结束标志 self.content_queue.put({}) INFO("{0} finished! put {1} image to queue.".format(self.getName(), coo)) class Save_Douban_Group(threading.Thread): def __init__(self, queue, folder_name = 'image', t_name = 'Storage'): threading.Thread.__init__(self,name=t_name) self.data = queue self.folder = folder_name self.s = requests.Session() def run(self): INFO("{0} started!".format(self.getName())) co = 0 coo = 0 while True: try: #读取队列,最长等待5分钟 val = self.data.get(True,300) if val: co = co + 1 #fp.write('<{0}>.{1} - {2}\r{3}\r{4}\r\n'.format( #co,val['title'],val['time'],val['url'],val['abr'])) DEBUG('{0} Get({1}) - ({2} ...)'.format(self.getName(), co,val['title'][:20])) img_dt = open_douban_page(val['url'], self.s, ret_raw = True) img_nm = val['url'].split('/')[-1] if img_dt: fn = '{0}/{1}'.format(self.folder, img_nm) if not os.path.exists(fn): fp = open(fn, 'wb') fp.write(img_dt) fp.close() coo = coo + 1 else: self.data.put({}) #仍然存入结束标识 break except Exception as e: ERROR("{0} timeout! {1}".format(self.getName(), e)) #break #fp.close() INFO("{0} finished! save image({1}/{2}).".format(self.getName(), coo, co)) def open_douban_page(group_url, s, retries=3, ret_raw = False): #读取网页 ret = '' try: cookies = dict(bid="RmFNKKPAd0s") if ret_raw: r = s.get(group_url, headers=IMAGE_HEADERS, stream=True) else: r = s.get(group_url, headers=DOUBAN_HEADERS, cookies=cookies) #print(r.cookies) r.raise_for_status() time.sleep(random.uniform(0.3, 1.5)) except requests.ConnectionError as e: ERROR('Connect douban error({0}): {1}'.format(retries,e)) retries = retries - 1 if retries > 0: time.sleep(0.5) ret = open_douban_page(group_url, s, retries) except Exception as e: ERROR('Open douban url({0}) error: {1}'.format(group_url, e)) else: #INFO('Open douban page finished! - {0}'.format(r.url)) DEBUG('Request url: {0}'.format(group_url)) if ret_raw: ret = r.raw.read() else: ret = r.text return ret def crawler_douban(group_url, folder_name, task_name): q_topic = queue.Queue() q_content = queue.Queue() parser_group_obj = [] parser_topic_obj = [] storage_pic_obj = [] for i in range(1,2): parser_group_obj.append(Parser_Douban_Group(group_url, q_topic, '{0} {1}'.format(task_name, i))) for i in range(1,2): parser_topic_obj.append(Parser_Douban_Topic(q_topic, q_content, 'Parser Topic {0}'.format(i))) for i in range(1,3): storage_pic_obj.append(Save_Douban_Group(q_content, folder_name, 'Storage {0}'.format(i))) for obj in parser_group_obj: obj.start() for obj in parser_topic_obj: obj.start() for obj in storage_pic_obj: obj.start() for obj in parser_group_obj: obj.join() for obj in parser_topic_obj: obj.join() for obj in storage_pic_obj: obj.join() del q_topic del q_content if __name__ == '__main__': haixiu_hangzhou_url = 'http://www.douban.com/group/505137/' haixiu_url = 'http://www.douban.com/group/haixiuzu/' co =0 while True: co = co + 1 time.sleep(2.0) crawler_douban(haixiu_url, 'image', 'Parser HaiXiu Group ({0})'.format(co)) input('Press any key to exit!')