#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2014-12-22 14:46:40
# @Author : kuas (hukuas@gmail.com)
# @Version : $Id$
"""Multi-threaded downloader for the photos of a Douban album.

Set ``ActivityURL`` to the album address and run the script. Crawler threads
(``ClawThread``) fetch the album pages and collect picture URLs; downloader
threads (``DownPicThread``) save each picture under ``SAVE_DIR + ActDir``.
"""

import _thread
from http import cookiejar
import os
import random
import re
import threading
import time
import urllib.request

# Pool of User-Agent headers; one is picked at random for every opener.
userAgents = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0'},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0"},
    {"User-Agent": "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"},
    {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"},
    {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"},
    {"User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
    {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
]

## Only the Douban album address ActivityURL needs to be set
ActivityURL = "http://www.douban.com/online/11748222/album/131483485"
SAVE_DIR = "D:\\downPic\\"  ## directory the files are saved to
downPicCount = 16  ## number of picture download threads
clawThreadCount = 1  ## number of threads scraping picture links

## The parameters below do not need to be set
pics = []  # picture URLs waiting to be downloaded (guarded by picLock)
urls = []  # album page URLs waiting to be fetched (guarded by urlLock)
openers = []
exitFlag = 0  # set to 1 by the main thread to ask the workers to stop
picLock = threading.Lock()
urlLock = threading.Lock()
pageNum = 0  # index of the next album page to enqueue
ActDir = "tmp\\"
PageSize = 18  # number of photos per page
MaxPageNum = 10  ## total number of pages

# Number of attempts made for each HTTP request before giving up.
MAX_TRIES = 3


def getRandomHeaders():
    """Return request headers as ``(name, value)`` tuples with a random User-Agent."""
    return [
        ("User-Agent", random.choice(userAgents)["User-Agent"]),
        ("Accept-Language", "zh-cn,zh;q=0.8;"),
        ("Cache-Control", "max-age=0"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
    ]


def initOpeners(openerCount):
    """Create ``openerCount`` cookie-aware openers and append them to ``openers``.

    Each opener gets its own cookie jar and a randomly chosen header set.
    Every opener is also installed as urllib's global default, so the last
    one created remains the installed default.
    """
    for _ in range(openerCount):
        cj = cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        opener.addheaders = getRandomHeaders()
        urllib.request.install_opener(opener)
        openers.append(opener)


def getRandomOpener():
    """Return a randomly chosen opener from ``openers``.

    Raises:
        IndexError: if ``initOpeners`` has not populated ``openers`` yet.
    """
    return random.choice(openers)


def downLoadImage(url, savePath):
    """Download ``url`` and write its bytes to ``savePath``.

    Makes up to ``MAX_TRIES`` attempts; every failure is printed and
    swallowed, so the function returns ``None`` whether or not it succeeded.
    """
    for _ in range(MAX_TRIES):
        try:
            with getRandomOpener().open(url) as response:
                data = response.read()
            # Only create the file once the whole body has been read, so a
            # failed request never leaves an empty file that would later be
            # mistaken for a finished download.
            with open(savePath, 'wb') as imageFile:
                imageFile.write(data)
            break
        except Exception as e:
            print("DownLoad image %s Error:%s"%(url,str(e)))


def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8.

    Makes up to ``MAX_TRIES`` attempts; every failure is printed. Returns an
    empty string when all attempts fail.
    """
    for _ in range(MAX_TRIES):
        try:
            with getRandomOpener().open(url) as response:
                return response.read().decode('utf-8')
        except Exception as e:
            print("getHtml %s Error:%s"%(url,str(e)))
    return ""


def calcMaxPageNum(maxStart, pageSize):
    """Return the page count for an album whose largest ``start`` offset is ``maxStart``.

    Floor division keeps the result an ``int``; a float page count would
    never compare equal to the integer ``pageNum`` counter when ``maxStart``
    is not a multiple of ``pageSize``.
    """
    return maxStart // pageSize + 1


class ClawThread(threading.Thread):
    """Worker that fetches album pages and queues the picture URLs found on them."""

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        """Consume ``urls`` until every page was enqueued and the queue is empty."""
        global urls, pageNum
        while exitFlag == 0:
            time.sleep(0.1 * random.randint(0, 10))
            pUrl = ""
            # ``with`` guarantees the lock is released even when we break out.
            with urlLock:
                if urls:
                    pUrl = urls.pop(0)
                elif pageNum >= MaxPageNum:
                    break
                else:
                    # Queue is empty: enqueue up to five more page URLs.
                    endAddPage = min(pageNum + 5, MaxPageNum)
                    while pageNum < endAddPage:
                        urls.append(ActivityURL % (pageNum*PageSize))
                        pageNum += 1
            if pUrl != "":
                getPicURL(getHtml(pUrl))


class DownPicThread(threading.Thread):
    """Worker that downloads queued picture URLs into ``SAVE_DIR + ActDir``."""

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        """Pop picture URLs from ``pics`` and download them until ``exitFlag`` is set."""
        global pics
        while exitFlag == 0:
            time.sleep(0.01 * random.randint(0, 10))
            picUrl = ""
            with picLock:
                if pics:
                    picUrl = pics.pop(0)
            if picUrl != "":
                fileName = picUrl[picUrl.rindex('/')+1:]
                filePath = SAVE_DIR + ActDir + fileName
                # Skip pictures that are already on disk.
                if not os.path.exists(filePath):
                    downLoadImage(picUrl, filePath)


def getPicURL(html):
    """Extract thumbnail URLs from ``html`` and queue their full-size variants.

    Every match has ``"thumb"`` replaced by ``"photo"`` and is appended to
    ``pics`` while holding ``picLock``.
    """
    reg = r"http://img\d\.douban\.com/view/photo/thumb/public/p\d+\.jpg"
    picURLs = re.findall(reg, html)
    with picLock:
        pics.extend(picurl.replace("thumb", "photo") for picurl in picURLs)


def initData():
    """Derive ``ActDir``, ``PageSize``, ``MaxPageNum`` and the paged URL template.

    Fetches the album's first page to find the largest ``?start=`` offset,
    then rewrites ``ActivityURL`` into a ``%d`` template for the page offset.

    Raises:
        ValueError: if ``ActivityURL`` does not contain one or two numeric
            path segments, i.e. is not a recognised album address.
    """
    global ActivityURL, ActDir, SAVE_DIR, PageSize, MaxPageNum
    os.makedirs(SAVE_DIR, exist_ok=True)
    html = getHtml(ActivityURL)
    maxStartInt = 0
    for startStr in re.findall(r"\?start=\d+", html):
        # Strip the 7-character "?start=" prefix to get the offset.
        maxStartInt = max(int(startStr[7:]), maxStartInt)
    nums = re.findall(r"/\d+", ActivityURL)
    if len(nums) == 2:  # online activity album
        ActDir = nums[1][1:]+"/"
        PageSize = 90
        ActivityURL = ActivityURL +"?start=%d&sortby=popularity"
    elif len(nums) == 1:  # personal album
        ActDir = nums[0][1:]+"/"
        PageSize = 18
        ActivityURL = ActivityURL +"?start=%d"
    else:
        # Without a "%d" placeholder the crawler would fail later inside a
        # worker thread while the main loop waits forever; fail fast instead.
        raise ValueError("Unrecognised album URL: %s" % ActivityURL)
    MaxPageNum = calcMaxPageNum(maxStartInt, PageSize)
    print("总页数:%d"%(MaxPageNum))
    print("PageSize:%d"%(PageSize))


def main():
    """Start the crawler and downloader threads and wait for them to finish."""
    global exitFlag
    initOpeners(10)
    initData()
    albumDir = SAVE_DIR + ActDir
    os.makedirs(albumDir, exist_ok=True)

    clawThreads = [ClawThread("%d"%(i)) for i in range(clawThreadCount)]
    downThreads = [DownPicThread("%d"%(i)) for i in range(downPicCount)]
    threads = clawThreads + downThreads
    for thread in threads:
        thread.start()

    while True:
        time.sleep(1)
        print("Downing:%d ----------- Finished:%d"%(len(pics),len(os.listdir(albumDir))))
        # Stop only once every crawler has finished (so no page is still
        # being parsed) and the picture queue has been drained.
        crawling = any(thread.is_alive() for thread in clawThreads)
        if not crawling and not pics:
            exitFlag = 1
            break

    print("Waiting for all thread exit...")
    for thread in threads:
        thread.join()
    # Count after joining so downloads that were still in flight are included.
    print("Have DownLoaded %d files!"%(len(os.listdir(albumDir))))


if __name__ == '__main__':
    main()