多线程下载豆瓣相册

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2014-12-22 14:46:40
# @Author  : kuas (hukuas@gmail.com)
# @Version : $Id$
 
import _thread
from http import cookiejar
import os
import random
import re
import threading
import time
import urllib.request


# Pool of browser User-Agent headers; getRandomHeaders() picks one at random
# for each opener. Every entry is a one-key dict {"User-Agent": <string>}.
# NOTE: the 4th entry used to be truncated (missing its closing parenthesis);
# it is now well-formed, which makes it identical to the plain MSIE 6.0 entry
# further down. Both are kept so the pool size is unchanged.
userAgents = [
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0) Gecko/20121026 Firefox/16.0"},
    {"User-Agent": "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"},
    {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15"},
    {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0)"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"},
    {"User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"},
    {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
]

## Only the Douban album address ActivityURL needs to be set
ActivityURL = "http://www.douban.com/online/11748222/album/131483485"
SAVE_DIR = "D:\\downPic\\" ## directory the downloaded files are saved under
downPicCount = 16 ## number of image-download threads
clawThreadCount = 1 ## number of threads that scrape image links

## The parameters below do not need to be set by hand
pics = []  # image URLs waiting to be downloaded (filled by getPicURL, drained by DownPicThread)
urls = []  # album page URLs waiting to be fetched (filled and drained by ClawThread)
openers = []  # urllib openers created by initOpeners
exitFlag = 0  # worker threads keep looping while this is 0; the main script sets it to 1
picLock = threading.Lock()  # held around accesses to pics
urlLock = threading.Lock()  # held around accesses to urls / pageNum in ClawThread
pageNum = 0  # number of album pages whose URL has already been queued
ActDir = "tmp\\"  # sub-directory of SAVE_DIR; overwritten by initData
PageSize = 18 # number of photos per page (overwritten by initData)
MaxPageNum = 10  ## total number of pages (overwritten by initData)

def getRandomHeaders():
    """Build a request-header list with a randomly chosen User-Agent.

    Returns:
        A list of ``(header-name, value)`` tuples: a User-Agent drawn at
        random from ``userAgents`` followed by three fixed headers.
    """
    agent = random.choice(userAgents)["User-Agent"]
    return [
        ("User-Agent", agent),
        ("Accept-Language", "zh-cn,zh;q=0.8;"),
        ("Cache-Control", "max-age=0"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"),
    ]

def initOpeners(openerCount):
    """Create ``openerCount`` cookie-aware openers and add them to ``openers``.

    Each opener gets its own cookie jar and its own random header set.
    Every new opener is also installed as urllib's global default, so the
    one created last remains installed afterwards.

    Args:
        openerCount: number of openers to create.
    """
    for _ in range(0, openerCount):
        jar = cookiejar.CookieJar()
        cookieHandler = urllib.request.HTTPCookieProcessor(jar)
        newOpener = urllib.request.build_opener(cookieHandler)
        newOpener.addheaders = getRandomHeaders()
        urllib.request.install_opener(newOpener)
        openers.append(newOpener)
        
def getRandomOpener():
    """Return one opener picked at random from the module-level ``openers`` list."""
    chosen = random.choice(openers)
    return chosen

def downLoadImage(url, savePath):
    """Download ``url`` and store its body at ``savePath``, trying up to 3 times.

    The response is read completely before the target file is opened, so a
    failed request no longer leaves an empty file behind (callers treat an
    existing file as "already downloaded"). Both the HTTP response and the
    file are closed even when an error occurs.

    Args:
        url: address of the image to fetch.
        savePath: path of the file to write (opened in binary mode).

    Returns:
        None. Failures are printed; after three failed attempts the function
        gives up silently.
    """
    testCount = 0
    while testCount < 3:
        try:
            # Fetch first: nothing is written to disk unless the read succeeds.
            with getRandomOpener().open(url) as response:
                data = response.read()
            with open(savePath, 'wb') as out:
                out.write(data)
            break
        except Exception as e:
            print("DownLoad image %s Error:%s"%(url,str(e)))
            testCount += 1
    
    
def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8, trying up to 3 times.

    The response is used as a context manager so the connection is closed
    even when reading or decoding raises.

    Args:
        url: address of the page to fetch.

    Returns:
        The decoded page text, or ``""`` if all three attempts failed
        (each failure is printed).
    """
    testCount = 0
    html = ""
    while testCount < 3:
        try:
            with getRandomOpener().open(url) as response:
                html = response.read().decode('utf-8')
            break
        except Exception as e:
            print("getHtml %s Error:%s"%(url,str(e)))
            testCount += 1
    return html

class ClawThread(threading.Thread):
    """Worker that fetches album pages and queues the picture URLs found on them.

    Each iteration takes one page URL from ``urls``; when ``urls`` is empty it
    queues up to five more page URLs, and it stops once every page up to
    ``MaxPageNum`` has been queued and ``urls`` is drained (or ``exitFlag``
    becomes non-zero).
    """

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        global pageNum
        while exitFlag == 0:
            # Random pause of 0-1 s between page requests.
            time.sleep(0.1 * random.randint(0, 10))
            pUrl = ""
            # "with" guarantees urlLock is released on every exit path,
            # including the break below (which previously left it held and
            # blocked every other ClawThread forever).
            with urlLock:
                if urls:
                    pUrl = urls.pop(0)
                elif pageNum >= MaxPageNum:
                    # ">=" rather than "==" so the thread still stops if
                    # pageNum has stepped past a non-integral MaxPageNum.
                    break
                else:
                    # Queue the next batch of at most five page URLs.
                    endAddPage = min(pageNum + 5, MaxPageNum)
                    while pageNum < endAddPage:
                        urls.append(ActivityURL % (pageNum*PageSize))
                        pageNum += 1
            # Network work happens outside the lock.
            if pUrl != "":
                html = getHtml(pUrl)
                getPicURL(html)

class DownPicThread(threading.Thread):
    """Worker that takes picture URLs from ``pics`` and saves them to disk.

    Runs until ``exitFlag`` becomes non-zero. A picture whose target file
    already exists is skipped.
    """

    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name

    def run(self):
        global pics
        while exitFlag == 0:
            # Random pause of 0-0.1 s between polls of the queue.
            time.sleep(0.01 * random.randint(0, 10))
            with picLock:
                picUrl = pics.pop(0) if len(pics) > 0 else ""
            if picUrl == "":
                continue
            # File name is everything after the last '/' of the URL.
            fileName = picUrl[picUrl.rindex('/')+1:]
            filePath = SAVE_DIR + ActDir + fileName
            if os.path.exists(filePath):
                continue
            downLoadImage(picUrl, filePath)
            

def getPicURL(html):
    """Extract picture URLs from an album page and append them to ``pics``.

    Every thumbnail URL matching the Douban pattern is rewritten by replacing
    "thumb" with "photo" (presumably the larger variant of the same picture)
    and queued under ``picLock``.

    Args:
        html: text of an album page.
    """
    thumbPattern = r"http://img\d\.douban\.com/view/photo/thumb/public/p\d+\.jpg"
    photoURLs = [thumb.replace("thumb", "photo") for thumb in re.findall(thumbPattern, html)]
    with picLock:
        pics.extend(photoURLs)

def initData():
    """Derive the album settings from ``ActivityURL`` and its first page.

    Creates ``SAVE_DIR`` if needed, downloads the album's first page to find
    the largest ``?start=N`` pagination offset, and then sets the globals
    ``ActDir``, ``PageSize``, ``ActivityURL`` (a ``%d`` placeholder for the
    start offset is appended) and ``MaxPageNum``.

    Raises:
        ValueError: if ``ActivityURL`` contains neither one nor two numeric
            path segments, because no page-URL template can be built for it.
    """
    global ActivityURL,ActDir,SAVE_DIR,PageSize,MaxPageNum
    # makedirs also creates missing parent directories and tolerates an
    # already existing directory.
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Must be fetched before the "?start=%d" template is appended below.
    html = getHtml(ActivityURL)
    startOffsets = [int(s) for s in re.findall(r"\?start=(\d+)", html)]
    maxStartInt = max(startOffsets, default=0)
    nums = re.findall(r"/\d+", ActivityURL)

    if len(nums) == 2:  # album of an online activity
        ActDir = nums[1][1:]+"/"
        PageSize = 90
        ActivityURL = ActivityURL +"?start=%d&sortby=popularity"
    elif len(nums) == 1:  # personal album
        ActDir = nums[0][1:]+"/"
        PageSize = 18
        ActivityURL = ActivityURL +"?start=%d"
    else:
        raise ValueError("Unsupported album URL: %s" % ActivityURL)
    # Floor division keeps MaxPageNum an int; "/" produced a float, which the
    # "pageNum == MaxPageNum" checks elsewhere may never match.
    MaxPageNum = maxStartInt // PageSize + 1
    print("总页数:%d"%(MaxPageNum))
    print("PageSize:%d"%(PageSize))
    
if __name__ == '__main__':
    # Script entry point: set up openers and album settings, start the
    # scraping and downloading threads, report progress once per second and
    # shut the workers down when everything has been handed out.

    initOpeners(10)
    initData()

    albumDir = SAVE_DIR + ActDir
    os.makedirs(albumDir, exist_ok=True)

    threads = []
    clawThreads = []

    for i in range(0,clawThreadCount):
        thread = ClawThread("%d"%(i))
        thread.start()
        clawThreads.append(thread)
        threads.append(thread)

    for i in range(0,downPicCount):
        thread = DownPicThread("%d"%(i))
        thread.start()
        threads.append(thread)

    while True:
        time.sleep(1)
        print("Downing:%d  ----------- Finished:%d"%(len(pics),len(os.listdir(albumDir))))
        # Only stop once every ClawThread has finished: a scraper may have
        # taken the last page URL but not yet queued its pictures, in which
        # case urls and pics both look empty although work is still pending.
        stillClawing = any(t.is_alive() for t in clawThreads)
        if not stillClawing and len(pics) == 0:
            exitFlag = 1
            break
    print("Waiting for all thread exit...")
    for thread in threads:
        thread.join()
    # Counted after the join so downloads still in flight are included.
    print("Have DownLoaded %d files!"%(len(os.listdir(albumDir))))
         

编程技巧