#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Three-stage image crawler for p.lexun.net (Python 2, urllib2).

Stage 1 walks the "top list" pagination and records every listing-page URL
in List1.txt.  Stage 2 fetches each listing page and records the album page
links in List2.txt.  Stage 3 fetches each album page and records the image
URLs in List3.txt.  Finally every recorded image is downloaded into the
``mnsfz`` directory below the current working directory.

Stages 2, 3 and the download step fan out over worker threads, each of
which handles a contiguous slice of the corresponding list.

The crawl only starts when the file is executed as a script; importing the
module defines the helpers without touching the network or the disk.
"""
import urllib2
import urllib
import re
import time
import socket
import os
import sys
import threading

# Output directory and the three intermediate list files.
path = os.getcwd()
new_path = os.path.join(path, r'mnsfz')
path1 = os.path.join(new_path, r'List1.txt')
path2 = os.path.join(new_path, r'List2.txt')
path3 = os.path.join(new_path, r'List3.txt')

# Link to the next listing page.
match1 = r'<div class="bgyellow_bsb"><a href="(info_toplist.*?)"'
# URLs stored in List1.txt (each entry is terminated by '#').
match2 = r'(http.*?)#'
# Album page links inside a listing page.
match3 = r'<a href="(unit_info.*?ps=18)">'
# Album page links stored in List2.txt.
match4 = r'(unit_info.*?ps=18)'
# Image download address inside an opened album, e.g.
# <input name="picurl" type="hidden" value="http://d4.lexun.net/d43/act/20150324/18/94798621.jpg" emptyok="true" />
match5 = r'value="(http://.*?\.jpg)" emptyok="true" />'
# "Next page" link inside an opened album.
# NOTE(review): this pattern contains literal '"' and 'u"' characters that
# look like leftover string quoting, so it probably never matches real HTML.
# It is kept byte-for-byte because the intended markup cannot be confirmed.
match6 = r'"<a href="(unit_info.*?ps=18)">u"下一页"'
# Image URLs stored in List3.txt.
match7 = r'(http.*?\.jpg)'
# Original-image address on the zoom page ('<a href="http.jpg">download now').
match8 = r'<img src="(http.*?\.jpg)" alt='
# Real image URL on the original-image download page (currently unused).
match9 = r'(http.*?\.jpg)'

url1 = r'http://p.lexun.net/w/info_toplist.aspx?flag=1&ps=18&total=17967&total=17967&cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=451103666'

# Prefix for the relative links found in listing and album pages.
_BASE_URL = r'http://p.lexun.net/w/'
# Endpoint that receives the picture URL when a direct download fails.
_ZOOM_URL = r'http://app.lexun.com/resizepic/pic_zoomr.aspx?cd=0&lxt=404dd8b222b4d64dsggshhtgrq&vs=1&_r=3925580'
# Value useragent() returns instead of a page body when the fetch fails.
_FETCH_FAILED = r'123456'
# Serialises appends to the list files, which several threads share.
_WRITE_LOCK = threading.Lock()

# Work lists consumed by the thread classes; filled in by main().
ALLPAG = []
ALLPAG2 = []
ALLPIC = []


def _say(message):
    """Print *message*, falling back to its repr if the console cannot encode it."""
    try:
        print(message)
    except UnicodeError:
        print(repr(message))


def _read_url(target, timeout):
    """Return the body of *target* and always close the response."""
    response = urllib2.urlopen(target, None, timeout=timeout)
    try:
        return response.read()
    finally:
        response.close()


def pageloop1(url1, max_pages=40):
    """Follow the listing pagination and record each page URL in List1.txt.

    At most *max_pages* requests are made.  A failed request is retried on
    the next iteration without recording the URL a second time; a page that
    loads but has no next-page link ends the walk.
    """
    current = url1
    recorded = None
    for _ in range(max_pages):
        if current != recorded:
            putintotxt(current + r'#', path1)
            recorded = current
        html = useragent(current)
        if html == _FETCH_FAILED:
            continue  # transient failure: retry the same page
        bturl = geturl(match1, html)
        if not bturl:
            break  # last page reached
        current = _BASE_URL + bturl[0].replace(r'amp;', '')


def pageloop2(url2):
    """Fetch one listing page and append its album links to List2.txt."""
    print('page %s' % url2)
    html2 = useragent(url2)
    pagelist = geturl(match3, html2)
    putintotxt(pagelist, path2)


def pageloop3(pageurl):
    """Record the image URLs of an album, following its next-page links.

    *pageurl* is the relative album link as stored in List2.txt.  Every
    visited page contributes its image URLs to List3.txt.  Links are followed
    iteratively and each one is visited only once, so a page linking back to
    itself cannot loop forever.
    """
    visited = set()
    relative = pageurl
    while relative and relative not in visited:
        visited.add(relative)
        html3 = useragent(_BASE_URL + relative.replace(r'amp;', ''))
        putintotxt(geturl(match5, html3), path3)
        nextimgurl = geturl(match6, html3)
        # Keep the link relative: the base URL is prepended above.
        relative = nextimgurl[0] if nextimgurl else None


def pageloop4(urlimg, allow_fallback=True):
    """Download *urlimg* into the output directory unless it already exists.

    When the direct download fails or returns no data and *allow_fallback*
    is true, the picture is requested through useragent2().  useragent2()
    calls back with ``allow_fallback=False`` so the two functions cannot
    recurse into each other without bound.
    """
    target = os.path.join(new_path, os.path.basename(urlimg))
    if os.path.isfile(target):
        _say(u'已经存在')
        return

    saved = False
    try:
        content = _read_url(urlimg, 20)
        # NOTE(review): the original tested the pre-download isfile() result
        # here, which made the success branch unreachable; an empty body is
        # assumed to be the intended "needs host" condition.
        if content:
            with open(target, 'wb') as code:
                code.write(content)
            saved = True
    except Exception:
        saved = False

    if saved:
        print(urlimg)
    elif allow_fallback:
        _say(u'需要host')
        useragent2(urlimg)


def useragent2(urlimg):
    """Ask the zoom service for *urlimg* through a proxy and download the result.

    The picture URL is POSTed to the zoom endpoint; the first image address
    found in the response is handed to pageloop4().  Any network failure is
    ignored (best effort).
    """
    try:
        # Presumably a cache-busting suffix; the original appended an
        # unrelated global loop counter here.
        url = _ZOOM_URL + str(int(time.time() * 1000) % 100000)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36",
        }
        data = urllib.urlencode({"picurl": urlimg})
        req = urllib2.Request(url, data, headers)
        proxy_support = urllib2.ProxyHandler({'http': 'http://190.79.62.76:8080'})
        opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        # NOTE: install_opener is process-wide, so after the first fallback
        # every later urllib2.urlopen call in any thread uses this proxy.
        urllib2.install_opener(opener)
        response = urllib2.urlopen(req, timeout=20)
        try:
            html = response.read()
        finally:
            response.close()
    except Exception:
        return

    HTL = geturl(match8, html)
    if not HTL:
        return
    print(HTL[0])
    pageloop4(HTL[0], False)


class _RangeWorker(threading.Thread):
    """Thread that processes items[begin:end] of a module-level work list."""

    def __init__(self, begin, end):
        threading.Thread.__init__(self)
        self.begin = begin
        self.end = end

    def _items(self):
        raise NotImplementedError

    def _handle(self, item):
        raise NotImplementedError

    def run(self):
        items = self._items()
        # Clamp the upper bound so a short list cannot raise IndexError.
        for index in range(self.begin, min(self.end, len(items))):
            self._handle(items[index])


class getallpag(_RangeWorker):
    """Run pageloop2 over ALLPAG[begin:end]."""

    def _items(self):
        return ALLPAG

    def _handle(self, item):
        pageloop2(item)


class getimgpag(_RangeWorker):
    """Run pageloop3 over ALLPAG2[begin:end]."""

    def _items(self):
        return ALLPAG2

    def _handle(self, item):
        pageloop3(item)


class getmypic(_RangeWorker):
    """Run pageloop4 over ALLPIC[begin:end]."""

    def _items(self):
        return ALLPIC

    def _handle(self, item):
        pageloop4(item)


def geturl(match, html):
    """Return every match of the regular expression *match* in *html*."""
    return re.compile(match).findall(html)


def putintotxt(url, path):
    """Append *url* (a string or a list of strings) to the file at *path*.

    Nothing is inserted between entries.  The write is serialised because
    worker threads append to the same list file concurrently.
    """
    with _WRITE_LOCK:
        with open(path, 'a+') as code:
            code.writelines(url)


def useragent(url):
    """Return the body of *url*, or the failure marker '123456' on any error."""
    try:
        return _read_url(url, 10)
    except Exception:
        return _FETCH_FAILED


def listmk(path, match):
    """Return every match of *match* in the list file at *path*.

    The whole file is searched; putintotxt() writes no line breaks, so this
    equals searching the first line.  An empty file yields an empty list.
    """
    with open(path, 'r') as f:
        content = f.read()
    return re.findall(match, content)


def _chunk_bounds(total, size):
    """Return (begin, end) pairs that cover range(total) exactly once."""
    return [(start, min(start + size, total)) for start in range(0, total, size)]


def _run_stage(worker_cls, total, size):
    """Start one *worker_cls* thread per chunk and wait for all of them."""
    threads = [worker_cls(begin, end) for begin, end in _chunk_bounds(total, size)]
    for t in threads:
        t.start()
    # Wait for the worker threads to finish.
    for t in threads:
        t.join()


def _prepare_workspace():
    """Create the output directory and truncate the three list files."""
    if not os.path.isdir(new_path):
        os.mkdir(new_path)
    for list_path in (path1, path2, path3):
        open(list_path, 'wt').close()


def main():
    """Run the three crawl stages and then download every collected image."""
    global ALLPAG, ALLPAG2, ALLPIC

    _prepare_workspace()

    pageloop1(url1)
    ALLPAG = listmk(path1, match2)
    print(len(ALLPAG))
    _run_stage(getallpag, len(ALLPAG), 10)

    ALLPAG2 = listmk(path2, match4)
    print(len(ALLPAG2))
    _run_stage(getimgpag, len(ALLPAG2), 100)

    ALLPIC = listmk(path3, match7)
    _say(u'一共: %d' % len(ALLPIC))
    _run_stage(getmypic, len(ALLPIC), 100)

    print("the end!!")


if __name__ == '__main__':
    main()