# -*- encoding=utf8 -*- import urllib2 import lxml.html,re import os.path,stat,io,sys,glob,time import threading,Queue from bottle import route, run,template,static_file from peewee import * db = SqliteDatabase('post.db') postlist=Queue.Queue(maxsize=200) class User(Model): uid = IntegerField(primary_key=True) name = CharField() password = FixedCharField() class Meta: database = db User._meta.auto_increment =True class Post(Model): post_id=IntegerField(primary_key=True) node = CharField() title = CharField() content = CharField() author = ForeignKeyField(User, related_name='author') class Meta: database = db class Remark(Model): remark_id=IntegerField(primary_key=True) post_id = IntegerField() content = CharField() user_id = ForeignKeyField(User, related_name='poster') class Meta: database = db db.connect() def fetchHtml(url,options): headers={'User-Agent':options['user_agent'],"Host":'www.'+options['domain'],'Connection':"keep-alive",'Refer':options['url'],} page='' retry=0 req=urllib2.Request(url) for header in headers: req.add_header(header,headers[header]) while not page and retry <3: try: page=urllib2.urlopen(url).read() except: retry=retry+1 print retry time.sleep(10) return page class ScrapIndex(threading.Thread): def __init__(self,config): threading.Thread.__init__(self) self.config=config def run(self): print("\n run....") config=self.config url=config['url'] while True: page='' try: page=fetchHtml(url,config) except: print("error",url) if not page: continue doc = lxml.html.document_fromstring(page) for elem in doc.cssselect(config['links_css']): id=re.search(config['href_patten'],elem.get("href")).group(1) #print(elem) filename=config['save_dir']+'//'+id #print(filename) if not os.path.exists(filename): print(filename) self.touch(filename) postlist.put(id) time.sleep(config['refresh_fruiqence']) def touch(self,fname, times=None): with open(fname, 'a'): os.utime(fname, times) class Refresh(threading.Thread): def __init__(self, config): threading.Thread.__init__(self) self.config=config def run(self): dir=self.config['save_dir'] while True: now=time.time() for path_and_filename in glob.iglob(dir+"/*"): ctime=os.stat(path_and_filename)[stat.ST_CTIME] elapse=now-ctime if elapse > 86400: print("\t"*3,ctime,path_and_filename,elapse) os.remove(path_and_filename) elif elapse > 3600: print(postlist.qsize()) postlist.put(os.path.basename(path_and_filename)) time.sleep(20) class ScrapPage(threading.Thread): def __init__(self, config): threading.Thread.__init__(self) self.config=config def run(self): config=self.config while True: print(postlist.qsize()) id=postlist.get() url=config['detail_url'] % id print(url) filename=config['save_dir']+"//"+ id page='' try: page=fetchHtml(url,config) except: print("ERROR:",url) if not page: continue doc = lxml.html.document_fromstring(page) size=0 idset=[] if os.path.exists(filename): size=os.path.getsize(filename) print('size=',size) if 0 == size: header=doc.cssselect("#Main .box .header") node=header[0].findall("./a")[1].get("href").replace('/go/','') title=header[0].find("./h1").text_content() user=header[0].find("./small/a").text_content() content=doc.find_class("topic_content") if content: content=content[0].text_content() try: user = User.get(User.name ==user) user_id=user.uid except: created = User.create(name=user,password='xx') user_id=created.uid post,created=Post.create_or_get(post_id=int(id), node=node, title=title,author=user_id,content=content) #print(post,created) with open(filename,'r+') as f: f.write(chr(32)) idlist='' if 0 < size: with open(filename,'r+') as f: idlist=f.read().strip() uniq=set() if 0 != len(idlist) : uniq=set(idlist.split(',')) print("\n-----------------------------") uniqnew=set([]) for elem in doc.cssselect('#Main div.box:nth-child(4) div[id^="r_"]'): user=elem.find(".//strong/a").text_content() try: user = User.get(User.name ==user) user_id=user.uid except: created = User.create(name=user,password='xx') user_id=created.uid #print(user_id) rid=elem.get('id').replace('r_','') td=elem.find_class("reply_content") content=td[0].text_content() #print(td[0].text_content()) if rid not in uniq: uniq.add(rid) Remark.create(content=content,user_id=user_id,post_id=id) time.sleep(10) #t=ScrapIndex(config) config={'url':'http://v2ex.com/?tab=all', 'domain':'v2ex.com', 'user_agent':'Mozilla/5.0 (Windows NT 6.3; rv:38.0) Gecko/20100101 Firefox/38.0', 'links_css':"div.box:nth-child(2) table td:nth-child(3) .item_title a", 'href_patten':r"/t/(\d+)#", 'save_dir':'tmp', "detail_url":"http://v2ex.com/t/%s", 'refresh_fruiqence':20} savedir=config['save_dir'] if not os.path.exists(savedir): os.mkdir(savedir) db.create_tables([User,Post,Remark]) threads=[ScrapIndex(config),Refresh(config),ScrapPage(config)] for t in threads: t.start() @route('/static/<filepath:path>') def server_static(filepath): return static_file(filepath, root='.') @route('/') def index(): posts=Post.select().paginate(0,30) return template('index', page=1,posts=posts) @route('/recent/:page') def recent(page): page = int(page) posts=Post.select().paginate((page-1)*30,30) page=page+1 return template('index',page=page, posts=posts) @route('/t/:id') def remark(id): id = int(id) post=Post.get(Post.post_id==id) remarks=Remark.select().where(Remark.post_id==id) return template('post', post=post,remarks=remarks) run(host='localhost', port=8080, debug=True) sys.exit() 模板文件在 http://git.oschina.net/yaky/movesite/attach_files/download?i=15588&u=http%3A%2F%2Ffiles.git.oschina.net%2Fgroup1%2FM00%2F00%2F9D%2FfMqNk1Xps-KAdIHKABZQps08o6k878.rar%3Ftoken%3D5af19cbad2332df36b8ebf433041ac2f%26ts%3D1441379309 python app.py 开始运行 如果提示缺少包 请运行python -m pip install xx 安装相应的包