__author__ = 'Saint'

import json
import os
import urllib.error
import urllib.parse
import urllib.request
from html.parser import HTMLParser


def _clear_screen():
    """Clear the console with the command that matches the host OS."""
    os.system("cls" if os.name == "nt" else "clear")


class MyHtmlParser(HTMLParser):
    """Collect the ``src`` attribute of every ``<img>`` tag fed to the parser.

    After ``feed()`` the collected values are available in ``self.links``
    in document order.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance list. A class-level list would be shared by every
        # parser, so links from earlier pages would leak into later ones.
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Record the ``src`` value of ``<img>`` start tags."""
        if tag != "img":
            return
        for name, value in attrs:
            # ``value`` is None for a bare ``<img src>``; skip empty values.
            if name == "src" and value:
                self.links.append(value)


class Down(object):
    """Download every image referenced by the articles of the feeds listed
    in ``collect_links`` into one sub-directory of ``img_path`` per feed."""

    # Root directory for all downloads.
    img_path = "E:/saint"
    # Directory of the feed currently being downloaded (set by isDirExists).
    dir = ''
    # Feed (article list) URLs to collect from.
    collect_links = ["http://dy.163.com/v2/media/articlelist/T1374483113516-1",
                     "http://dy.163.com/v2/media/articlelist/T1420776257254-1",
                     "http://dy.163.com/v2/media/articlelist/T1376641060407-1"]
    # Base URL of an article page; "/<tid>/<docid>" is appended to it.
    img_links = "http://dy.163.com/v2/article"

    def handleCollect(self):
        """Walk every feed in ``collect_links`` and download its images.

        The user is asked on stdin whether to go on after each feed; a
        failed feed can be skipped with "y", anything else stops the run.
        """
        for collect_link in self.collect_links:
            print("开始从[" + collect_link + "]采集图片")

            # The last URL segment names the feed's download directory.
            dir_name = collect_link.split("/")[-1]
            self.isDirExists(dir_name)

            pages = self.getListFromSubscribe(collect_link)
            if pages is False:
                print("数据采集失败,是否继续(y/n)")
                op = input()
                if op == "y":
                    _clear_screen()
                    continue
                if op == "n":
                    print("停止采集")
                    break
                _clear_screen()
                print("非法输入")
                break

            # ``or ()`` tolerates a feed whose "data" field is null/empty.
            for page in pages or ():
                page_uri = self.img_links + "/" + page["tid"] + "/" + page["docid"]
                self.getImgFromUri(page_uri)

            # NOTE(review): the original file's indentation was lost, so it
            # is unclear whether this prompt ran per page or per feed. It is
            # placed per feed here because "n" ends the whole collection;
            # the final "OK" is likewise assumed to follow the loop. Confirm.
            print("是否继续(y/n)")
            if input() == "n":
                _clear_screen()
                print("采集完毕")
                break
        print("OK")

    def getListFromSubscribe(self, uri):
        """Fetch the article list of a feed.

        :param uri: URL returning a GBK-encoded JSON document with the keys
            ``code``, ``msg`` and ``data``.
        :return: the value of ``data`` when ``code`` is 1, otherwise
            ``False`` (also on network, decoding or JSON errors).
        """
        try:
            with urllib.request.urlopen(uri) as res:
                # Only 2xx counts as success (300 itself is not a success).
                if res.code < 200 or res.code >= 300:
                    _clear_screen()
                    return False
                raw = res.read()
        except OSError as err:  # URLError/HTTPError are OSError subclasses
            print(err)
            return False

        try:
            # read() returns bytes; the service answers in GBK.
            payload = json.loads(raw.decode("gbk"))
        except ValueError as err:  # covers Unicode and JSON decode errors
            print(err)
            return False

        if payload['code'] != 1:
            print(payload['msg'])
            return False
        return payload['data']

    def getImgFromUri(self, uri):
        """Fetch one article page and download every image it references.

        :param uri: URL of the article page (decoded as GBK).
        """
        try:
            with urllib.request.urlopen(uri) as res:
                # A stray undecodable byte must not abort link extraction.
                html_code = res.read().decode("gbk", errors="replace")
        except OSError as err:
            print("页面获取失败: " + uri + " (" + str(err) + ")")
            return

        hp = MyHtmlParser()
        hp.feed(html_code)
        hp.close()
        for link in hp.links:
            # Resolve relative and protocol-relative ``src`` values against
            # the page URL; absolute links are returned unchanged.
            self.writeToDisk(urllib.parse.urljoin(uri, link))

    def isDirExists(self, dir_name):
        """Point ``self.dir`` at ``img_path/dir_name``, creating it if needed.

        :param dir_name: name of the sub-directory below ``img_path``.
        :return: always ``True``.
        """
        self.dir = os.path.join(self.img_path, dir_name)
        os.makedirs(self.dir, exist_ok=True)
        return True

    def writeToDisk(self, url):
        """Download ``url`` into ``self.dir``.

        The file is named after the last path segment of the URL (query
        string and fragment are dropped, as they are not valid in file
        names on every platform). The working directory is left untouched.

        :param url: absolute URL of the file to download.
        :return: ``True`` on success, ``False`` when the file name cannot be
            determined or the download/write fails.
        """
        file_name = urllib.parse.urlsplit(url).path.split("/")[-1]
        if not file_name:
            print("无法确定文件名: " + url)
            return False

        try:
            with urllib.request.urlopen(url) as res:
                data = res.read()
            with open(os.path.join(self.dir, file_name), "wb") as out:
                out.write(data)
        except (OSError, ValueError) as err:
            # ValueError: urlopen rejects URLs without a known scheme.
            print("下载失败: " + url + " (" + str(err) + ")")
            return False
        return True


if __name__ == "__main__":
    down = Down()
    down.handleCollect()