#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Copyright (c) 2015, The Sun Technology

Download comic images from a numbered series of web pages.

Each page URL is built from a ``%s`` template, the ``<img id="main-comic">``
element is located on the page, and the image it points to is saved under
``BASE_URL``, mirroring the path component of the image URL.

The module runs on both Python 2 (urllib2/urlparse) and Python 3
(urllib.request/urllib.parse).
"""
import os
import time
from contextlib import closing

try:  # Python 2
    import urllib2
    from urllib2 import HTTPError
    from urlparse import urljoin, urlparse
except ImportError:  # Python 3: same API under different module names
    import urllib.request as urllib2
    from urllib.error import HTTPError
    from urllib.parse import urljoin, urlparse

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Only download() needs bs4; keep the pure helpers importable without it.
    BeautifulSoup = None

# Path template: "%s" is replaced with a path that starts with "/".
BASE_URL = "/Users/mac/Documents%s"


def _ensure_dir(dir_path):
    """Create *dir_path* (including parents) unless it already exists."""
    if not dir_path or os.path.isdir(dir_path):
        return
    try:
        os.makedirs(dir_path)
    except OSError:
        # Tolerate a concurrent creation; re-raise real failures.
        if not os.path.isdir(dir_path):
            raise


def get_file_name(req_url):
    """Split the path component of *req_url* into ``(directory, file name)``.

    Query string and fragment are not part of the path and are dropped.
    """
    path_obj = urlparse(req_url)
    return os.path.split(path_obj.path)


def get_save_path(save_dir):
    """Create the local directory that mirrors the URL's directory part.

    :param save_dir: URL (or URL template) whose path directory is mirrored
        below ``BASE_URL``.
    :return: the local directory path that now exists.
    """
    dirs = get_file_name(save_dir)
    save_path = BASE_URL % dirs[0]
    # makedirs (not mkdir) so a missing parent directory is not fatal.
    _ensure_dir(save_path)
    return save_path


def save_files(file_url, file_path):
    """Download *file_url* and write its bytes to *file_path*.

    The parent directory of *file_path* is created when missing, because the
    path is derived from the remote URL and may be nested arbitrarily deep.

    :raises HTTPError: propagated from ``urlopen`` for HTTP error statuses.
    """
    start = time.time()
    # closing() guarantees the connection is released even if read() fails.
    with closing(urllib2.urlopen(file_url)) as response:
        data = response.read()
    _ensure_dir(os.path.dirname(file_path))
    with open(file_path, "wb") as handler:
        handler.write(data)
    print("%s has been downloaded successfully " % file_url)
    # time.time() is in seconds; convert so the "ms" label is truthful.
    print("Total cost:%.3f ms" % ((time.time() - start) * 1000))


def download(url_path, start=82, count=10):
    """Download the main comic image of *count* consecutive pages.

    :param url_path: page URL template containing one ``%s`` placeholder for
        the page number.
    :param start: first page number to fetch.
    :param count: number of consecutive pages to fetch.
    :raises ImportError: if the ``bs4`` package is not installed.

    Pages that answer with an HTTP error, or that contain no usable
    ``<img id="main-comic">`` element, are reported and skipped.
    """
    if BeautifulSoup is None:
        raise ImportError("download() requires the 'bs4' package")

    for page_num in range(start, start + count):
        combine_url = url_path % page_num
        try:
            # Scope the response to this iteration so a failed urlopen never
            # leaves a stale or unbound handle to close.
            with closing(urllib2.urlopen(combine_url)) as response:
                page = response.read() if response.getcode() == 200 else None
            if not page:
                # Unchanged from the original: an empty page ends the run.
                return

            # Parse the HTML and locate the comic image.
            soup = BeautifulSoup(page, "html.parser")
            images = soup.find_all('img', id="main-comic")
            src = images[0].get('src') if images else None
            if not src:
                print("No comic image found on %s" % combine_url)
                continue

            # urljoin resolves protocol-relative ("//host/x.png"), absolute
            # and relative src values against the page URL.
            image_url = urljoin(combine_url, src)
            save_files(image_url, BASE_URL % '/'.join(get_file_name(image_url)))
        except HTTPError as e:
            print("An error has accour %s" % e)
            continue


if __name__ == '__main__':
    req_url = "http://explosm.net/comics/%s"
    get_save_path(req_url)
    download(req_url)