I wrote a small script to crawl every page under http://design-patterns.readthedocs.io/zh_CN/latest/index.html. Now I've hit a problem: how do I create directories based on the crawled URLs and put each file into the matching folder?
The code is as follows:
# coding:utf-8
import os
import urllib2
from bs4 import BeautifulSoup

baseUrl = "http://design-patterns.readthedocs.io/zh_CN/latest/"
seed_url = "http://design-patterns.readthedocs.io/zh_CN/latest/index.html"

def getFileName(url):
    # os.path.split returns a (head, tail) tuple; the tail is the file name
    return os.path.split(url)[-1]

def parseHtml(html):
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.findAll('li'):           # each sidebar entry is an <li> holding a link
        if tag.a is not None:
            final_url = baseUrl + tag.a['href']
            downLoad(final_url, False)

def downLoad(url, needParse):
    try:
        html = urllib2.urlopen(url).read()   # fetch the page; returns the HTML as a str
        if needParse:
            parseHtml(html)                  # only the seed page is parsed for links
        name_dir = '图解设计模式'
        if not os.path.exists(name_dir):
            os.mkdir(name_dir)
        f = open(name_dir + '/' + getFileName(url), 'w')   # save as an html file
        f.write(html)
        f.close()
        print "downLoading..." + url
    except urllib2.URLError as e:            # on failure, report the reason
        print "downLoad Error:" + str(e.reason)
        html = None
    return html

downLoad(seed_url, True)
I'm new to Python; I'd appreciate any pointers.
os.mkdir(path) creates a single directory; for a nested path, use os.makedirs(path), which creates all intermediate directories as needed.
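A minimal sketch of what this could look like in your script, assuming you want the local folder tree to mirror the URL's path. The save_page helper and the mirror_root parameter are hypothetical names, not from your original code:

# coding:utf-8
import os
import urllib2
from urlparse import urlparse

def save_page(url, mirror_root='图解设计模式'):
    # Mirror the URL's path under mirror_root, e.g.
    # http://.../zh_CN/latest/creational_patterns/simple_factory.html
    # -> 图解设计模式/zh_CN/latest/creational_patterns/simple_factory.html
    path = urlparse(url).path.lstrip('/')    # drop the leading '/'
    local_path = os.path.join(mirror_root, path)
    local_dir = os.path.dirname(local_path)
    if not os.path.exists(local_dir):
        os.makedirs(local_dir)               # create all intermediate directories
    html = urllib2.urlopen(url).read()
    f = open(local_path, 'w')
    f.write(html)
    f.close()
    return local_path

In your downLoad function this would replace the fixed name_dir plus the getFileName lookup: the directory comes from the URL itself, so every page lands in its own folder.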
Docs hosted on readthedocs usually have their source open on GitHub anyway. Surely there's no need to go to this much trouble?