# coding:utf8
"""Parse a downloaded HTML page into follow-up links and entry data."""

import re

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:  # Python 2 keeps urljoin in the urlparse module
    from urlparse import urljoin

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extract new URLs and title/summary data from one HTML page."""

    # Pattern for entry links such as "/view/123.htm". Compiled once at class
    # creation so every call to _get_new_urls reuses the same pattern object.
    _ENTRY_LINK_RE = re.compile(r"/view/\d+\.htm")

    def _get_new_urls(self, page_url, soup):
        """Collect the absolute URLs of all entry links found in the page.

        Args:
            page_url: URL of the page being parsed; relative hrefs are
                resolved against it.
            soup: Parsed document exposing ``find_all``.

        Returns:
            A set of absolute URL strings (empty when no link matches).
        """
        links = soup.find_all('a', href=self._ENTRY_LINK_RE)
        # A set drops duplicate links that appear several times on a page.
        return {urljoin(page_url, link['href']) for link in links}

    def _get_new_data(self, page_url, soup):
        """Build the data record for the page.

        Args:
            page_url: URL of the page being parsed.
            soup: Parsed document exposing ``find``.

        Returns:
            A dict with the keys ``'url'``, ``'title'`` and ``'summary'``.

        Raises:
            AttributeError: If the title or summary node is not present,
                because ``find`` then returns ``None``.
        """
        res_data = {}

        res_data['url'] = page_url

        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        res_data['title'] = title_node.get_text()

        summary_node = soup.find('div', class_="lemmaSummary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        """Parse ``html_cont`` and return its outgoing links and data.

        Args:
            page_url: URL the content was downloaded from.
            html_cont: Raw HTML content of the page.

        Returns:
            ``None`` when either argument is ``None``; otherwise a tuple
            ``(new_urls, new_data)`` with the set of discovered URLs and the
            data dict for this page.
        """
        if page_url is None or html_cont is None:
            return None

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
能不能把代码稍微格式化一下再传上来?你这样贴上来,还得一行一行去理你的缩进。