```python
# -*- coding: utf-8 -*-
import scrapy
import requests
import os
import sys
import io
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    allowed_domains = ['http://www.duia.com/']
    start_urls = ['http://www.duia.com/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        all_tags_href = hxs.select('//a[@class="sku-mr"]/@href').extract()
        print(len(all_tags_href))
        for href in all_tags_href:
            # ret = requests.get(href)
            # print(ret)
            # each href is a plain URL string such as 'http://www.duia.com/sku/1' or 'http://www.duia.com/sku/2'
            yield Request(url=href, callback=self.second_level)

    def second_level(self, response):
        hxs = HtmlXPathSelector(response)
        all_tags_href = hxs.select('//div[@class="container"]//ul/li/a/@href').extract()
        print(len(all_tags_href))
```
The code above was written for a test: parse collects the href of each matching <a> tag on the homepage and yields a Request for each one, to be handled by second_level. The hrefs come out as plain URL strings, which looks fine, but on every run only parse executes; second_level is never called for the new URLs, and I can't see where the problem is.
The body of second_level was tested on its own and works: it extracts what I expect.
It's an allowed_domains problem. allowed_domains expects bare domain names (e.g. 'duia.com'), not full URLs with a scheme. Since 'http://www.duia.com/' can never match the host of the yielded requests, Scrapy's OffsiteMiddleware silently drops every one of them as off-site, which is exactly why parse runs but second_level never fires.
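A minimal sketch of the fixed spider, keeping the selectors from the question and only the lines relevant to the bug. Note I've also swapped the long-deprecated HtmlXPathSelector/.select() for response.xpath; that's an idiomatic cleanup on my part, not part of the actual fix:

```python
import scrapy
from scrapy.http import Request


class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    # allowed_domains takes bare domain names, not URLs with a scheme.
    # With 'http://www.duia.com/' here, OffsiteMiddleware treats every
    # yielded Request as off-site and drops it, so second_level never runs.
    # 'duia.com' also covers the www subdomain.
    allowed_domains = ['duia.com']
    start_urls = ['http://www.duia.com/']

    def parse(self, response):
        # response.xpath replaces the deprecated HtmlXPathSelector/.select()
        for href in response.xpath('//a[@class="sku-mr"]/@href').extract():
            yield Request(url=href, callback=self.second_level)

    def second_level(self, response):
        hrefs = response.xpath('//div[@class="container"]//ul/li/a/@href').extract()
        print(len(hrefs))
```

To confirm the diagnosis before changing anything, run the spider at the default DEBUG log level and look for "Filtered offsite request" messages. Alternatively, yield Request(url=href, callback=self.second_level, dont_filter=True): OffsiteMiddleware lets dont_filter requests through (it also bypasses the dupe filter), so if second_level suddenly starts running, allowed_domains is the culprit.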