Purpose of the crawler: the news detail pages come in two different layouts, so the spider has to detect which layout a page uses and then extract the corresponding content.
Could someone please take a look at this error? Many thanks.
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/haowai/PycharmProjects/weinanweb/weinanweb/pipelines.py", line 47, in process_item
text = json.dumps(dict(item), ensure_ascii=False, indent=2)
ValueError: dictionary update sequence element #0 has length 5; 2 is required
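(For context: this particular ValueError is what dict() raises when it is handed an iterable whose elements are not (key, value) pairs. A minimal reproduction, separate from the project code, just to illustrate the message:)

>>> dict(['title'])
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ValueError: dictionary update sequence element #0 has length 5; 2 is required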
Fields scraped by the first callback:
title = scrapy.Field()
title_url = scrapy.Field()
abstract = scrapy.Field()
Fields scraped by the second callback:
content = scrapy.Field()
image_urls = scrapy.Field()
My guess is that when the first callback passes the item along, only the first three fields arrive and the last two are missing. I have already tried a deep copy, but the problem stays the same.
————————————————————————————————————————————
import scrapy
import time
import re
from copy import deepcopy
from weinanweb.items import WeinanwebItem


class WnwebSpider(scrapy.Spider):
    name = 'wnweb'
    allowed_domains = ['weinan.hsw.cn']
    start_urls = ['http://weinan.hsw.cn/weinannews']

    def parse(self, response):
        item = WeinanwebItem()
        List = response.xpath("//div[@class='yaoweilist']/ul[@class='list_tpbt']/li")
        for li in List:
            item['title'] = li.xpath(".//p[@class='titc fz18c']/a/text()").extract_first()
            item['title_url'] = li.xpath("./p/a/@href").extract()[0]
            item['abstract'] = li.xpath("./p[@class='absc fz14c']/text()").extract_first()
            item['content'] = []
            item['image_urls'] = []
            # print(item['title'])
            yield scrapy.Request(
                url=item['title_url'],
                meta={"item": deepcopy(item)},
                callback=self.content_info
            )
        # pagination of the list page
        new_url = response.xpath("//div[@class='page']/a[last()]").extract_first()
        time.sleep(10)
        if new_url:
            yield scrapy.Request(new_url, callback=self.parse)

    # detail pages (two layouts handled below)
    def content_info(self, response):
        time.sleep(0.2)
        item = response.meta['item']
        # the detail pages have two structures; branch and extract different content for each
        if response.xpath("//div[@class='contentBox cf']").extract():
            # small-image layout
            item['content'] += response.xpath("//div[@class='contentBox cf']").extract()
            item['image_urls'] += response.xpath("//div[@class='contentBox cf']//img/@src").extract()
            yield item
            # no pagination found for the small-image layout
            # next_content_url = response.xpath("").extract_first()
            # if next_content_url:
            #     yield scrapy.Request(next_content_url,
            #                          meta={'item': item},
            #                          callback=self.content_info)
        elif response.xpath("//div[@id='photoView']/img/@src").extract():
            # large-image layout
            item['content'] += response.xpath("//div[@class='bd main']//p/text()").extract()
            item['image_urls'] += response.xpath("//div[@id='photoView']/img/@src").extract()
            next_content_url = response.xpath("//div[@id='news_more_page_div_id']"
                                              "/a[@id='nexpageid']/@href").extract_first()
            if next_content_url:
                yield scrapy.Request(next_content_url,
                                     meta={'item': item},
                                     callback=self.content_info)
            else:
                pass
            yield item
The pipeline code:
import scrapy
import json
from scrapy.exporters import JsonItemExporter
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from weinanweb import settings


class WeinanwebPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        """Override get_media_requests() and return a Request for each image URL."""
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Called once all image requests for a single item have finished
        (either downloaded or failed for some reason)."""
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        yield item


class ItemPipeline(object):
    def __init__(self):
        self.file = open("Qianhua.json", 'wb')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False, indent=2)
        self.file.write(text.encode('utf-8'))
        print('writing ------>>>')

    def close(self):
        self.file.close()
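As a side note, for the ImagesPipeline subclass above to run at all, the project settings need to enable both pipelines. A sketch of what that might look like (the order values and the storage path are assumptions, not taken from the question):

# settings.py (sketch)
ITEM_PIPELINES = {
    'weinanweb.pipelines.WeinanwebPipeline': 1,    # image download first
    'weinanweb.pipelines.ItemPipeline': 300,       # JSON export afterwards
}
IMAGES_STORE = 'images'  # directory where downloaded images are saved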
The items.py code:
import scrapy


class WeinanwebItem(scrapy.Item):
    image_urls = scrapy.Field()
    image = scrapy.Field()
    title = scrapy.Field()
    abstract = scrapy.Field()
    content = scrapy.Field()
    title_url = scrapy.Field()
The dict(item) call is not the real problem; the item being passed in is wrong. Try debugging and inspecting it.
Also, a Scrapy item is already dict-like, so dict() shouldn't be needed.
The item above is the one that raises the error for me.
Below I tried the JSON output on plain dicts, and I don't understand why it would fail here:
>>> import json
>>> a = {'d': 1}
>>> v = json.dumps(dict(a))
>>> v
'{"d": 1}'
>>>
>>> b = {'s': ['boke']}
>>> n = json.dumps(dict(b))
>>> n
'{"s": ["boke"]}'
>>>
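As a side note, json.dumps does work on plain dicts as shown above, but on a Scrapy Item it raises a TypeError unless the item is converted first, which matches the conclusion below. A small sketch with a made-up Demo item (not the project item):

>>> import json, scrapy
>>> class Demo(scrapy.Item):
...     title = scrapy.Field()
...
>>> d = Demo(title='hi')
>>> json.dumps(d)
Traceback (most recent call last):
  ...
TypeError: {'title': 'hi'} is not JSON serializable
>>> json.dumps(dict(d))
'{"title": "hi"}'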
I ran the code again and found:
1. The dict() conversion is indeed needed; my mistake.
2. The code runs correctly, with no errors.
I also tried converting the item to a plain dict first and then dumping it, and that does work.
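For reference, a minimal JSON-writing pipeline along the lines settled on above might look like this (returning the item, the close_spider/open_spider signatures, and the newline separator are my additions, not confirmed in the thread):

import json

class ItemPipeline(object):
    def open_spider(self, spider):
        self.file = open("Qianhua.json", 'wb')

    def process_item(self, item, spider):
        # dict() conversion as concluded above, then dump as JSON
        text = json.dumps(dict(item), ensure_ascii=False, indent=2) + "\n"
        self.file.write(text.encode('utf-8'))
        return item  # pass the item on to any later pipeline

    def close_spider(self, spider):
        self.file.close()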