The error message is as follows:

```
(base) licongjian@bogon jdPro $ scrapy crawl jingdong
2021-03-05 01:16:37 [scrapy.core.scraper] ERROR: Error downloading <GET https://www.jd.com/allSort.aspx>
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/twisted/internet/defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/twisted/python/failure.py", line 512, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/core/downloader/middleware.py", line 45, in process_request
return (yield download_func(request=request, spider=spider))
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/utils/defer.py", line 55, in mustbe_deferred
result = f(*args, **kw)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/core/downloader/handlers/__init__.py", line 75, in download_request
return handler.download_request(request, spider)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/core/downloader/handlers/http11.py", line 88, in download_request
return agent.download_request(request)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/core/downloader/handlers/http11.py", line 342, in download_request
agent = self._get_agent(request, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/core/downloader/handlers/http11.py", line 301, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/core/downloader/webclient.py", line 36, in _parse
return _parsed_url_args(parsed)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/core/downloader/webclient.py", line 19, in _parsed_url_args
host = to_bytes(parsed.hostname, encoding="ascii")
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/scrapy/utils/python.py", line 106, in to_bytes
raise TypeError('to_bytes must receive a str or bytes '
TypeError: to_bytes must receive a str or bytes object, got NoneType
```
### Context: with a manually configured proxy I had already crawled 450k+ records; this error only appeared after I switched to a middleware that fetches proxy IPs automatically and attaches them to outgoing requests.
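For reference: the failing call in the traceback is `to_bytes(parsed.hostname)`, so the proxy URL Scrapy received must have parsed with a hostname of `None`. This standalone sketch (the `b'...'` value is a hypothetical stand-in for what redis-py returns) shows two ways my `meta['proxy']` value can end up like that:

```python
from urllib.parse import urlparse

raw = b'https://1.2.3.4:8080'       # redis-py returns bytes by default
proxy = str(raw).replace('b', '')   # -> "'https://1.2.3.4:8080'" (the quotes survive,
print(proxy)                        #    and any letter 'b' in the address is stripped too)
print(urlparse(proxy).hostname)     # -> None, exactly what to_bytes() rejects

empty = None                        # srandmember on an empty set returns None
print(str(empty).replace('b', ''))  # -> 'None', which also parses to hostname None
```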
```python
import redis
import requests


class RandomProxyMiddleware(object):

    def __init__(self, login_data, data):
        self.redis_db = redis.Redis(host='127.0.0.1', port=6379, db=1)
        self.max_reco = 3
        self.login_data = login_data
        self.data = data

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            login_data=crawler.settings.get('LOGIN_DATA'),
            data=crawler.settings.get('DATA')
        )

    def get_proxy(self, request, spider):
        # log in to the proxy vendor, then pull a batch of fresh IPs into Redis
        url = 'http://http.tiqu.alibabaapi.com/getip?'
        sessiono = requests.Session()
        sessiono.post('http://ty-http-d.hamir.net/index/login/dologin', data=self.login_data)  # log in first
        proxys = sessiono.get(url, params=self.data).json()
        for proxy in proxys['data']:
            ip_str = 'https://' + proxy['ip'] + ':' + proxy['port']
            self.redis_db.sadd('proxy', ip_str)
        self.process_request(request, spider)

    def process_request(self, request, spider):
        # srandmember returns bytes (or None when the set is empty)
        proxy = str(self.redis_db.srandmember('proxy')).replace('b', '')
        if self.redis_db.scard('proxy') < 3:
            print('Running low on proxies, fetching new IPs right away')
            self.get_proxy(request, spider)
        if request.meta.get('retry_times') == 2:
            print('Proxy connection failed, switching to another proxy')
            self.redis_db.srem('proxy', proxy)
            new_proxy = str(self.redis_db.srandmember('proxy')).replace('b', '')
            request.meta['proxy'] = new_proxy
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        proxy = request.meta['proxy']
        if response.status != 200:
            # bad status: discard this proxy and pick another
            self.redis_db.srem('proxy', proxy)
            new_proxy = str(self.redis_db.srandmember('proxy')).replace('b', '')
            request.meta['proxy'] = new_proxy
        if response.text == "<script>window.location.href='https://passport.jd.com/uc/login'</script>":
            # JD bounced us to the login page: discard the proxy and retry the request
            self.redis_db.srem('proxy', proxy)
            new_proxy = str(self.redis_db.srandmember('proxy')).replace('b', '')
            request.meta['proxy'] = new_proxy
            return request
        return response
```
Could someone tell me what has gone wrong in the middleware?
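My current suspicion is the bytes handling around `srandmember` (note also that my `process_request` above unconditionally runs `request.meta['proxy'] = proxy` on its last line, which overwrites the `new_proxy` chosen in the retry branch). Here is a sketch of what I think `process_request` should look like instead, assuming `redis.Redis(decode_responses=True)` so the client returns `str` rather than `bytes`, and guarding the empty-set case; the class name is just a placeholder and I haven't battle-tested this:

```python
import redis


class RandomProxyMiddlewareFixed(object):
    """Sketch: same idea, but never hands Scrapy a malformed proxy URL."""

    def __init__(self):
        # decode_responses=True makes redis-py return str instead of bytes,
        # so the str(...).replace('b', '') round-trip (which leaves stray
        # quote characters in the URL) is no longer needed
        self.redis_db = redis.Redis(host='127.0.0.1', port=6379, db=1,
                                    decode_responses=True)

    def process_request(self, request, spider):
        proxy = self.redis_db.srandmember('proxy')  # str, or None if the set is empty
        if proxy is None:
            spider.logger.warning('Proxy pool empty, sending this request without a proxy')
            return None  # let Scrapy continue without meta['proxy']
        request.meta['proxy'] = proxy
```

Does that look like the right direction?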
Lastly, a question for the experts: for deduplication with Redis in Scrapy, is it better to use the scrapy-redis package or to call the redis module directly?
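My understanding is that scrapy-redis deduplicates request fingerprints via settings alone, which is a different layer from deduplicating items by `data_sku`. A sketch of that configuration, with setting names taken from the scrapy-redis docs:

```python
# settings.py — scrapy-redis stores the request-fingerprint dupefilter in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True  # keep fingerprints between crawl runs
REDIS_URL = "redis://127.0.0.1:6379/1"
```

My current hand-rolled item pipeline looks like this: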
```python
import redis

redis_db = redis.Redis(host='127.0.0.1', port=6379, db=1)  # same Redis instance as the middleware
redis_data_dict = 'data_sku'  # name of the hash holding the SKUs already stored


class JdProRedisPipeline:
    def process_item(self, item, spider):
        if not redis_db.hexists(redis_data_dict, item['data_sku']):  # SKU not seen yet
            redis_db.hset('data_sku', item['data_sku'], 0)  # record it in the hash
            return item
        print(f'{item["data_sku"]} already exists, skipping!!!!')
```
### Can this pipeline actually achieve deduplication?
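One thing I'm wondering about this pipeline: when the SKU already exists, `process_item` prints and then implicitly returns `None`, which Scrapy hands to any later pipelines as the item. A sketch of the variant I'm considering, raising `DropItem` instead and using `hsetnx` so the check-and-set becomes one atomic call (class name is a placeholder):

```python
import redis
from scrapy.exceptions import DropItem


class JdProRedisDedupPipeline:
    def __init__(self):
        self.redis_db = redis.Redis(host='127.0.0.1', port=6379, db=1,
                                    decode_responses=True)

    def process_item(self, item, spider):
        # hsetnx sets the field only if it is absent: returns 1 on first
        # sight of this SKU, 0 if it was already recorded
        if self.redis_db.hsetnx('data_sku', item['data_sku'], 0):
            return item
        raise DropItem(f"{item['data_sku']} already exists, skipping")
```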