Is my IP being blocked? (The site itself still loads fine in a browser.) The full traceback is below, followed by my code.
Traceback (most recent call last):
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "D:\DataScience\Python3.7\lib\http\client.py", line 1321, in getresponse
response.begin()
File "D:\DataScience\Python3.7\lib\http\client.py", line 296, in begin
version, status, reason = self._read_status()
File "D:\DataScience\Python3.7\lib\http\client.py", line 257, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "D:\DataScience\Python3.7\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\DataScience\Python3.7\lib\site-packages\requests\adapters.py", line 445, in send
timeout=timeout
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\util\retry.py", line 367, in increment
raise six.reraise(type(error), error, _stacktrace)
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\packages\six.py", line 685, in reraise
raise value.with_traceback(tb)
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
chunked=chunked)
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "D:\DataScience\Python3.7\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
httplib_response = conn.getresponse()
File "D:\DataScience\Python3.7\lib\http\client.py", line 1321, in getresponse
response.begin()
File "D:\DataScience\Python3.7\lib\http\client.py", line 296, in begin
version, status, reason = self._read_status()
File "D:\DataScience\Python3.7\lib\http\client.py", line 257, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "D:\DataScience\Python3.7\lib\socket.py", line 589, in readinto
return self._sock.recv_into(b)
urllib3.exceptions.ProtocolError: ('Connection aborted.', TimeoutError(10060, '由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。', None, 10060, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:/Project/Python/practise.py", line 133, in <module>
dit = get_all_data(urls_list)
File "D:/Project/Python/practise.py", line 116, in get_all_data
title, authors, publish_text, year, publish, ref_wr, key_words = get_item_info(url)
File "D:/Project/Python/practise.py", line 68, in get_item_info
content_details = requests.get(url)
File "D:\DataScience\Python3.7\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "D:\DataScience\Python3.7\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "D:\DataScience\Python3.7\lib\site-packages\requests\sessions.py", line 512, in request
resp = self.send(prep, **send_kwargs)
File "D:\DataScience\Python3.7\lib\site-packages\requests\sessions.py", line 622, in send
r = adapter.send(request, **kwargs)
File "D:\DataScience\Python3.7\lib\site-packages\requests\adapters.py", line 495, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', TimeoutError(10060, '由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。', None, 10060, None))
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import requests
import re
import random
from collections import defaultdict

def driver_open(key_word, start_year, end_year):
    url = "http://xueshu.baidu.com/"
    # Path to the local chromedriver binary.
    driver = webdriver.Chrome(r'C:\Users\zzy\AppData\Local\Google\Chrome\Application\chromedriver.exe')
    driver.maximize_window()
    driver.get(url)
    time.sleep(10)
    # Search for the given keyword.
    driver.find_element_by_class_name('s_ipt').send_keys(key_word)
    time.sleep(2)
    driver.find_element_by_class_name('s_btn_wr').click()
    time.sleep(2)
    # Restrict the year range.
    driver.find_element_by_id('leftnav_input_ylo').send_keys(start_year)
    time.sleep(2)
    driver.find_element_by_id('leftnav_input_yhi').send_keys(end_year)
    time.sleep(2)
    driver.find_element_by_class_name('leftnav_input_sub').click()
    time.sleep(2)
    # Other filters: restrict to document type "学位" (degree theses).
    driver.find_element_by_css_selector('[title="学位"]').click()
    time.sleep(2)
    content = driver.page_source.encode('utf-8')
    driver.close()
    soup = BeautifulSoup(content, 'lxml')
    return soup

def page_url_list(soup, page=0):
    # URL of the first results page, taken from the pager link.
    fir_page = "http://xueshu.baidu.com" + soup.find_all("a", class_="n")[0]["href"]
    urls_list = []
    hhh = 1  # page counter, used only for progress output
    for i in range(page):
        next_page = fir_page.replace("pn=10", "pn={:d}".format(i * 10))
        response = requests.get(next_page)
        soup_new = BeautifulSoup(response.text, "lxml")
        c_fonts = soup_new.find_all("h3", class_="t c_font")
        for c_font in c_fonts:
            url = "http://xueshu.baidu.com" + c_font.find("a").attrs["href"]
            urls_list.append(url)
        hhh += 1
        print(hhh)
    return urls_list

def get_item_info(url):
    print(url)
    content_details = requests.get(url)
    soup = BeautifulSoup(content_details.text, "lxml")
    # Extract the article title.
    title = soup.select('#dtl_l > div > h3 > a')
    if len(title) == 0:
        title = "NA"
    else:
        title = ''.join(list(soup.select('#dtl_l > div > h3 > a')[0].stripped_strings))
    # Extract the authors.
    authors = ''.join(str(author_) for author_ in list(soup.select('div.author_wr')))
    if len(authors) == 0:
        authors = "NA"
    else:
        authors = ''.join(str(author_) for author_ in list(soup.select('div.author_wr')[0].stripped_strings)[1:])
    # Extract the publisher and date.
    fir_publish_text = list(soup.select('p.publish_text'))
    if len(fir_publish_text) == 0:
        publish_text = "NA"
        publish = "NA"
        year = "NA"
    else:
        publish_text = list(soup.select('p.publish_text')[0].stripped_strings)
        publish = publish_text[0]
        publish = re.sub(r"[\r\n ]+", "", publish)
        publish_text = ''.join(publish_text)
        publish_text = re.sub(r"[\r\n ]+", "", publish_text)
        # Extract the year.
        match_re = re.match(r".*?(\d{4}).*", publish_text)
        if match_re:
            year = int(match_re.group(1))
        else:
            year = 0
    # Extract the citation count.
    ref_wr = list(soup.select('a.sc_cite_cont'))
    if len(ref_wr) == 0:
        ref_wr = 0
    else:
        ref_wr = list(soup.select('a.sc_cite_cont')[0].stripped_strings)[0]
    # Extract the keywords.
    key_words = soup.select('div.dtl_search_word > div')
    if len(key_words) == 0:
        key_words = "NA"
    else:
        key_words = ','.join(key_word for key_word in list(soup.select('div.dtl_search_word > div')[0].stripped_strings)[1::2])
    return title, authors, publish_text, year, publish, ref_wr, key_words

def get_all_data(urls_list):
    dit = defaultdict(list)
    for url in urls_list:
        title, authors, publish_text, year, publish, ref_wr, key_words = get_item_info(url)
        dit["title"].append(title)
        dit["authors"].append(authors)
        dit["publish_text"].append(publish_text)
        dit["year"].append(year)
        dit["publish"].append(publish)
        dit["ref_wr"].append(ref_wr)
        dit["key_words"].append(key_words)
    return dit

def save_csv(dit):
    data = pd.DataFrame(dit)
    columns = ["title", "authors", "publish_text", "year", "publish", "ref_wr", "key_words"]
    data.to_csv("D:/test.csv", index=False, columns=columns)
    print("That's OK!")

if __name__ == "__main__":
    soup = driver_open('库存', 2013, 2018)
    urls_list = page_url_list(soup, page=1000)
    dit = get_all_data(urls_list)
    save_csv(dit)
You don't need the code to diagnose this; just look at the message:
'由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。' (WinError 10060: "A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.")
At the very least it means that, from your machine, you cannot reach that service on that host.
It is easy to verify: 1. ping, 2. telnet (or the Python check sketched below).
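A rough Python equivalent of the telnet check, as a minimal sketch (the host name and port below are just taken from the URLs in the question; adjust as needed):

import socket

def can_connect(host, port=80, timeout=5):
    # Returns True if a bare TCP connection to host:port succeeds within `timeout` seconds.
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

# Check whether a plain TCP connection to the site works from this machine.
print(can_connect("xueshu.baidu.com", 80))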
So how do I solve this?
It's a restriction on the target site's side: anti-scraping / anti-DoS measures.
Use an IP proxy tool and you can keep crawling; see the sketch below.
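A minimal sketch of what that could look like with requests (the proxy address, timeout, retry count, and sleep interval are placeholders I made up, not values from the original code or any specific proxy provider):

import random
import time
import requests

# Placeholder proxy list: fill in addresses from your own proxy pool or provider.
PROXIES = [
    {"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"},
]

def get_with_proxy(url, retries=3):
    # Fetch url through a randomly chosen proxy, with a timeout and simple retries.
    for attempt in range(retries):
        try:
            resp = requests.get(url, proxies=random.choice(PROXIES), timeout=10)
            resp.raise_for_status()
            return resp
        except requests.exceptions.RequestException as exc:
            print("attempt {} failed: {}".format(attempt + 1, exc))
            time.sleep(5)  # back off a little before retrying
    return None

In get_item_info you would then call get_with_proxy(url) instead of requests.get(url), and it also helps to sleep a few seconds between requests in get_all_data so the site is hit less aggressively.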