import datetime
import random
import threading
import time

import requests
from fake_useragent import UserAgent
from lxml import etree


def write(path, text):
    # Append one line of text to the file
    with open(path, 'a', encoding='utf-8') as f:
        f.writelines(text)
        f.write('\n')


def truncatefile(path):
    # Empty the file
    with open(path, 'w', encoding='utf-8') as f:
        f.truncate()


def read(path):
    # Read the file back as a list of stripped lines
    with open(path, 'r', encoding='utf-8') as f:
        txt = []
        for i in f.readlines():
            txt.append(i.strip())
    return txt


def get_time_diff(start_time, end_time):
    # Format the elapsed time as HH:MM:SS
    seconds = (end_time - start_time).seconds
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    diff = "%02d:%02d:%02d" % (h, m, s)
    return diff


def get_headers():
    # Use the fake_useragent library to generate a random User-Agent header
    ua = UserAgent()
    user_agent = ua.random
    headers = {'User-Agent': user_agent}
    return headers


# Alternative: pick a User-Agent from a hard-coded list instead
# (the original version was missing a comma after the first entry,
# which silently concatenated the first two strings)
# def get_headers():
#     user_agent_list = [
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
#         "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
#         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
#         "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
#         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
#         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
#     ]
#     user_agent = random.choice(user_agent_list)
#     headers = {'User-Agent': user_agent}
#     return headers


def check_ip(target_url, ip):
    """
    Test whether an IP works by checking for status_code == 200.
    :param target_url: a test URL of your own choosing, e.g. www.baidu.com
    :param ip: the IP to test
    :return: True or False
    """
    headers = get_headers()  # build the request headers
    proxies = {"http": "http://" + ip, "https": "https://" + ip}  # proxy IP
    try:
        status = requests.get(url=target_url, proxies=proxies, headers=headers, timeout=5).status_code
        return status == 200
    except requests.RequestException:
        return False


def find_ip(ip_type, pagenum, target_url, path):
    """
    Scrape proxy IPs from the xici proxy site.
    :param ip_type: IP type (domestic high-anonymity, transparent, etc.)
    :param pagenum: page number
    :param target_url: target URL used to decide whether an IP works
    :param path: path of the file holding the IP pool
    """
    url_map = {
        '1': 'http://www.xicidaili.com/nn/',
        '2': 'http://www.xicidaili.com/nt/',
        '3': 'http://www.xicidaili.com/wn/',
        '4': 'http://www.xicidaili.com/wt/',
    }
    url = url_map[str(ip_type)] + str(pagenum)
    headers = get_headers()
    html = requests.get(url=url, headers=headers, timeout=5).text
    selector = etree.HTML(html)
    infos = selector.xpath('//*[@class="odd"]')
    for info in infos:
        ip = info.xpath('td[2]/text()')[0].strip() + ':' + info.xpath('td[3]/text()')[0].strip()
        if check_ip(target_url, ip):
            write(path=path, text=ip)
            print(ip)
        else:
            print(ip + ' invalid ip')


def get_ip(target_url, path):
    truncatefile(path)  # clear the old IP pool
    start_time = datetime.datetime.now()
    threads = []
    # As written, the +1 offsets make this cover types 2-4 and pages 2-3,
    # skipping type 1 and page 1
    for ip_type in range(1, 4):
        for pagenum in range(1, 3):
            t = threading.Thread(target=find_ip, args=(ip_type + 1, pagenum + 1, target_url, path))
            threads.append(t)
            # time.sleep(2)
    print('Start scraping proxy IPs')
    for s in threads:
        s.start()
    for e in threads:
        e.join()
    print('Scraping finished')
    end_time = datetime.datetime.now()
    diff = get_time_diff(start_time, end_time)
    ips = read(path)
    print('Scraped %s proxy IPs in total, elapsed: %s \n' % (len(ips), diff))


if __name__ == '__main__':
    path = 'ips.text'
    target_url = 'http://www.cnblogs.com/TurboWay/'
    get_ip(target_url, path)
I wrote this by following code I found online and combining pieces myself, but every IP it finds is invalid. With other people's code, roughly 1/3 of the IPs turn out valid. Can anyone help me figure out why?
1. Your target page returns 403 even when the request does get through (a quick way to confirm this is sketched below this list).
2. Functionally speaking, your 5-second timeout may be a bit short.
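To confirm point 1, try a direct request with no proxy at all. This is just a sanity check using the target_url from the script above; if this already prints 403, check_ip() will mark every proxy as invalid no matter how good the proxy is:

import requests

# Fetch the test URL directly (no proxy) and look at the status code.
# A 403 here means the target itself rejects the request, so the
# proxy check can never succeed against this URL.
resp = requests.get('http://www.cnblogs.com/TurboWay/',
                    headers={'User-Agent': 'Mozilla/5.0'},
                    timeout=10)
print(resp.status_code)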
I ran your code with the target url changed to Baidu and the timeout set to 10s; the result was that roughly 10% of the IPs were usable.
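For reference, the change described above amounts to roughly this. A minimal sketch: making the timeout a configurable parameter is my addition, and Baidu is used as a permissive test target instead of the blog URL:

import requests

def check_ip(target_url, ip, timeout=10):
    # Same check as the original, but with a longer, configurable timeout
    proxies = {"http": "http://" + ip, "https": "https://" + ip}
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        return requests.get(target_url, proxies=proxies,
                            headers=headers, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False

# e.g. check_ip('http://www.baidu.com', '117.88.4.23:3000')  # hypothetical proxy address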
After setting the timeout to 10s I do get some usable IPs, but the hit rate is very low. Do you know why? Is my method of testing IP validity wrong, or are most of the IPs scraped from this site simply dead? Also, can the request headers be set arbitrarily? Could the headers have something to do with it?
@从MH到其他: They're free, after all; finding any that work is already good. If the free ones were all usable, why would so many people pay for IP pools? That tells you something. The request headers are just for the site to look at; they have little effect.
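If you want to rule out the testing method itself, one common check (my suggestion, not something mentioned in this thread) is to ask an IP echo service which address the server actually sees; if it reports the proxy's address rather than your own, the proxy genuinely works:

import requests

ip = '117.88.4.23:3000'  # hypothetical proxy address taken from your pool
proxies = {"http": "http://" + ip, "https": "https://" + ip}
try:
    # httpbin.org/ip echoes back the origin IP of the request;
    # a working proxy shows its own address here instead of yours.
    print(requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10).json())
except requests.RequestException as e:
    print('proxy failed:', e)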
@Masako: Good to know! I just searched, and the paid ones aren't expensive either. For learning purposes a dozen or so IPs is enough, and the free ones are unstable anyway.