首页 新闻 会员 周边

python建立IP池全部都是无效ip

0
悬赏园豆:20 [已解决问题] 解决于 2018-08-10 14:56
import requests,datetime,threading,time
from fake_useragent import UserAgent
from lxml import etree
import random

def write(path, text):
    """Append one line of *text* to the UTF-8 file at *path*.

    :param path: file collecting results; created if missing ('a' mode).
    :param text: a single string (one 'ip:port' entry), written on its own line.
    """
    # Fix: the original called f.writelines(text) on a single string, which
    # "works" only by accident (it iterates the characters); f.write is the
    # correct call for one string.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(text)
        f.write('\n')

def truncatefile(path):
    """Empty (or create) the file at *path*.

    Opening in 'w' mode already truncates the file to zero length, so the
    explicit f.truncate() the original carried was redundant and is dropped.
    """
    with open(path, 'w', encoding='utf-8'):
        pass

def read(path):
    """Read the UTF-8 file at *path* and return its lines, each stripped
    of leading/trailing whitespace (including the newline)."""
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

def get_time_diff(start_time, end_time):
    """Return the elapsed time between two datetimes as an 'HH:MM:SS' string.

    Fix: the original used timedelta.seconds, which holds only the sub-day
    remainder — any span of a day or more was silently truncated.
    total_seconds() covers the full interval (hours may exceed 23).

    :param start_time: datetime at the start of the interval.
    :param end_time: datetime at the end of the interval.
    :return: formatted duration, e.g. '01:02:03'.
    """
    seconds = int((end_time - start_time).total_seconds())
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return "%02d:%02d:%02d" % (h, m, s)

def get_headers():
    """Build a request-header dict carrying a randomly chosen User-Agent.

    Uses the fake_useragent library so each request presents a realistic,
    varying browser UA string.
    """
    return {'User-Agent': UserAgent().random}

# def get_headers():
#     user_agent_list = [ \
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \
#         "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \
#         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \
#         "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
#         "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
#         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
#         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
#         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
#         "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \
#         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \
#         "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
#     ]
#     UserAgent=random.choice(user_agent_list)
#     headers = {'User-Agent': UserAgent}
#     return headers


def check_ip(target_url, ip, timeout=10):
    '''
    Test whether a proxy IP works by fetching a probe URL through it.

    :param target_url: a URL known to answer HTTP 200, e.g. www.baidu.com
    :param ip: candidate proxy as 'host:port'
    :param timeout: seconds to wait for the proxied request. Free proxies
        are slow, so the default is generous — 5s rejected many proxies
        that were merely sluggish rather than dead.
    :return: True if the proxied GET returned HTTP 200, else False
    '''
    headers = get_headers()  # randomized User-Agent
    proxies = {"http": "http://" + ip, "https": "https://" + ip}
    try:
        status = requests.get(url=target_url, proxies=proxies,
                              headers=headers, timeout=timeout).status_code
    except requests.RequestException:
        # Fix: the original bare `except:` swallowed everything, including
        # KeyboardInterrupt/SystemExit. Connection errors, timeouts and
        # proxy failures all derive from RequestException.
        return False
    return status == 200

def find_ip(type, pagenum, target_url, path):
    '''
    Scrape one page of xicidaili.com proxy listings and save the working IPs.

    :param type: listing category — 1: 国内高匿 (nn), 2: 国内透明 (nt),
                 3: 国外高匿 (wn), 4: 国外普通 (wt)
    :param pagenum: 1-based page number within the chosen listing
    :param target_url: URL handed to check_ip() to validate each proxy
    :param path: file that accumulates usable 'ip:port' lines
    :return:
    '''
    # Fix: this dict was named `list`, shadowing the builtin of that name.
    base_urls = {
        '1': 'http://www.xicidaili.com/nn/',
        '2': 'http://www.xicidaili.com/nt/',
        '3': 'http://www.xicidaili.com/wn/',
        '4': 'http://www.xicidaili.com/wt/'
    }
    url = base_urls[str(type)] + str(pagenum)
    headers = get_headers()
    html = requests.get(url=url, headers=headers, timeout=5).text
    selector = etree.HTML(html)
    # NOTE(review): on xici the table rows alternate class="odd" / no class,
    # so this selector likely sees only every other row — confirm against
    # the live page and widen the XPath if so.
    infos = selector.xpath('//*[@class="odd"]')
    for info in infos:
        # td[2] is the IP address column, td[3] the port column.
        ip = info.xpath('td[2]/text()')[0].strip() + ':' + info.xpath('td[3]/text()')[0].strip()
        if check_ip(target_url, ip):
            write(path=path, text=ip)
            print(ip)
        else:
            print(ip + '  无效ip')

def get_ip(target_url, path):
    """Rebuild the IP pool file: crawl every xici listing category (2 pages
    each), validating each proxy against *target_url* in parallel threads.

    :param target_url: probe URL passed through to check_ip().
    :param path: pool file; emptied first, then filled with working IPs.
    """
    truncatefile(path)  # start from an empty pool each run
    start_time = datetime.datetime.now()
    threads = []
    # Fix: the original iterated range(1,4)/range(1,3) and then passed
    # type+1 / pagenum+1, which crawled only categories 2-4 and pages 2-3 —
    # skipping category 1 (国内高匿, the most useful listing) and page 1 of
    # every category. Iterate the intended values directly instead.
    for category in range(1, 5):
        for pagenum in range(1, 3):
            t = threading.Thread(target=find_ip,
                                 args=(category, pagenum, target_url, path))
            threads.append(t)
    print('开始爬取代理ip')
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('爬取完成')
    end_time = datetime.datetime.now()
    diff = get_time_diff(start_time, end_time)
    ips = read(path)
    print('一共爬取代理ip: %s 个,共耗时: %s \n' % (len(ips), diff))

if __name__ == '__main__':
    # Pool file is rewritten on every run.
    path = 'ips.text'  # NOTE(review): probably meant 'ips.txt' — confirm
    # Probe URL each candidate proxy must fetch successfully.
    target_url = 'http://www.cnblogs.com/TurboWay/'
    get_ip(target_url=target_url, path=path)
问题补充:

我学着网上的代码,自己结合着写了一个,但是全部都是无效IP。用别人的代码,大概有1/3是有效的IP。大家来帮我看看是为什么。

从MH到其他的主页 从MH到其他 | 初学一级 | 园豆:140
提问于:2018-08-08 00:25
< >
分享
最佳答案
0

1.你的目标网页就算访问得了,返回的也是403

2.就功能来说,你的timeout设置的5秒可能有点短

我使用你的代码,将target url改成百度,超时设置10s,得到的结果是大约有一成可用IP

收获园豆:20
Masako | 小虾三级 |园豆:1893 | 2018-08-08 09:53

我把超时设置为10s,确实可以获取一些可用IP了,但是可用率太低了。请问知道是什么原因吗?是我测试IP有效性的方法不对还是说这个网站爬下来的IP就是大多数不可用。另外,请问请求头是可以随意设置的吗,会不会和请求头有关系?

从MH到其他 | 园豆:140 (初学一级) | 2018-08-08 10:11

@从MH到其他: 毕竟是免费的,找得到可用的就不错了。如果这免费的全都可用,怎么还有那么多人付费买ip池,可想而知。请求头只是给网站看看,影响不大。

Masako | 园豆:1893 (小虾三级) | 2018-08-08 10:19

@Masako: 学习了!我刚刚搜了下,付费的也不贵,自己学习的话有那么十几个IP可以了,免费的稳定性也差。

从MH到其他 | 园豆:140 (初学一级) | 2018-08-08 10:21
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册