首页新闻找找看学习计划

python爬虫 大佬 请教下 为什么爬取的数据有时能爬到 有时有爬不到, 代码如下:

0
[已关闭问题] 关闭于 2020-02-15 13:31

import time
import requests
from lxml import etree
import pymysql

class GuPiao_spider():

def __init__(self):
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0'
    }

    self.url = 'http://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/{}/ajax/1/'  # 每页的网址
    # self.content_url = 'http://qd.10jqka.com.cn/quote.php?cate=real&type=stock&return=json&callback=showStockData&code='  # 动态网页的网址

def get_pages(self):
    '''
    获取网站总页数
    :return:
    '''
    url = 'http://q.10jqka.com.cn/index/index/board/all/field/zdf/order/desc/page/1/ajax/1/'
    html = requests.get(url, headers=self.headers).content.decode('gbk')
    html_path = etree.HTML(html)
    page_num = html_path.xpath('//*[@id="m-page"]/span/text()')[0].split('/')[1]
    return page_num

def get_content(self, url):
    '''
    :param url: 爬取每页的源代码,并获取数据
    :return: 返回所需数据的列表
    '''
    html = requests.get(url, headers=self.headers).content.decode('gbk')
    html_path = etree.HTML(html)
    tr = html_path.xpath('//tbody/tr')
    content_list = []
    for td in tr:
        info = {}
        title = td.xpath('td[3]/a/text()')[0]
        code = td.xpath('td[2]/a/text()')[0]
        li = td.xpath('td/text()')
        info['序号'] = li[0]
        info['代码'] = code
        info['名称'] = title
        info['现价'] = li[1]
        info['涨跌幅'] = li[2]
        info['涨跌'] = li[3]
        info['涨速'] = li[4]
        info['换手'] = li[5]
        info['量比'] = li[6]
        info['振幅'] = li[7]
        info['成交额'] = li[8]
        info['流通股'] = li[9]
        info['流动市值'] = li[10]
        info['市赢率'] = li[11]
        content_list.append(info)
    return content_list

# def save_mysql(self,sql, data):
#     '''
#     连接mysql数据库,把数据存到数据库中
#     :return:
#     '''
#     conn = pymysql.connect(
#         host = 'localhost',
#         user = 'root',
#         password = '123456',
#         port = 3306,
#         db = 'test'
#     )
#     cur = conn.cursor()
#     cur.execute(sql,data)
#     conn.commit()

def run(self):
    sql = 'insert into tonghuashun(序号,代码,名称,现价,涨跌幅,涨跌,涨速,换手,量比,振幅,成交额,流通股,流动市值,市赢率) values (%(序号)s,%(代码)s,%(名称)s,%(现价)s,' \
          '%(涨跌幅)s,%(涨跌)s,%(涨速)s,%(换手)s,' \
          '%(量比)s,%(振幅)s,%(成交额)s,%(流通股)s,' \
          '%(流动市值)s,%(市盈率)s)'
    page_num = self.get_pages()  # 网站总页数

    count = 2  # 用计数器表示爬取的第几页 带入self.url中
    while True:
        print('正在爬取第{}页数据...............'.format(count))

        url = self.url.format(str(count))
        # print(url)
        content_list = self.get_content(url)
        for data in content_list:  # 循环列表  得到字典数据
            print(data)
            # self.save_mysql(sql, data)

        count += 1
        time.sleep(3)

        if count >= int(page_num):
            return False

if name == 'main':
spider = GuPiao_spider()
spider.run()

Bruce-张的主页 Bruce-张 | 菜鸟二级 | 园豆:202
提问于:2020-02-14 17:07
< >
分享
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册