首页 新闻 搜索 专区 学院

webmagic 如何抓取所有分页的详情数据?目前只能抓取到第一页的数据

0
[待解决问题]

package cn.g4b.qhc.micro.service.webmagic;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.ArrayList;
import java.util.List;

/**

  • @author code4crafter@gmail.com <br>
    */
    public class SinaBlogProcessor implements PageProcessor {

    public static final String URL_LIST = "https://www\.qichacha\.com/search\?key=[\u4e00-\u9fa5]+#p:\d+";

    public static final String URL_POST = "https://www\.qichacha\.com/firm_\w+\.html";
    private static int count = 0;
    private Site site = Site.me().setRetryTimes(3).
    setSleepTime(1000)
    // .setDomain("blog.sina.com.cn")
    // .setSleepTime(1300)
    .addHeader("Cookie", "QCCSESSID=101a9lg15rjace0afekmfgh444; zg_did=%7B%22did%22%3A%20%2216967f9e5b44f-0667a471b9d1be-b781636-100200-16967f9e5c85d%22%7D; UM_distinctid=16967f9eb241f-080db49ce7f914-b781636-100200-16967f9eb27134; _uab_collina=155222762707210960020939; acw_tc=0e77721e15522276280874600e9a5599bfc7a42d8f5b3d06c61bfea8e0; CNZZDATA1254842228=1014061049-1552226827-https%253A%252F%252Fwww.baidu.com%252F%7C1552743450; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1552227626,1552385981,1552748361; hasShow=1; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201552748359331%2C%22updated%22%3A%201552748381909%2C%22info%22%3A%201552227624453%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.baidu.com%22%2C%22cuid%22%3A%20%22d7022906007b7e4d69a3aa7b77a92aee%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1552748383")
    .addHeader("Accept", " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8")
    .addHeader("Accept-Encoding", " gzip, deflate, br")
    .addHeader("Accept-Language", " zh-CN,zh;q=0.9")
    .addHeader("Cache-Control", " no-cache")
    .addHeader("Connection", " keep-alive")
    .addHeader("Host", " www.qichacha.com")
    .addHeader("Upgrade-Insecure-Requests", " 1")
    .addHeader("User-Agent", " Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3664.3 Safari/537.36");

    @Override
    public void process(Page page) {

     //列表页
     if (!page.getUrl().regex(URL_POST).match()) {
         page.addTargetRequests(page.getHtml().xpath("//table[@class=\"m_srchList\"]").links().regex(URL_POST).all());
         page.addTargetRequests(page.getHtml().links().regex("javascript\\:getSearchPage*").all());
    
         //文章页
     } else {
         count++;
         System.out.println(page.getHtml().xpath("//*[@id=\"Cominfo\"]/table[2]/tbody/tr[1]/td[2]/text()"));
    
     }

    }

    @Override
    public Site getSite() {
    return site;
    }

    public static void main(String[] args) {
    // Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
    // .run();

// Spider.create(new ListProcessor()).addUrl("https://www.qichacha.com/search?key=%E5%B0%8F%E7%B1%B3").thread(5).run();
Spider.create(new SinaBlogProcessor()).addUrl("https://www.qichacha.com/search?key=小米#p:1").addUrl("https://www.qichacha.com/search?key=小米#p:2")
.run();
System.out.println("秒,抓取了" + count + "条记录");
}
}

dzblogs的主页 dzblogs | 菜鸟二级 | 园豆:202
提问于:2019-03-17 19:40
< >
分享
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册