package cn.g4b.qhc.micro.service.webmagic;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
@author code4crafter@gmail.com <br>
*/
public class SinaBlogProcessor implements PageProcessor {
public static final String URL_LIST = "https://www\.qichacha\.com/search\?key=[\u4e00-\u9fa5]+#p:\d+";
public static final String URL_POST = "https://www\.qichacha\.com/firm_\w+\.html";
private static int count = 0;
private Site site = Site.me().setRetryTimes(3).
setSleepTime(1000)
// .setDomain("blog.sina.com.cn")
// .setSleepTime(1300)
.addHeader("Cookie", "QCCSESSID=101a9lg15rjace0afekmfgh444; zg_did=%7B%22did%22%3A%20%2216967f9e5b44f-0667a471b9d1be-b781636-100200-16967f9e5c85d%22%7D; UM_distinctid=16967f9eb241f-080db49ce7f914-b781636-100200-16967f9eb27134; _uab_collina=155222762707210960020939; acw_tc=0e77721e15522276280874600e9a5599bfc7a42d8f5b3d06c61bfea8e0; CNZZDATA1254842228=1014061049-1552226827-https%253A%252F%252Fwww.baidu.com%252F%7C1552743450; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1552227626,1552385981,1552748361; hasShow=1; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201552748359331%2C%22updated%22%3A%201552748381909%2C%22info%22%3A%201552227624453%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.baidu.com%22%2C%22cuid%22%3A%20%22d7022906007b7e4d69a3aa7b77a92aee%22%7D; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1552748383")
.addHeader("Accept", " text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,/;q=0.8")
.addHeader("Accept-Encoding", " gzip, deflate, br")
.addHeader("Accept-Language", " zh-CN,zh;q=0.9")
.addHeader("Cache-Control", " no-cache")
.addHeader("Connection", " keep-alive")
.addHeader("Host", " www.qichacha.com")
.addHeader("Upgrade-Insecure-Requests", " 1")
.addHeader("User-Agent", " Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3664.3 Safari/537.36");
@Override
public void process(Page page) {
//列表页
if (!page.getUrl().regex(URL_POST).match()) {
page.addTargetRequests(page.getHtml().xpath("//table[@class=\"m_srchList\"]").links().regex(URL_POST).all());
page.addTargetRequests(page.getHtml().links().regex("javascript\\:getSearchPage*").all());
//文章页
} else {
count++;
System.out.println(page.getHtml().xpath("//*[@id=\"Cominfo\"]/table[2]/tbody/tr[1]/td[2]/text()"));
}
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
// Spider.create(new SinaBlogProcessor()).addUrl("http://blog.sina.com.cn/s/articlelist_1487828712_0_1.html")
// .run();
// Spider.create(new ListProcessor()).addUrl("https://www.qichacha.com/search?key=%E5%B0%8F%E7%B1%B3").thread(5).run();
Spider.create(new SinaBlogProcessor()).addUrl("https://www.qichacha.com/search?key=小米#p:1").addUrl("https://www.qichacha.com/search?key=小米#p:2")
.run();
System.out.println("秒,抓取了" + count + "条记录");
}
}