/**
 * Crawl the articles on the Jianshu homepage.
 * I added every request header, copied from the request I captured with a packet sniffer.
 * Why can't I fetch the second page?
 */
var http = require("http");
var cheerio = require("cheerio");
var superagent = require("superagent");
var fs = require("fs");
var async = require("async");           // required but not used yet
var Eventproxy = require("eventproxy"); // required but not used yet

var ep = new Eventproxy();
var url = "http://www.jianshu.com/";
var temp = 1; // archive file counter, for logging and debugging

var artDetails = {};    // article details (likes, comments, reads, publish time)
var authorDetails = {}; // author details

start();

function start(){

    http.createServer(onRequest).listen(3000);

    function onRequest(req, res){

        res.end("crawl started"); // respond so the browser doesn't hang (was missing)

        function parseArtItem($item){
            var obj = {};
            // strip ALL whitespace so the regexes below can match across line breaks
            // (the original /\s/ had no g flag and only removed the first whitespace)
            var footHtml = $item.find(".meta").html().replace(/\s/g, "");

            var reg_read = /ic-list-read.+?<\/i>(\d+)/;
            var reg_comment = /ic-list-comments.+?<\/i>(\d+)/;
            var reg_like = /ic-list-like.+?<\/i>(\d+)/;
            var reg_money = /ic-list-money.+?<\/i>(\d+)/;

            obj.title = $item.find(".title").text().trim();
            // href starts with "/" and url already ends with "/", so drop one
            obj.url = url + $item.find(".title").attr("href").replace(/^\//, "");

            if(reg_read.test(footHtml)){
                obj.readNum = RegExp.$1;
            }
            if(reg_comment.test(footHtml)){
                obj.commentNum = RegExp.$1;
            }
            if(reg_like.test(footHtml)){
                obj.likeNum = RegExp.$1;
            }
            if(reg_money.test(footHtml)){
                obj.moneyNum = RegExp.$1;
            }

            return obj;
        }

        function getPage(page, cookie, csrf){

            console.log("About to crawl page", page);
            if(page > 5){
                console.log("Done crawling");
                var str = JSON.stringify(authorDetails, null, 2) + JSON.stringify(artDetails, null, 2);
                fs.appendFile("./data/test.txt", str, "utf8", function(err){
                    if(err){
                        console.log(err);
                    }
                });
                return;
            }

            // tell the server which article ids we have already seen
            var artIds = Object.keys(artDetails);
            var query = artIds.map(function(id){
                return "&seen_snote_ids%5B%5D=" + id;
            }).join("").slice(1);

            superagent
                .get(url)
                .query(query)
                .query("page=" + page)
                .set("Accept", "text/html, */*; q=0.01")
                .set("Accept-Encoding", "gzip, deflate, sdch, br")
                .set("Accept-Language", "zh-CN,zh;q=0.8")
                .set("Connection", "keep-alive")
                .set("Cookie", cookie)
                .set("Host", "www.jianshu.com")
                .set("Referer", "https://www.jianshu.com/")
                .set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2127.400 QQBrowser/9.5.10208.400")
                .set("X-CSRF-Token", csrf)
                .set("X-INFINITESCROLL", "true")
                .set("X-Requested-With", "XMLHttpRequest")
                .end(function(err, sres){
                    if(err){
                        return console.log(err);
                    }

                    // archive the raw response for debugging
                    fs.appendFile("./data/" + temp++ + ".txt", JSON.stringify(sres, null, 2), "utf8", function(err){
                        if(err) console.log(err);
                    });

                    // crawl the next page after a random delay (0-999 ms)
                    var delay = parseInt((Math.random() * 30000000) % 1000, 10);
                    var theCookie = sres.header["set-cookie"];
                    setTimeout(function(){
                        getPage(++page, theCookie, csrf);
                    }, delay);
                });
        }

        superagent.get(url).end(function(err, pres){
            if(err){
                return console.log(err);
            }

            var $ = cheerio.load(pres.text);
            var $lists = $("#list-container li");

            for(var i = 0, len = $lists.length; i < len; i++){
                var $item = $lists.eq(i);
                var artId = $item.attr("id").substr(5);                       // unique article id, from "note-xxxx"
                var authorId = $item.find(".nickname").attr("href").slice(3); // unique author id, from "/u/xxxx"

                if(!(artId in artDetails)){ // article not yet recorded
                    artDetails[artId] = parseArtItem($item);
                    if(!(authorId in authorDetails)){ // author not yet recorded
                        authorDetails[authorId] = {
                            "url": url + "u/" + authorId // url already ends with "/"
                        };
                    }
                }
            }

            var cookie = pres.header["set-cookie"];
            var csrf = "";
            var reg_csrf = /<meta name="csrf-token" content="(.+?)"/i;
            if(reg_csrf.test(pres.text)){
                csrf = RegExp.$1;
            }

            getPage(2, cookie, csrf); // page 1 is the homepage we just parsed
        });
    }
}
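A stripped-down sketch for isolating the problem: skip the local server and the archive bookkeeping, fetch the homepage once, reuse its set-cookie and csrf-token, and request only page 2 while logging the status and the start of the body. Joining set-cookie into a single Cookie string is my assumption about what the server expects, since set-cookie comes back as an array of full cookie strings.

var superagent = require("superagent");

var base = "http://www.jianshu.com/";

superagent.get(base).end(function(err, pres){
    if(err) return console.error("homepage request failed:", err);

    // set-cookie is an array like ["a=1; path=/; ...", ...]; keep only the
    // "name=value" pairs and join them into one Cookie header string (assumption)
    var cookie = (pres.header["set-cookie"] || []).map(function(c){
        return c.split(";")[0];
    }).join("; ");

    var m = /<meta name="csrf-token" content="(.+?)"/i.exec(pres.text);
    var csrf = m ? m[1] : "";

    superagent
        .get(base)
        .query("page=2")
        .set("Cookie", cookie)
        .set("X-CSRF-Token", csrf)
        .set("X-INFINITESCROLL", "true")
        .set("X-Requested-With", "XMLHttpRequest")
        .end(function(err2, sres){
            if(err2) return console.error("page 2 request failed:", err2);
            console.log("status:", sres.status);
            console.log("body starts with:", sres.text.slice(0, 200));
        });
});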
You're writing a crawler with jQuery? Have you solved the cross-origin problem? Also, you say "ajax" but I don't see any $.ajax in your code. You could write this in Python instead; it's simpler.
I'm using Node.js. The ajax GET request is simulated with superagent, and that $ is the cheerio library (it parses response.text into a jQuery-like object so the data is easy to extract). A minimal sketch of the pattern follows.
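Roughly like this (example.com is just a placeholder URL): superagent sends, from the server side, the GET request that a browser's $.ajax would send, and cheerio turns the response HTML into a jQuery-like $, so no browser and no same-origin restriction is involved.

var superagent = require("superagent");
var cheerio = require("cheerio");

superagent
    .get("http://example.com/")                // plain server-side GET, no browser
    .set("X-Requested-With", "XMLHttpRequest") // header that marks the request as "ajax"
    .end(function(err, res){
        if(err) return console.error(err);
        var $ = cheerio.load(res.text);        // parse the HTML string into a jq-like $
        console.log($("title").text());        // query it with the usual selectors
    });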
@小明1992: I've never used Node.js, but I have written crawlers in Python.
@DanBrown: Oh, I'd heard before that Python is friendlier for writing crawlers, but since I have some front-end background, I picked up Node.js and Vue this past month and haven't properly learned either one, so I don't dare take on Python in the short term.
@小明1992: If you're only crawling a single site, you don't need to spend time learning Python; just take an existing script and tweak it. Want me to send you a demo?
@DanBrown: Sure! But won't setting up the environment take a lot of time?
To be honest, it's not that I'm afraid of the hassle; I'm afraid of learning lots of things without getting good at any of them, and my time is tight right now... I didn't even dare use npm before. After nervously setting it up, I found npm is actually great: apart from webpack configuration being unfriendly to beginners, it boosts my productivity a lot.
@小明1992: Add me on QQ: 820398513