import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Spider; import us.codecraft.webmagic.processor.PageProcessor; public class GithubRepoPageProcessor implements PageProcessor { // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override // process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑 public void process(Page page) { // 部分二:定义如何抽取页面信息,并保存下来 page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); if (page.getResultItems().get("name") == null) { //skip this page page.setSkip(true); } page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); // 部分三:从页面发现后续的url地址来抓取 page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); } @Override public Site getSite() { return site; } public static void main(String[] args) { Spider.create(new GithubRepoPageProcessor()) //从"https://github.com/code4craft"开始抓 .addUrl("https://github.com/code4craft") //开启5个线程抓取 .thread(5) //启动爬虫 .run(); } }
报错
[main] INFO us.codecraft.webmagic.model.OOSpider - Spider github.com started! [pool-1-thread-1] WARN us.codecraft.webmagic.downloader.HttpClientDownloader - download page https://github.com/code4craft error javax.net.ssl.SSLException: Received fatal alert: protocol_version at sun.security.ssl.Alerts.getSSLException(Unknown Source) at sun.security.ssl.Alerts.getSSLException(Unknown Source) at sun.security.ssl.SSLSocketImpl.recvAlert(Unknown Source) at sun.security.ssl.SSLSocketImpl.readRecord(Unknown Source) at sun.security.ssl.SSLSocketImpl.performInitialHandshake(Unknown Source) at sun.security.ssl.SSLSocketImpl.startHandshake(Unknown Source) at sun.security.ssl.SSLSocketImpl.startHandshake(Unknown Source) at org.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:396) at org.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:355) at org.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142) at org.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:373) at org.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:381) at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:237) at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:185) at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89) at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:111) at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) at us.codecraft.webmagic.downloader.HttpClientDownloader.download(HttpClientDownloader.java:85) at us.codecraft.webmagic.Spider.processRequest(Spider.java:404) at us.codecraft.webmagic.Spider.access$000(Spider.java:61) at us.codecraft.webmagic.Spider$1.run(Spider.java:320) at us.codecraft.webmagic.thread.CountableThreadPool$1.run(CountableThreadPool.java:74) at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) at java.lang.Thread.run(Unknown Source) Exception in thread "pool-1-thread-1" java.lang.NoClassDefFoundError: org/apache/commons/collections/CollectionUtils at us.codecraft.webmagic.Spider.onSuccess(Spider.java:350) at us.codecraft.webmagic.Spider$1.run(Spider.java:321) at us.codecraft.webmagic.thread.CountableThreadPool$1.run(CountableThreadPool.java:74) at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source) at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source) at java.lang.Thread.run(Unknown Source) Caused by: java.lang.ClassNotFoundException: org.apache.commons.collections.CollectionUtils at java.net.URLClassLoader.findClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source) at java.lang.ClassLoader.loadClass(Unknown Source) ... 6 more [main] INFO us.codecraft.webmagic.model.OOSpider - Spider github.com closed! 1 pages downloaded.
我也遇到过这个问题,maven源上的包有问题,把 https://github.com/code4craft/webmagic/tree/master/webmagic-core 代码下载下来,打包然后替换掉maven本地仓库的包,就好了。