首页新闻找找看学习计划

我用heritrix抓取当当网的商品页面,结果只抓到首页就结束了,请大家帮忙看看,谢谢!

0
悬赏园豆:20 [待解决问题]

这是我扩展的Extractor

  1 package isearch.heritrix;
2
3
4 import java.io.IOException;
5 import java.io.UnsupportedEncodingException;
6 import java.util.ArrayList;
7 import java.util.Collection;
8 import java.util.HashMap;
9 import java.util.Iterator;
10 import java.util.logging.Level;
11 import java.util.logging.Logger;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15 import org.apache.commons.httpclient.URIException;
16 import org.archive.crawler.datamodel.CoreAttributeConstants;
17 import org.archive.crawler.datamodel.CrawlURI;
18 import org.archive.crawler.extractor.Extractor;
19 import org.archive.crawler.extractor.Link;
20 import org.archive.io.ReplayCharSequence;
21 import org.archive.util.HttpRecorder;
22
23 public class IsearchExtractor extends Extractor {
24
25 private static Logger logger = Logger.getLogger(IsearchExtractor.class.getName());
26
27 private static final long serialVersionUID = 1L;
28 public IsearchExtractor(String name){
29 this(name,"");
30 }
31
32 public IsearchExtractor(String name, String description) {
33 super(name, description);
34 }
35 // 用于内容页面匹配
36 public static final String regexFilter = "http://product\\.dangdang\\.com/product\\.aspx\\?product_id=.+$";
37 public static final String regexLink = "<a\\s+href\\s*=\\s*(\"([^\"]*)\"|[^\\s>])\\s*>";
38 @Override
39 protected void extract(CrawlURI curi) {
40 System.out.println("当前url\n"+curi.toString());
41 ReplayCharSequence cs=null; //保存抓取内容
42 try {
43 HttpRecorder hr=curi.getHttpRecorder();
44 if(hr==null)
45 {
46 throw new IOException("内容为空!");
47 }
48 cs = hr.getReplayCharSequence();
49 } catch (Exception e) {
50 curi.addLocalizedError(this.getName(), e, "get of replay char sequence失败!"+e.getMessage());
51 logger.log(Level.SEVERE,"get of replay char sequence失败在 in"+Thread.currentThread().getName(),e);
52 }
53 //没有抓到内容
54 if(cs==null)
55 return;
56 String content=cs.toString(); //将抓取回来的网页内容转换为字符串
57 //测试打印内容
58 try {
59 byte[] byteGB2312 = content.getBytes("GB2312");
60 content = new String(byteGB2312,"GB2312");
61 } catch (UnsupportedEncodingException e) {
62 System.out.println("编码转换错误!");
63 e.printStackTrace();
64 }
65 System.out.println("内容\n"+content);
66
67 Pattern pattern=Pattern.compile(regexLink, Pattern.CASE_INSENSITIVE);
68 Matcher m=pattern.matcher(content);
69 while(m.find())
70 {
71 String newUrl = m.group(2);
72 System.out.println("匹配content找到的链接\n"+newUrl);
73 if(newUrl.startsWith("http"))
74 {
75 Matcher urlMatcher = Pattern.compile(regexFilter).matcher(newUrl);
76 if(urlMatcher.find())
77 {
78 System.out.println("匹配regex的链接\n"+newUrl);
79 addLinkFromString(curi, newUrl, "", Link.NAVLINK_HOP);
80 }
81 }
82 /* else if(!newUrl.toLowerCase().startsWith("mailto")&&!newUrl.toLowerCase().startsWith("javascript"))
83 {
84 if(newUrl.trim().startsWith("/"))
85 {
86 newUrl = newUrl.trim().substring(1).trim();
87 }
88 newUrl = ""
89 }*/
90 }
91 }
92 public void addLinkFromString(CrawlURI curi,String uri,CharSequence context,char hopType)
93 {
94 try {
95 curi.createAndAddLinkRelativeToBase(uri, context.toString(), hopType);
96 } catch (URIException e) {
97 if(getController()!=null)
98 {
99 getController().logUriError(e, curi.getUURI(), uri);
100 }
101 else
102 {
103 logger.info("失败 CreateAndAddLinkRelativeToBase "+
104 curi+","+uri+","+context+","+hopType+":"+e);
105 }
106 }
107
108 }
109 }

这是我扩展的FrontierScheduler

 1 package isearch.heritrix;
2
3 import org.archive.crawler.datamodel.CandidateURI;
4 import org.archive.crawler.postprocessor.FrontierScheduler;
5
6
7 public class IsearchFrontierScheduler extends FrontierScheduler {
8
9 /**
10 *
11 */
12 private static final long serialVersionUID = 1L;
13
14 public IsearchFrontierScheduler(String name) {
15 super(name);
16 // TODO Auto-generated constructor stub
17 }
18
19 @Override
20 protected void schedule(CandidateURI caUri) {
21 String url = caUri.toString();
22 String currentUrl = caUri.toString();
23 String regex = "product\\.dangdang\\.com/product\\.aspx\\?product_id=";
24 if(currentUrl.indexOf(regex)!=-1||currentUrl.indexOf("robots.txt")!=-1
25 ||currentUrl.indexOf("dns:")!=-1)
26 {
27 if(currentUrl.indexOf("#")==-1)
28 getController().getFrontier().schedule(caUri);
29 }
30 else
31 {
32 return ;
33 }
34 if(url.endsWith(".zip")||url.endsWith(".swf")||url.endsWith(".rar")||url.endsWith(".exe")||
35 url.endsWith(".pdf")||url.endsWith(".doc")||url.endsWith(".xls")||
36 url.endsWith(".ppt"))
37 {
38 return;
39 }
40 }
41 }



大家给点意见!我的目的是只想抓取当当网的商品页面,其他外链都不要。在这个过程中我遇到两个主要问题:要么把外链一起抓回来,要么就只抓了当当首页就结束了。我不需要首页,只需要把符合正则表达式 http://product\\.dangdang\\.com/product\\.aspx\\?product_id=.+$ 的当当商品页面保存下来就好了。

情非得已swust的主页 情非得已swust | 初学一级 | 园豆:180
提问于:2011-10-10 10:45
< >
分享
所有回答(0)
清除回答草稿
   您需要登录以后才能回答,未注册用户请先注册