 悬赏园豆:20
                [待解决问题]
                悬赏园豆:20
                [待解决问题] 
            
                 
        这是我扩展的Extractor
1 package isearch.heritrix;
2
3
4 import java.io.IOException;
5 import java.io.UnsupportedEncodingException;
6 import java.util.ArrayList;
7 import java.util.Collection;
8 import java.util.HashMap;
9 import java.util.Iterator;
10 import java.util.logging.Level;
11 import java.util.logging.Logger;
12 import java.util.regex.Matcher;
13 import java.util.regex.Pattern;
14
15 import org.apache.commons.httpclient.URIException;
16 import org.archive.crawler.datamodel.CoreAttributeConstants;
17 import org.archive.crawler.datamodel.CrawlURI;
18 import org.archive.crawler.extractor.Extractor;
19 import org.archive.crawler.extractor.Link;
20 import org.archive.io.ReplayCharSequence;
21 import org.archive.util.HttpRecorder;
22
23 public class IsearchExtractor extends Extractor {
24
25 private static Logger logger = Logger.getLogger(IsearchExtractor.class.getName());
26
27 private static final long serialVersionUID = 1L;
28 public IsearchExtractor(String name){
29 this(name,"");
30 }
31
32 public IsearchExtractor(String name, String description) {
33 super(name, description);
34 }
35 // 用于内容页面匹配
36 public static final String regexFilter = "http://product\\.dangdang\\.com/product\\.aspx\\?product_id=.+$";
37 public static final String regexLink = "<a\\s+href\\s*=\\s*(\"([^\"]*)\"|[^\\s>])\\s*>";
38 @Override
39 protected void extract(CrawlURI curi) {
40 System.out.println("当前url\n"+curi.toString());
41 ReplayCharSequence cs=null; //保存抓取内容
42 try {
43 HttpRecorder hr=curi.getHttpRecorder();
44 if(hr==null)
45 {
46 throw new IOException("内容为空!");
47 }
48 cs = hr.getReplayCharSequence();
49 } catch (Exception e) {
50 curi.addLocalizedError(this.getName(), e, "get of replay char sequence失败!"+e.getMessage());
51 logger.log(Level.SEVERE,"get of replay char sequence失败在 in"+Thread.currentThread().getName(),e);
52 }
53 //没有抓到内容
54 if(cs==null)
55 return;
56 String content=cs.toString(); //将抓取回来的网页内容转换为字符串
57 //测试打印内容
58 try {
59 byte[] byteGB2312 = content.getBytes("GB2312");
60 content = new String(byteGB2312,"GB2312");
61 } catch (UnsupportedEncodingException e) {
62 System.out.println("编码转换错误!");
63 e.printStackTrace();
64 }
65 System.out.println("内容\n"+content);
66
67 Pattern pattern=Pattern.compile(regexLink, Pattern.CASE_INSENSITIVE);
68 Matcher m=pattern.matcher(content);
69 while(m.find())
70 {
71 String newUrl = m.group(2);
72 System.out.println("匹配content找到的链接\n"+newUrl);
73 if(newUrl.startsWith("http"))
74 {
75 Matcher urlMatcher = Pattern.compile(regexFilter).matcher(newUrl);
76 if(urlMatcher.find())
77 {
78 System.out.println("匹配regex的链接\n"+newUrl);
79 addLinkFromString(curi, newUrl, "", Link.NAVLINK_HOP);
80 }
81 }
82 /* else if(!newUrl.toLowerCase().startsWith("mailto")&&!newUrl.toLowerCase().startsWith("javascript"))
83 {
84 if(newUrl.trim().startsWith("/"))
85 {
86 newUrl = newUrl.trim().substring(1).trim();
87 }
88 newUrl = ""
89 }*/
90 }
91 }
92 public void addLinkFromString(CrawlURI curi,String uri,CharSequence context,char hopType)
93 {
94 try {
95 curi.createAndAddLinkRelativeToBase(uri, context.toString(), hopType);
96 } catch (URIException e) {
97 if(getController()!=null)
98 {
99 getController().logUriError(e, curi.getUURI(), uri);
100 }
101 else
102 {
103 logger.info("失败 CreateAndAddLinkRelativeToBase "+
104 curi+","+uri+","+context+","+hopType+":"+e);
105 }
106 }
107
108 }
109 }
这是我扩展的FrontierScheduler
1 package isearch.heritrix;
2
3 import org.archive.crawler.datamodel.CandidateURI;
4 import org.archive.crawler.postprocessor.FrontierScheduler;
5
6
7 public class IsearchFrontierScheduler extends FrontierScheduler {
8
9 /**
10 *
11 */
12 private static final long serialVersionUID = 1L;
13
14 public IsearchFrontierScheduler(String name) {
15 super(name);
16 // TODO Auto-generated constructor stub
17 }
18
19 @Override
20 protected void schedule(CandidateURI caUri) {
21 String url = caUri.toString();
22 String currentUrl = caUri.toString();
23 String regex = "product\\.dangdang\\.com/product\\.aspx\\?product_id=";
24 if(currentUrl.indexOf(regex)!=-1||currentUrl.indexOf("robots.txt")!=-1
25 ||currentUrl.indexOf("dns:")!=-1)
26 {
27 if(currentUrl.indexOf("#")==-1)
28 getController().getFrontier().schedule(caUri);
29 }
30 else
31 {
32 return ;
33 }
34 if(url.endsWith(".zip")||url.endsWith(".swf")||url.endsWith(".rar")||url.endsWith(".exe")||
35 url.endsWith(".pdf")||url.endsWith(".doc")||url.endsWith(".xls")||
36 url.endsWith(".ppt"))
37 {
38 return;
39 }
40 }
41 }
大家给点意见!我的目的是只想抓取当当网的商品页面,其他外链都不要。在这个过程中我遇到两个主要问题:要么连外链一起抓回来,要么就抓个当当首页就结束了。我不要首页,只需要将当当商品页面(符合正则表达式:http://product\\.dangdang\\.com/product\\.aspx\\?product_id=.+$)保存下来就好了