垂直搜索:用 Heritrix 抓取网页数据时,数据过滤不彻底
我在做垂直搜索,数据来源是太平洋电脑网(pconline.com.cn),我关心的是其中的手机信息。我定制了 Heritrix 的 FrontierScheduler 类,代码如下:
package my.processor;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.postprocessor.FrontierScheduler;

/**
 * A {@link FrontierScheduler} that forwards a candidate URI to the frontier
 * only when its string form passes the URL filter in
 * {@link #matchesCrawlScope(String)}; every other candidate is dropped.
 *
 * <p>The filter is plain text matching on the URI string. It does not parse
 * the URI, so host, path and query string are not distinguished.
 */
public class FrontierSchedulerForPconlineMobile extends FrontierScheduler {

    private static final Logger LOGGER =
            Logger.getLogger(FrontierSchedulerForPconlineMobile.class.getName());

    /**
     * Fragments that let a URI through when they occur ANYWHERE in its string
     * form.
     *
     * <p>NOTE(review): "robots.txt" and "dns:" are presumably listed so that
     * the crawler's own robots.txt / DNS lookups are not filtered out --
     * confirm against the Heritrix version in use.
     *
     * <p>NOTE(review): because these are substring matches, a URI on an
     * unrelated host that merely mentions one of these fragments (for example
     * in its query string) is accepted as well. If filtering looks incomplete,
     * this is one place to tighten.
     */
    private static final String[] ACCEPTED_FRAGMENTS = {
        "product.pconline.com.cn/mobile/",
        "product.pconline.com.cn/pdlib/",
        "img.pconline.com.cn/images/product/",
        "robots.txt",
        "dns:",
    };

    /**
     * Suffix that lets a URI through regardless of host or path.
     *
     * <p>NOTE(review): this accepts every URI ending in ".jpg", not only
     * images of the target site -- likely a second source of unwanted
     * downloads. Kept as-is because narrowing it changes what gets crawled.
     */
    private static final String ACCEPTED_SUFFIX = ".jpg";

    /**
     * Creates the scheduler.
     *
     * @param name processor name, passed unchanged to the superclass
     */
    public FrontierSchedulerForPconlineMobile(String name) {
        super(name);
    }

    /**
     * Schedules {@code caUri} with the controller's frontier if its string
     * form passes the filter; otherwise does nothing.
     *
     * <p>Any exception raised while filtering or scheduling is logged and
     * suppressed, so one failing candidate does not propagate out of this
     * method.
     *
     * @param caUri candidate URI to consider; must not be {@code null}
     */
    @Override
    protected void schedule(CandidateURI caUri) {
        String url = caUri.toString();
        try {
            if (matchesCrawlScope(url)) {
                getController().getFrontier().schedule(caUri);
            }
        } catch (Exception e) {
            // Best-effort by design: report through the logger (with the URL
            // for context) instead of dumping a bare stack trace to stderr.
            LOGGER.log(Level.WARNING, "Failed to schedule candidate URI: " + url, e);
        }
    }

    /**
     * Tells whether a URI string passes the filter.
     *
     * @param url string form of the candidate URI
     * @return {@code true} if {@code url} ends with {@link #ACCEPTED_SUFFIX}
     *         or contains any of {@link #ACCEPTED_FRAGMENTS}
     */
    private static boolean matchesCrawlScope(String url) {
        if (url.endsWith(ACCEPTED_SUFFIX)) {
            return true;
        }
        for (int i = 0; i < ACCEPTED_FRAGMENTS.length; i++) {
            if (url.indexOf(ACCEPTED_FRAGMENTS[i]) != -1) {
                return true;
            }
        }
        return false;
    }
}