首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 其他教程 > 开源软件 >

nutch源码阅读(8)-Generator

2013-06-26 
nutch源码阅读(8)-Generator:接着看下最后一个Job。/** Update the CrawlDB so that the next generate won't include the same URLs. */

nutch源码阅读(8)-Generator

接着看下最后一个Job

?

?

  /**   * Update the CrawlDB so that the next generate won't include the same URLs.   */  public static class CrawlDbUpdater extends MapReduceBase implements      Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> {    long generateTime;    public void configure(JobConf job) {      generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);    }    public void map(Text key, CrawlDatum value, OutputCollector<Text,CrawlDatum> output,        Reporter reporter) throws IOException {      output.collect(key, value);    }    private CrawlDatum orig = new CrawlDatum();    private LongWritable genTime = new LongWritable(0L);    public void reduce(Text key, Iterator<CrawlDatum> values,        OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException {      genTime.set(0L);      //遍历相同url的crawlDatum      while (values.hasNext()) {        CrawlDatum val = values.next();        //判断是否生成过        if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {          LongWritable gt = (LongWritable) val.getMetaData().get(              Nutch.WRITABLE_GENERATE_TIME_KEY);          genTime.set(gt.get());          if (genTime.get() != generateTime) {            orig.set(val);            genTime.set(0L);            continue;          }        } else {          orig.set(val);        }      }      if (genTime.get() != 0L) {        //设置新的生成时间        orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);      }      output.collect(key, orig);    }  }

?

热点排行