首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 其他教程 > 开源软件 >

nutch源码阅读(7)-Generator

2013-07-01 
nutch源码阅读(7)-Generator继续向下看,第二个Job.....................................................

nutch源码阅读(7)-Generator

继续向下看,第二个Job

............................................................  // read the subdirectories generated in the temp    // output and turn them into segments    List<Path> generatedSegments = new ArrayList<Path>();    //读取上个job生成的多个fetchlist的segment    FileStatus[] status = fs.listStatus(tempDir);    try {      for (FileStatus stat : status) {        Path subfetchlist = stat.getPath();        //过滤掉不是以fetchlist开头的文件        if (!subfetchlist.getName().startsWith("fetchlist-")) continue;        // start a new partition job for this segment        //一个partition Job 对segment        Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);        generatedSegments.add(newSeg);      }    } catch (Exception e) {      LOG.warn("Generator: exception while partitioning segments, exiting ...");      fs.delete(tempDir, true);      return null;    }   if (generatedSegments.size() == 0) {      LOG.warn("Generator: 0 records selected for fetching, exiting ...");      LockUtil.removeLockFile(fs, lock);      fs.delete(tempDir, true);      return null;    }............................................................

?

?

?

// 这里主要是通过URLPartitioner来做的,具体按哪一种方式来分类,是通过参数来配置的,这里有PARTITION_MODE_DOMAIN、PARTITION_MODE_IP

// 可以配置,默认是按Url的hashCode来分。

  private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir,      int numLists) throws IOException {    // invert again, partition by host/domain/IP, sort by url hash    if (LOG.isInfoEnabled()) {      LOG.info("Generator: Partitioning selected urls for politeness.");    }    //产生一个新的目录,以当前时间明明    Path segment = new Path(segmentsDir, generateSegmentName());    //在上面的目录下,再产生一个特定的crawl_generate目录    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);    LOG.info("Generator: segment: " + segment);    NutchJob job = new NutchJob(getConf());    job.setJobName("generate: partition " + segment);    job.setInt("partition.url.seed", new Random().nextInt());    FileInputFormat.addInputPath(job, inputDir);    job.setInputFormat(SequenceFileInputFormat.class);    job.setMapperClass(SelectorInverseMapper.class);    job.setMapOutputKeyClass(Text.class);    job.setMapOutputValueClass(SelectorEntry.class);    job.setPartitionerClass(URLPartitioner.class);    job.setReducerClass(PartitionReducer.class);    job.setNumReduceTasks(numLists);    FileOutputFormat.setOutputPath(job, output);    job.setOutputFormat(SequenceFileOutputFormat.class);    job.setOutputKeyClass(Text.class);    job.setOutputValueClass(CrawlDatum.class);    job.setOutputKeyComparatorClass(HashComparator.class);    JobClient.runJob(job);    return segment;  }

?

?

?

热点排行