深入学习 Lucene 3.0 索引段
Lucene 索引(index)由若干段(segment)组成,每一段由若干的文档(document)组成,每一个文档由若干的域(field)组成,每一个域由若干的项(term)组成。
生成索引的代码:
// 创建两个 Document 对象File f1 = new File("d:/lucene/demo1.txt");File f2 = new File("d:/lucene/demo2.txt");Document doc1 = new Document();doc1.add(new Field("path", f1.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));doc1.add(new Field("content", new FileReader(f1)));Document doc2 = new Document();doc2.add(new Field("path", f2.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));doc2.add(new Field("content", new FileReader(f2)));// 创建索引对象IndexWriter writer = new IndexWriter(FSDirectory.open(indexPath),new StandardAnalyzer(Version.LUCENE_30), true,IndexWriter.MaxFieldLength.LIMITED);// 是否复合索引writer.setUseCompoundFile(false);writer.addDocument(doc1);writer.addDocument(doc2);writer.optimize();writer.close(); int version = genInput.readInt(); if (version == FORMAT_LOCKLESS) { long gen0 = genInput.readLong(); long gen1 = genInput.readLong(); message("fallback check: " + gen0 + "; " + gen1); if (gen0 == gen1) { // The file is consistent. genB = gen0; break; } } IndexOutput genOutput = dir.createOutput(IndexFileNames.SEGMENTS_GEN); try { genOutput.writeInt(FORMAT_LOCKLESS); genOutput.writeLong(generation); genOutput.writeLong(generation); } finally { genOutput.close(); } segnOutput.writeInt(CURRENT_FORMAT); // write FORMAT segnOutput.writeLong(++version); // every write changes // the index segnOutput.writeInt(counter); // write counter segnOutput.writeInt(size()); // write infos for (int i = 0; i < size(); i++) { info(i).write(segnOutput); // 此处参考 2 } segnOutput.writeStringStringMap(userData);// 此处参考 4 segnOutput.prepareCommit();// 此处写入长整型的校验码 void write(IndexOutput output) throws IOException { output.writeString(name);// 此处参考 3 output.writeInt(docCount); output.writeLong(delGen); output.writeInt(docStoreOffset); if (docStoreOffset != -1) { output.writeString(docStoreSegment); output.writeByte((byte) (docStoreIsCompoundFile ? 1:0)); } output.writeByte((byte) (hasSingleNormFile ? 
1:0)); if (normGen == null) { output.writeInt(NO); } else { output.writeInt(normGen.length); for(int j = 0; j < normGen.length; j++) { output.writeLong(normGen[j]); } } output.writeByte(isCompoundFile); output.writeInt(delCount); output.writeByte((byte) (hasProx ? 1:0)); output.writeStringStringMap(diagnostics); // 此处参考 4 和 5 } public void writeString(String s) throws IOException { UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); writeVInt(utf8Result.length);// 写入名称的长度 writeBytes(utf8Result.result, 0, utf8Result.length);// 写入名称的字节数组,长度为 utf8Result.length } public void writeVInt(int i) throws IOException { while ((i & ~0x7F) != 0) {// 8 位以上是否存在数据 writeByte((byte)((i & 0x7f) | 0x80));// 第 8 位设置为 1 ,表示高位还有数据 i >>>= 7;// 算术右移 7 位 } writeByte((byte)i); } if (map == null) { writeInt(0); } else { writeInt(map.size()); for(final Map.Entry<String, String> entry: map.entrySet()) { writeString(entry.getKey()); // 此处参考 3 writeString(entry.getValue()); } } } Map<String,String> diagnostics = new HashMap<String,String>(); diagnostics.put("source", source); diagnostics.put("lucene.version", Constants.LUCENE_VERSION); // 大家可以看一下 Constants 类,其实它取得 Java 的环境变量 diagnostics.put("os", Constants.OS_NAME+""); diagnostics.put("os.arch", Constants.OS_ARCH+""); diagnostics.put("os.version", Constants.OS_VERSION+""); diagnostics.put("java.version", Constants.JAVA_VERSION+""); diagnostics.put("java.vendor", Constants.JAVA_VENDOR+""); if (details != null) { diagnostics.putAll(details); } info.setDiagnostics(diagnostics);