Lucene 第一个Lucene例子
这是第一个Lucene例子,使用的版本是lucene-4.0.0。已知问题:用中文关键词查询时没有返回结果。
1.创建索引
package lucene.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index from the {@code .txt} files found directly inside a data directory.
 *
 * <p>Usage: construct an {@code Indexer} (which opens the {@link IndexWriter}), call
 * {@link #index()}, then call {@link #close()}.
 */
public class Indexer {

    /** Directory in which the index files are stored. */
    private final String indexDir = "F:/project/Lucene/index";

    /** Directory containing the documents to be indexed. */
    private final String dataDir = "F:/project/Lucene/docs";

    /**
     * Whether to build a brand-new index ({@code true}) or to open an existing one and
     * update its documents ({@code false}).
     */
    private final boolean create = true;

    /** Storage location of the index; kept so it can be released in {@link #close()}. */
    private final Directory directory;

    /** Writer used to add or update documents in the index. */
    private final IndexWriter writer;

    /**
     * Opens the index directory and creates the {@link IndexWriter}.
     *
     * <ol>
     *   <li>Open the {@link Directory} where the index is stored.</li>
     *   <li>Create the {@link Analyzer}.</li>
     *   <li>Build an {@link IndexWriterConfig} that uses the analyzer.</li>
     *   <li>Create the {@link IndexWriter} from the directory and the config.</li>
     * </ol>
     *
     * @throws IOException if the directory or the writer cannot be opened
     */
    public Indexer() throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexWriter opened = null;
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
            iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
            iwc.setInfoStream(System.out);
            opened = new IndexWriter(dir, iwc);
        } finally {
            // Do not leak the directory when the writer could not be created.
            if (opened == null) {
                dir.close();
            }
        }
        this.directory = dir;
        this.writer = opened;
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws IOException if closing either resource fails
     */
    public void close() throws IOException {
        try {
            writer.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes every readable, non-hidden {@code .txt} file located directly in the data
     * directory (sub-directories are not traversed).
     *
     * @return the value reported by {@link IndexWriter#numDocs()} after indexing
     * @throws IOException if the data directory cannot be listed or a file cannot be read
     */
    public int index() throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IOException("Cannot list data directory: " + dataDir);
        }
        for (File f : files) {
            if (isIndexable(f)) {
                indexFile(f);
            }
        }
        return writer.numDocs();
    }

    /** Returns whether {@code f} is a readable, non-hidden, existing {@code .txt} file. */
    private boolean isIndexable(File f) {
        return !f.isDirectory()
                && !f.isHidden()
                && f.exists()
                && f.canRead()
                && f.getName().toLowerCase().endsWith(".txt");
    }

    /** Builds a document for {@code f} and adds it to, or updates it in, the index. */
    private void indexFile(File f) throws IOException {
        String canonicalPath = f.getCanonicalPath();
        System.out.println("Indexing " + canonicalPath);

        FileInputStream fis = new FileInputStream(f);
        try {
            Document doc = new Document();
            // The contents are supplied as a Reader and consumed while the document is
            // written, so the stream must stay open until addDocument/updateDocument returns.
            doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));
            doc.add(new StringField("filename", f.getName(), Field.Store.YES));
            doc.add(new StringField("fullpath", canonicalPath, Field.Store.YES));
            doc.add(new LongField("modified", f.lastModified(), Field.Store.NO));

            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                System.out.println("adding " + f);
                writer.addDocument(doc);
            } else {
                System.out.println("updating " + f);
                // The term must name a field that is actually indexed with the same value,
                // otherwise the old copy is never removed and duplicates accumulate.
                writer.updateDocument(new Term("fullpath", canonicalPath), doc);
            }
        } finally {
            fis.close();
        }
    }

    /** Indexes the data directory and prints how long it took. */
    public static void main(String[] args) throws Exception {
        Indexer indexer = null;
        int numIndexed;
        long start = System.currentTimeMillis();
        try {
            indexer = new Indexer();
            numIndexed = indexer.index();
        } finally {
            if (indexer != null) {
                indexer.close();
            }
        }
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }
}
2.搜索
package lucene.index;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Searches the index stored in the index directory.
 *
 * <ol>
 *   <li>Open an {@link IndexReader}.</li>
 *   <li>Wrap it in an {@link IndexSearcher}.</li>
 *   <li>Build a {@link Query}.</li>
 *   <li>Run the query with {@code searcher.search}.</li>
 * </ol>
 */
public class Searcher {

    /** Directory in which the index files are stored. */
    private final String indexDir = "F:/project/Lucene/index";

    /**
     * Parses {@code q} with a {@link QueryParser} whose default field is {@code where},
     * runs the query, and prints the stored {@code fullpath} and {@code filename} of up
     * to ten hits.
     *
     * <p>The query text is analyzed with a {@link StandardAnalyzer}. A
     * {@code new TermQuery(new Term(where, q))} is the alternative that skips parsing
     * and analysis and matches the exact term instead.
     *
     * @param where name of the field to search, e.g. {@code contents} or {@code filename}
     * @param q     query text
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code q} is not valid query syntax
     */
    public void search(String where, String q) throws IOException, ParseException {
        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            IndexReader reader = DirectoryReader.open(dir);
            try {
                IndexSearcher searcher = new IndexSearcher(reader);

                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
                QueryParser parser = new QueryParser(Version.LUCENE_40, where, analyzer);
                Query query = parser.parse(q);

                long start = System.currentTimeMillis();
                // TopDocs holds references to the top-N ranked hits (N = 10 here).
                TopDocs hits = searcher.search(query, 10);
                long end = System.currentTimeMillis();

                System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start)
                        + " milliseconds) that matched query '" + q + "':");
                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = searcher.doc(scoreDoc.doc);
                    System.out.println(doc.get("fullpath"));
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Release the file handles held by the reader.
                reader.close();
            }
        } finally {
            dir.close();
        }
    }

    /** Runs two sample searches: one on the file name and one on the file contents. */
    public static void main(String[] args) throws IOException, ParseException {
        Searcher searcher = new Searcher();
        searcher.search("filename", "b.txt");
        searcher.search("contents", "abc");
    }
}