Lucene基础运用学习笔记

2012-10-11

Lucene基础使用学习笔记：最近学习Lucene的使用，打算做一个基于Lucene的知识管理，感觉这个东西真是太方便太好用了

Lucene基础使用学习笔记

最近学习Lucene的使用，打算做一个基于lucene的知识管理，感觉这个东西真是太方便太好用了

不过话说回来，如果数据量不是很大的话，就没必要用了 #- - ~~

这个笔记主要是摘抄自网络，记录一下我的学习过程，和大家共勉～在此感谢那些无私奉献的人们！

1、整体结构说明

索引和搜索过程图：

Lucene基础运用学习笔记

API使用示例图：

?
Lucene基础运用学习笔记

1. 索引过程：

2. 搜索过程：

/** * 对目录进行Lucene索引 * @author roy * */public class Indexer {private static String INDEX_DIR = "G:\\ROY的各种笔记\\索引";//索引结果存放目录 private static String DATA_DIR = "G:\\ROY的各种笔记";//被索引文件存放的目录/** * 测试主函数 * @param args * @throws Exception */public static void main(String[] args) throws Exception{long start = new Date().getTime(); int numIndexed = index(new File(INDEX_DIR),new File(DATA_DIR));//调用index方法 long end = new Date().getTime(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); }/** * 索引dataDir下的.txt文件，并储存在indexDir下，返回索引的文件数量 * @param indexDir * @param dataDir * @return * @throws IOException */private static int index(File indexDir, File dataDir) throws Exception{if(!dataDir.exists() || !dataDir.isDirectory()){throw new Exception("被索引的文件不存在！");}if(!indexDir.exists() || !indexDir.isDirectory()){indexDir.mkdirs();}IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),new StandardAnalyzer(Version.LUCENE_30),true,IndexWriter.MaxFieldLength.UNLIMITED);//按照目录进行递归索引indexDirectory(writer,dataDir);int numIndexed = writer.numDocs();//对索引后的结果加以优化writer.optimize();writer.close();return numIndexed;}/** * 循环遍历目录下的所有.doc文件并进行索引 * @param writer * @param dir * @throws IOException */private static void indexDirectory(IndexWriter writer, File dir) throws IOException { File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { indexDirectory(writer, f); // recurse } else if (f.getName().endsWith(".doc")) { indexFile(writer, f); } } }/** * 对单个doc文件进行索引 * @param writer * @param f * @throws IOException */private static void indexFile(IndexWriter writer, File f) throws IOException { if (f.isHidden() || !f.exists() || !f.canRead()) { return; } System.out.println("Indexing " + f.getCanonicalPath()); Document doc = new Document(); doc.add(new Field("content",new FileReader(f))); doc.add(new Field("filename",f.getName(),Field.Store.YES, Field.Index.ANALYZED)); doc.add(new 
Field("path",f.getCanonicalPath(),Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); } }?

2）查询用类：

/** * 对索引结果进行搜索 * @author roy * */public class Searcher {private static String INDEX_DIR = "G:\\ROY的各种笔记\\索引";//索引所在的路径   private static String KEYWORD = "自己";//关键词   private static int TOP_NUM = 100;//显示前100条结果/** * 测试主函数 * @param args * @throws Exception */public static void main(String[] args) throws Exception{File indexDir = new File(INDEX_DIR);if (!indexDir.exists() || !indexDir.isDirectory()){throw new Exception(indexDir + "索引目录不存在！");   }search(indexDir,KEYWORD);//调用search方法进行查询}/** * 索引查找方法 * @param indexDir索引所在的目录 * @param q查询的字符串 * @throws Exception */private static void search(File indexDir, String q) throws Exception {        IndexSearcher is = new  IndexSearcher(FSDirectory.open(indexDir),true);//read-only        String field = "content";     //创建查询解析器     QueryParser parser = new QueryParser(Version.LUCENE_30, field, new StandardAnalyzer(Version.LUCENE_30));        Query query = parser.parse(q);     //创建结果收集器     TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM ,false);//有变化的地方        long start = new Date().getTime();// start time        is.search(query, collector);        ScoreDoc[] hits = collector.topDocs().scoreDocs;             System.out.println(hits.length);        for (int i = 0; i < hits.length; i++) {            Document doc = is.doc(hits[i].doc);//new method is.doc()            System.out.println(doc.getField("filename")+"   "+hits[i].toString()+"  ");        }        long end = new Date().getTime();//end time        System.out.println("Found " + collector.getTotalHits() +                  " document(s) (in " + (end - start) +                  " milliseconds) that matched query '" +                  q + "':");      }}

3）选择分词器：

/**
 * Prints the tokens an analyzer produces for a string, one per line,
 * followed by an empty line.
 *
 * @param analyzer analyzer to inspect
 * @param s        text to tokenize
 * @throws Exception if tokenizing fails
 */
public static void showAnalyzerResult(Analyzer analyzer, String s) throws Exception {
    StringReader reader = new StringReader(s);
    // NOTE(review): the first argument of tokenStream() is a field name; the text
    // itself is passed here, as in the original -- confirm this is intended.
    TokenStream tokenStream = analyzer.tokenStream(s, reader);
    try {
        // Register the attribute once; the same instance is refreshed for every token.
        TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
        while (tokenStream.incrementToken()) {
            System.out.println(termAttribute.term());
        }
        System.out.println();
    } finally {
        // Release the stream even when tokenizing fails.
        tokenStream.close();
    }
}

4）索引doc,pdf等文件：

// Extract the plain text of the Word file f; the stream is closed even if extraction fails.
InputStream is = new FileInputStream(f);
try {
    WordExtractor wordExtractor = new WordExtractor(is);
    return wordExtractor.getText();
} finally {
    is.close();
}

2、pdf文件：

下载最新的pdf-box，把fontbox-1.1.0.jar、jempbox-1.1.0.jar、pdfbox-1.1.0.jar三个包放进项目路径中

// Extract the plain text of the PDF file f; the document and the stream are
// closed even if loading or text extraction fails.
InputStream is = new FileInputStream(f);
try {
    PDDocument pDocument = PDDocument.load(is);
    try {
        PDFTextStripper ts = new PDFTextStripper();
        StringWriter writer = new StringWriter();
        ts.writeText(pDocument, writer);
        return writer.getBuffer().toString();
    } finally {
        pDocument.close();
    }
} finally {
    is.close();
}

5、查看当前索引库的内容

// Open a writer on the index stored under indexFile. The third argument (create) is
// false, so the index already present in that directory is reused instead of rebuilt.
IndexWriter writer = new IndexWriter(FSDirectory.open(indexFile),analyzer,false,IndexWriter.MaxFieldLength.UNLIMITED);

在创建索引器的时候指定create参数为true表示新建立索引，false表示使用当前目录下已有的索引。此时为writer添加document并optimize()即可。

2、删除索引

/**
 * Temporarily deletes (marks as deleted) every document whose "path" term
 * equals targetPath.
 *
 * @param indexFile  directory that holds the index
 * @param targetPath value of the "path" term to match; the match is on the exact
 *                   indexed term, so the field must have been indexed un-analyzed
 * @throws Exception if the index cannot be opened or modified
 */
public static void deleteIndexTmp(File indexFile, String targetPath) throws Exception {
    // In Lucene 3.0 IndexReader.open(Directory) yields a read-only reader, which
    // cannot delete; request a writable reader explicitly.
    IndexReader ir = IndexReader.open(FSDirectory.open(indexFile), false);
    try {
        ir.deleteDocuments(new Term("path", targetPath));
    } finally {
        ir.close();
    }
}

/**
 * Restores all documents that were temporarily deleted.
 *
 * @param indexFile directory that holds the index
 * @throws Exception if the index cannot be opened or modified
 */
public static void rollbackDelete(File indexFile) throws Exception {
    // A writable reader is required for undeleteAll() as well.
    IndexReader ir = IndexReader.open(FSDirectory.open(indexFile), false);
    try {
        ir.undeleteAll();
    } finally {
        ir.close();
    }
}

/**
 * Physically removes the documents that are marked as deleted by optimizing the index.
 *
 * @param indexFile directory that holds the index
 * @throws Exception if the index cannot be opened or optimized
 */
public static void optimizeIndex(File indexFile) throws Exception {
    // create=false: operate on the existing index.
    IndexWriter writer = new IndexWriter(FSDirectory.open(indexFile), analyzer, false,
            IndexWriter.MaxFieldLength.UNLIMITED);
    try {
        writer.optimize();
    } finally {
        // Release the write lock even when optimize() fails.
        writer.close();
    }
}

3、更新索引：

可以先查到旧索引将其删除，然后再新建索引即可。
Lucene 3.0 的 IndexWriter 提供了 updateDocument(Term, Document) 方法，封装了上述两个操作。

6）高级查询：

// The snippets below are independent examples: they re-declare `hits` and `flags`,
// so they are not meant to be compiled together as one unit.
// NOTE(review): Hits, RangeQuery and the two-argument QueryParser constructor look
// like the pre-3.0 Lucene API -- confirm against the Lucene version in use.

/** *** One keyword, queried against one field **** */
QueryParser qp = new QueryParser("content",analyzer);
query = qp.parse(keyword);
Hits hits = searcher.search(query);

/** *** Fuzzy query **** */
Term term = new Term("content",keyword);
FuzzyQuery fq = new FuzzyQuery(term);
Hits hits = searcher.search(fq);

/** *** One keyword, queried against two fields **** */
/*
 * 1. BooleanClause.Occur has three values: MUST : + and, MUST_NOT : - not, SHOULD : or
 * 2. The query below means: content must contain the keyword, while title may or may not contain it
 * 3. In the query below the length of the Occur[] array must equal the length of the fields
 *    array; each constraint corresponds to one field
 */
BooleanClause.Occur[] flags = new BooleanClause.Occur[]{BooleanClause.Occur.SHOULD,BooleanClause.Occur.MUST};
query=MultiFieldQueryParser.parse(keyword,new String[]{"title","content"},flags,analyzer);

/** *** Two (or more) keywords queried against two (or more) fields, default matching rule **** */
/*
 * 1. The number of keywords must equal the number of fields
 * 2. No matching rule is specified, so the default is "SHOULD"; the query below therefore
 *    means: "title" contains keyword1 or "content" contains keyword2.
 *    In this example keyword1 and keyword2 are the same
 */
query=MultiFieldQueryParser.parse(new String[]{keyword,keyword},new String[]{"title","content"},analyzer);

/** ** Two (or more) keywords queried against two (or more) fields, matching rules given explicitly ****/
/*
 * 1. Required: number of keywords == number of field names == number of matching rules
 * 2. The query below means: "title" must not contain keyword1, and "content" must contain keyword2
 */
BooleanClause.Occur[] flags = new BooleanClause.Occur[]{BooleanClause.Occur.MUST_NOT,BooleanClause.Occur.MUST};
query=MultiFieldQueryParser.parse(new String[]{keyword,keyword},new String[]{"title","content"},flags,analyzer);

/** *** Querying a date field (no example code is given for this case) **** */

/** *** Querying a numeric range **** */
/*
 * 1. Both bounds must be on the same field
 * 2. The first bound must be smaller than the second, otherwise no data is found
 * 3. The third argument of new RangeQuery says whether the bounds are inclusive:
 *    true: >= or <=, false: > or <
 * 4. Finds 55>=id>=53 or 60>=id>=57:
 */
Term lowerTerm1 = new Term("id","53");
Term upperTerm1 = new Term("id","55");
RangeQuery rq1 = new RangeQuery(lowerTerm1,upperTerm1,true);
Term lowerTerm2 = new Term("id","57");
Term upperTerm2 = new Term("id","60");
RangeQuery rq2 = new RangeQuery(lowerTerm2,upperTerm2,true);
BooleanQuery bq = new BooleanQuery();
bq.add(rq1,BooleanClause.Occur.SHOULD);
bq.add(rq2,BooleanClause.Occur.SHOULD);
Hits hits = searcher.search(bq);

热点排行

软件架构设计

Lucene基础运用学习笔记