首页 诗词 字典 板报 句子 名言 友答 励志 学校 网站地图
当前位置: 首页 > 教程频道 > 软件管理 > 软件架构设计 >

lucene怎么解析Doc文档

2012-09-13 
摘要：Lucene 如何解析 Doc 文档——先将 poi-scratchpad-3.0.2-FINAL-20080204.jar 加入 lib 目录，再定义 Parsable 接口（package com.cs; public interface Parsable ……）及其实现类。

lucene如何解析Doc文档
将 poi-scratchpad-3.0.2-FINAL-20080204.jar 加入项目的 lib 目录下（下文使用的 WordExtractor 位于该 jar 中，它还依赖同版本的 POI 核心 jar）。

package com.cs;

/**
 * Common view of a parsed document: a title, the full text content and a
 * short summary, each exposed as a plain {@link String}.
 */
public interface Parsable {

    /**
     * @return the title of the document
     */
    String getTitle();

    /**
     * @return the text content of the document
     */
    String getContent();

    /**
     * @return a summary of the document
     */
    String getSummary();
}


package com.cs;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.hwpf.extractor.WordExtractor;

/**
 * {@link Parsable} implementation that extracts the text of a Word
 * {@code .doc} file with Apache POI's {@link WordExtractor}.
 *
 * <p>The extracted text is cached after the first successful read, so the
 * file is opened at most once per successful extraction.
 */
public class DocParser implements Parsable {

    /** Maximum number of characters returned by {@link #getSummary()}. */
    private static final int SUMMARY_LENGTH = 200;

    private File file;

    /** Cached document text; {@code null} until a read succeeds. */
    private String content;

    /**
     * @param file the Word document to parse
     */
    public DocParser(File file) {
        this.file = file;
    }

    /**
     * Returns the full text of the document, reading the file on first use.
     *
     * @return the document text, or {@code null} if the file could not be
     *         found or read (the stack trace is printed in that case)
     */
    public String getContent() {
        if (content != null) {
            return content;
        }

        InputStream is = null;
        try {
            is = new FileInputStream(file);
            // The extractor is only needed while the text is being pulled out,
            // so it is kept local instead of being stored on the instance.
            WordExtractor wordExtractor = new WordExtractor(is);
            content = wordExtractor.getText();
            return content;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Always release the file handle, including on failure paths.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return null;
    }

    /**
     * Returns the first {@value #SUMMARY_LENGTH} characters of the content,
     * or the whole content when it is shorter than that.
     *
     * @return the summary, or {@code null} if the content could not be read
     */
    public String getSummary() {
        String text = getContent();
        if (text == null) {
            // Reading failed; mirror getContent() instead of throwing a
            // NullPointerException on text.length().
            return null;
        }
        if (text.length() > SUMMARY_LENGTH) {
            return text.substring(0, SUMMARY_LENGTH);
        }
        return text;
    }

    /**
     * @return the name of the underlying file
     */
    public String getTitle() {
        return file.getName();
    }

    /** Small manual demo: prints the text of a hard-coded sample document. */
    public static void main(String[] args) {
        DocParser docParser = new DocParser(
                new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\XPDF使用文档.doc"));
        System.out.println("doc content : " + docParser.getContent());
    }
}


txt的解析
package com.cs;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * {@link Parsable} implementation for plain-text files.
 *
 * <p>The file is read line by line and cached after the first successful
 * read. Every line, including the last one, is terminated with {@code "\n"}
 * in the returned content.
 */
public class TextParser implements Parsable {

    /** Maximum number of characters returned by {@link #getSummary()}. */
    private static final int SUMMARY_LENGTH = 200;

    private File file;

    /** Cached file text; {@code null} until a read succeeds. */
    private String content;

    /**
     * @param file the text file to parse
     */
    public TextParser(File file) {
        this.file = file;
    }

    /**
     * Returns the full text of the file, reading it on first use.
     *
     * @return the file text, or {@code null} if the file could not be found
     *         or read (the stack trace is printed in that case)
     */
    public String getContent() {
        if (content != null) {
            return content;
        }

        BufferedReader br = null;
        try {
            // NOTE(review): no charset is given, so the platform default
            // encoding is used, as before. Confirm the expected encoding of
            // the indexed files before switching to an explicit charset.
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\n");
            }
            content = sb.toString();
            return content;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the reader also closes the wrapped file stream.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return null;
    }

    /**
     * Returns the first {@value #SUMMARY_LENGTH} characters of the content,
     * or the whole content when it is shorter than that.
     *
     * @return the summary, or {@code null} if the content could not be read
     */
    public String getSummary() {
        String text = getContent();
        if (text == null) {
            // Reading failed; mirror getContent() instead of throwing a
            // NullPointerException on text.length().
            return null;
        }
        if (text.length() > SUMMARY_LENGTH) {
            return text.substring(0, SUMMARY_LENGTH);
        }
        return text;
    }

    /**
     * @return the name of the underlying file
     */
    public String getTitle() {
        return file.getName();
    }

    /** Small manual demo: prints the text of a hard-coded sample file. */
    public static void main(String[] args) {
        TextParser textParser = new TextParser(
                new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\文档.txt"));
        System.out.println("text content : " + textParser.getContent());
    }
}

热点排行