lucene如何解析pdf文档
XPDF使用文档
XPDF版本3.02
日期2008-11-26
文档版本V1.0
1、概述
读取PDF文件中的文本内容,可以使用开源项目xpdf。下载地址:http://www.foolabs.com/xpdf/download.html。
注意使用:xpdf-3.02pl2-win32.zip以及xpdf-chinese-simplified.tar.gz(支持中文)。
2、安装
将xpdf-3.02pl2-win32.zip解压缩到D盘xpdf目录下,我们将以d:\xpdf作为xpdf的工作路径。
将xpdf-chinese-simplified.tar.gz解压缩到xpdf根目录下的xpdf-chinese-simplified目录中。
为了启用中文简体语言包,您必须将xpdf目录下的sample-xpdfrc文件另存为xpdfrc文件。
注意:此文件为配置文件,名称必须是xpdfrc。如果使用别的名字,即使在调用pdftotext.exe时传入“-cfg xpdfrc2”来指定配置文件的名字,pdftotext.exe好像也不会使用这个配置文件。所以为了减少误解,请您将配置文件直接命名为xpdfrc。
并在这个xpdfrc文件最后加上以下配置,注意Map文件的路径一定要正确。
#----- begin Chinese Simplified support package (2004-jul-27)
cidToUnicode Adobe-GB1 D:/xpdf/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode
unicodeMap ISO-2022-CN D:/xpdf/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap
unicodeMap EUC-CN D:/xpdf/xpdf-chinese-simplified/EUC-CN.unicodeMap
unicodeMap GBK D:/xpdf/xpdf-chinese-simplified/GBK.unicodeMap
cMapDir Adobe-GB1 D:/xpdf/xpdf-chinese-simplified/Cmap
toUnicodeDir D:/xpdf/xpdf-chinese-simplified/Cmap
#displayCIDFontTT Adobe-GB1 /usr/..../gkai00mp.ttf
#----- end Chinese Simplified support package
另外,配置文件中原先没有“textPageBreaks”这一控制项。为了避免输出分页符号,我们需要在xpdfrc文件的“text output control”部分下面加上这么一段话:
# If set to "yes", text extraction will insert page
# breaks (form feed characters) between pages. This
# defaults to "yes".
textPageBreaks no
#textEncoding UTF-8
textEncoding GBK
private String excuteStr = "D:\\xpdf\\xpdf-3.02pl2-win32\\pdftotext.exe";public String getContent() {String[] cmd = new String[] { excuteStr, "-enc", "UTF-8", "-q", file.getAbsolutePath(),"-" };Process p = null;BufferedInputStream bis = null ;InputStreamReader reader = null;StringBuffer sb = null;BufferedReader br = null;try {p = Runtime.getRuntime().exec(cmd);bis = new BufferedInputStream(p.getInputStream());reader = new InputStreamReader(bis, "UTF-8");sb = new StringBuffer();br = new BufferedReader(reader);String line = br.readLine();sb = new StringBuffer();while (line != null) {System.out.println(line);sb.append(line);sb.append(" ");line = br.readLine();}} catch (IOException e) {e.printStackTrace();} finally {try {br.close() ;} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}content = sb.toString() ; return content ;}
package com.cs;

/**
 * Read-only view of a document, exposing its title, full text content and a summary.
 */
public interface Parsable {

    /**
     * @return the title of the document
     */
    String getTitle();

    /**
     * @return the text content of the document
     */
    String getContent();

    /**
     * @return a summary of the document
     */
    String getSummary();
}
package com.cs;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * {@link Parsable} implementation that extracts the text of a PDF file by running
 * the external pdftotext executable and capturing its standard output.
 */
public class PdfParser implements Parsable {

    /** Executable used when the caller does not supply one; must match the local install. */
    private static final String DEFAULT_EXECUTABLE =
            "E:\\EclipseStudyWorkspace\\LuceneParse\\xpdf\\xpdf-3.02pl2-win32\\pdftotext.exe";

    /** Maximum number of characters returned by {@link #getSummary()}. */
    private static final int SUMMARY_LENGTH = 200;

    private final File file;

    /** Path of the pdftotext executable used to read the PDF. */
    private final String executeStr;

    /** Extracted text; {@code null} until {@link #getContent()} has run once. */
    private String content;

    /**
     * Creates a parser that uses the default pdftotext location.
     *
     * @param file the PDF file to read
     */
    public PdfParser(File file) {
        this(file, DEFAULT_EXECUTABLE);
    }

    /**
     * Creates a parser that uses the given pdftotext executable.
     *
     * @param file           the PDF file to read
     * @param executablePath path of the pdftotext executable
     */
    public PdfParser(File file, String executablePath) {
        this.file = file;
        this.executeStr = executablePath;
    }

    /**
     * Returns the text of the PDF, running pdftotext on the first call and reusing the
     * stored result afterwards.
     *
     * <p>The command is invoked with {@code -enc UTF-8 -q <path> -}; its output is
     * decoded as UTF-8 and each line is terminated with {@code "\n"}. An
     * {@link IOException} is printed and not rethrown, in which case the text read so
     * far (possibly the empty string) is stored and returned.
     *
     * @return the extracted text, never {@code null}
     */
    @Override
    public String getContent() {
        if (content != null) {
            return content;
        }
        String[] cmd = new String[] {executeStr, "-enc", "UTF-8", "-q", file.getAbsolutePath(), "-"};
        Process p = null;
        BufferedReader br = null;
        StringBuilder sb = new StringBuilder();
        try {
            p = Runtime.getRuntime().exec(cmd);
            br = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));
            String str;
            while ((str = br.readLine()) != null) {
                sb.append(str).append("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Release the child process and its remaining pipes; the original never did.
            if (p != null) {
                p.destroy();
            }
        }
        content = sb.toString();
        return content;
    }

    /**
     * Returns the first 200 characters of the content, or the whole content when it is
     * not longer than that.
     *
     * @return the summary text
     */
    @Override
    public String getSummary() {
        String text = getContent();
        if (text.length() > SUMMARY_LENGTH) {
            return text.substring(0, SUMMARY_LENGTH);
        }
        return text;
    }

    /**
     * @return the name of the PDF file
     */
    @Override
    public String getTitle() {
        return file.getName();
    }

    /** Demo entry point: prints the extracted content of a sample PDF. */
    public static void main(String[] args) {
        PdfParser parser =
                new PdfParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\123.pdf"));
        System.out.println("pdf content : " + parser.getContent());
    }
}