开发中相关读取pdf,html,word,rtf,txt,powerpoint,excel等文档的操作
开发中有关读取pdf、html、word、rtf、txt、powerpoint、excel等文档的操作

关于这七种文档,我相信应该是最常用的文档了。
在以下的介绍中会提到POI,这里先简单介绍一下POI。
POI处理WORD、EXCEL比较好:http://jakarta.apache.org/poi/ (项目现已迁移至 https://poi.apache.org/ )
POI处理至少需要若干JAR包,具体列表见下文07版示例代码开头的注释。

PDFbox处理PDF比较好:http://pdfbox.apache.org/download.html
下面一一介绍了
注意:第一、第二部分的代码只支持03版的word和excel文档(.doc/.xls)。
第一、首先来看WORD文档:
我这里用的是poi,相关jar包请自行下载,然后加到工程中(以下所要用到的jar包也是如此,不再重复说明)。
Java代码

- <span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readWord(String?path)?throws?Exception?{??
- String?bodyText?=?null;??
- try?{??
- FileInputStream?is?=?new?FileInputStream(path);??
- bodyText?=?new?WordExtractor(is).getText();??
- }?catch?(Exception?e)?{??
- System.out.println("=======");??
- }??
- return?bodyText;??
- }</span></span>??
?
第二、Excel的文档

Java代码

- <span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?ReadExcel(String?path)?throws?IOException?{??
- InputStream?inputStream?=?null;??
- String?content?=?null;??
- try?{??
- inputStream?=?new?FileInputStream(path);??
- HSSFWorkbook?wb?=?new?HSSFWorkbook(inputStream);??
- ExcelExtractor?extractor?=?new?ExcelExtractor(wb);??
- extractor.setFormulasNotResults(true);??
- extractor.setIncludeSheetNames(false);??
- content?=?extractor.getText();??
- }?catch?(FileNotFoundException?e)?{??
- e.printStackTrace();??
- }??
- return?content;??
- }</span></span>??
?
??
针对07版的word和excel的操作(下面的类位于 com.test 包中):
package com.test;
Java代码

- <span?style="font-size:?large;">?????
- /**????
- ?*?需要的jar包:????
- ?*?poi-3.0.2-FINAL-20080204.jar????
- ?*?poi-contrib-3.0.2-FINAL-20080204.jar????
- ?*?poi-scratchpad-3.0.2-FINAL-20080204.jar????
- ?*?poi-3.5-beta6-20090622.jar????
- ?*?geronimo-stax-api_1.0_spec-1.0.jar????
- ?*?ooxml-schemas-1.0.jar????
- ?*?openxml4j-bin-beta.jar????
- ?*?poi-ooxml-3.5-beta6-20090622.jar????
- ?*?xmlbeans-2.3.0.jar????
- ?*?dom4j-1.6.1.jar????
- ?*/?????
- ?????
- import?java.io.FileInputStream;??????
- import?java.io.IOException;??????
- import?java.io.InputStream;??????
- ?????
- import?org.apache.poi.POIXMLDocument;??????
- import?org.apache.poi.POIXMLTextExtractor;??????
- import?org.apache.poi.hssf.usermodel.HSSFCell;??????
- import?org.apache.poi.hssf.usermodel.HSSFRow;??????
- import?org.apache.poi.hssf.usermodel.HSSFSheet;??????
- import?org.apache.poi.hssf.usermodel.HSSFWorkbook;??????
- import?org.apache.poi.hwpf.extractor.WordExtractor;??????
- import?org.apache.poi.openxml4j.exceptions.OpenXML4JException;??????
- import?org.apache.poi.openxml4j.opc.OPCPackage;??????
- import?org.apache.poi.xssf.usermodel.XSSFCell;??????
- import?org.apache.poi.xssf.usermodel.XSSFRow;??????
- import?org.apache.poi.xssf.usermodel.XSSFSheet;??????
- import?org.apache.poi.xssf.usermodel.XSSFWorkbook;??????
- import?org.apache.poi.xwpf.extractor.XWPFWordExtractor;??????
- import?org.apache.xmlbeans.XmlException;??????
- ?????
- public?class?WordAndExcelExtractor?{??????
- ?public?static?void?main(String[]?args){??????
- ??try{??????
- ???String?wordFile?=?"D:/松山血战.docx";??????
- ???String?wordText2007?=?WordAndExcelExtractor.extractTextFromDOC2007(wordFile);??????
- ???System.out.println("wordText2007======="+wordText2007);??????
- ?????????
- ???InputStream?is?=?new?FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");?????????
- ???String?excelText?=?WordAndExcelExtractor.extractTextFromXLS(is);?????????
- ???System.out.println("text2003=========="?+?excelText);??????
- ?????????
- ???String?excelFile?=?"D:/Hello2007.xlsx";?????????
- ???String?excelText2007?=?WordAndExcelExtractor.extractTextFromXLS2007(excelFile);??????
- ???System.out.println("excelText2007=========="?+?excelText2007);??????
- ?????
- ?????????
- ??}catch(Exception?e?){??????
- ???e.printStackTrace();??????
- ??}??????
- ?}??????
- ???????
- ?/**????
- ??*?@Method:?extractTextFromDOCX????
- ??*?@Description:?从word?2003文档中提取纯文本????
- ??*????
- ??*?@param?????
- ??*?@return?String????
- ??*?@throws????
- ??*/?????
- ????public?static?String?extractTextFromDOC(InputStream?is)?throws?IOException?{??????
- ????????WordExtractor?ex?=?new?WordExtractor(is);?//is是WORD文件的InputStream???????
- ?????
- ????????return?ex.getText();??????
- ????}??????
- ???????
- ?/**????
- ??*?@Method:?extractTextFromDOCX????
- ??*?@Description:?从word?2007文档中提取纯文本????
- ??*????
- ??*?@param?????
- ??*?@return?String????
- ??*?@throws????
- ??*/?????
- ????public?static?String?extractTextFromDOC2007(String?fileName)?throws?IOException,?OpenXML4JException,?XmlException?{??????
- ?????OPCPackage?opcPackage?=?POIXMLDocument.openPackage(fileName);??????
- ?????POIXMLTextExtractor?ex?=?new?XWPFWordExtractor(opcPackage);?????????
- ?????
- ????????return?ex.getText();??????
- ????}??????
- ???????
- ?/**????
- ??*?@Method:?extractTextFromXLS????
- ??*?@Description:?从excel?2003文档中提取纯文本????
- ??*????
- ??*?@param?????
- ??*?@return?String????
- ??*?@throws????
- ??*/?????
- ????@SuppressWarnings("deprecation")??????
- ?private?static?String?extractTextFromXLS(InputStream?is)??????
- ????????throws?IOException?{??????
- ????????StringBuffer?content??=?new?StringBuffer();??????
- ????????HSSFWorkbook?workbook?=?new?HSSFWorkbook(is);?//创建对Excel工作簿文件的引用???????
- ?????
- ????????for?(int?numSheets?=?0;?numSheets?<?workbook.getNumberOfSheets();?numSheets++)?{??????
- ????????????if?(null?!=?workbook.getSheetAt(numSheets))?{??????
- ????????????????HSSFSheet?aSheet?=?workbook.getSheetAt(numSheets);?//获得一个sheet??????
- ?????
- ????????????????for?(int?rowNumOfSheet?=?0;?rowNumOfSheet?<=?aSheet.getLastRowNum();?rowNumOfSheet++)?{??????
- ????????????????????if?(null?!=?aSheet.getRow(rowNumOfSheet))?{??????
- ????????????????????????HSSFRow?aRow?=?aSheet.getRow(rowNumOfSheet);?//获得一行??????
- ?????
- ????????????????????????for?(short?cellNumOfRow?=?0;?cellNumOfRow?<=?aRow.getLastCellNum();?cellNumOfRow++)?{??????
- ????????????????????????????if?(null?!=?aRow.getCell(cellNumOfRow))?{??????
- ????????????????????????????????HSSFCell?aCell?=?aRow.getCell(cellNumOfRow);?//获得列值??????
- ??????????????????????????????????????????????????????????????????????
- ????????????????????????????????if(aCell.getCellType()?==?HSSFCell.CELL_TYPE_NUMERIC){??????
- ?????????????????????????????????content.append(aCell.getNumericCellValue());??????
- ????????????????????????????????}else?if(aCell.getCellType()?==?HSSFCell.CELL_TYPE_BOOLEAN){??????
- ?????????????????????????????????content.append(aCell.getBooleanCellValue());??????
- ????????????????????????????????}else?{??????
- ?????????????????????????????????content.append(aCell.getStringCellValue());??????
- ????????????????????????????????}??????
- ????????????????????????????}??????
- ????????????????????????}??????
- ????????????????????}??????
- ????????????????}??????
- ????????????}??????
- ????????}??????
- ?????
- ????????return?content.toString();??????
- ????}??????
- ??????????
- ????/**????
- ?????*?@Method:?extractTextFromXLS2007????
- ?????*?@Description:?从excel?2007文档中提取纯文本????
- ?????*????
- ?????*?@param?????
- ?????*?@return?String????
- ?????*?@throws????
- ?????*/?????
- ????private?static?String?extractTextFromXLS2007(String?fileName)?throws?Exception{??????
- ?????StringBuffer?content?=?new?StringBuffer();??????
- ???????????
- ?????//构造?XSSFWorkbook?对象,strPath?传入文件路径??????????
- ??XSSFWorkbook?xwb?=?new?XSSFWorkbook(fileName);??????
- ????????
- ??//循环工作表Sheet??????
- ??for(int?numSheet?=?0;?numSheet?<?xwb.getNumberOfSheets();?numSheet++){??????
- ???XSSFSheet?xSheet?=?xwb.getSheetAt(numSheet);???????
- ???if(xSheet?==?null){??????
- ????continue;??????
- ???}??????
- ?????????
- ???//循环行Row??????
- ???for(int?rowNum?=?0;?rowNum?<=?xSheet.getLastRowNum();?rowNum++){??????
- ????XSSFRow?xRow?=?xSheet.getRow(rowNum);??????
- ????if(xRow?==?null){??????
- ?????continue;??????
- ????}??????
- ??????????
- ????//循环列Cell??????
- ????for(int?cellNum?=?0;?cellNum?<=?xRow.getLastCellNum();?cellNum++){??????
- ?????XSSFCell?xCell?=?xRow.getCell(cellNum);??????
- ?????if(xCell?==?null){??????
- ??????continue;??????
- ?????}??????
- ???????????
- ?????if(xCell.getCellType()?==?XSSFCell.CELL_TYPE_BOOLEAN){??????
- ??????content.append(xCell.getBooleanCellValue());??????
- ?????}else?if(xCell.getCellType()?==?XSSFCell.CELL_TYPE_NUMERIC){??????
- ??????content.append(xCell.getNumericCellValue());??????
- ?????}else{??????
- ??????content.append(xCell.getStringCellValue());??????
- ?????}??????
- ????}??????
- ???}??????
- ??}??????
- ????????
- ??return?content.toString();??????
- ????}??????
- ??????????
- }??????
- </span>??
?
第三、PowerPoint的文档

Java代码

- <span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readPowerPoint(String?path)?{??
- StringBuffer?content?=?new?StringBuffer("");??
- try?{??
- SlideShow?ss?=?new?SlideShow(new?HSLFSlideShow(new?FileInputStream(??
- path)));//?is??
- //?为文件的InputStream,建立SlideShow??
- Slide[]?slides?=?ss.getSlides();//?获得每一张幻灯片??
- for?(int?i?=?0;?i?<?slides.length;?i++)?{??
- TextRun[]?t?=?slides[i].getTextRuns();//?为了取得幻灯片的文字内容,建立TextRun??
- for?(int?j?=?0;?j?<?t.length;?j++)?{??
- content.append(t[j].getText());//?这里会将文字内容加到content中去??
- }??
- }??
- }?catch?(Exception?ex)?{??
- System.out.println(ex.toString());??
- }??
- return?content.toString();??
- }</span></span>??
?
?
第四、PDF的文档

Java代码

- <span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readPdf(String?path)?throws?Exception?{??
- StringBuffer?content?=?new?StringBuffer("");??
- FileInputStream?fis?=?new?FileInputStream(path);??
- PDFParser?p?=?new?PDFParser(fis);??
- p.parse();??
- PDFTextStripper?ts?=?new?PDFTextStripper();??
- content.append(ts.getText(p.getPDDocument()));??
- fis.close();??
- return?content.toString().trim();??
- }</span></span>??
?
??
第五、HTML的文档。要说明的是,要获取HTML文档中TITLE、BODY的内容,就要先读取源文件,然后再用正则表达式把源文件中的script、style以及其它标签过滤掉,比较麻烦。

Java代码

- <span?style="font-size:?large;">public?static?String?readHtml(String?urlString)?{??
- StringBuffer?content?=?new?StringBuffer("");??
- File?file?=?new?File(urlString);??
- FileInputStream?fis?=?null;??
- try?{??
- fis?=?new?FileInputStream(file);??
- BufferedReader?reader?=?new?BufferedReader(new?InputStreamReader(??
- fis,?"utf-8"));??
- String?line?=?null;??
- while?((line?=?reader.readLine())?!=?null)?{??
- content.append(line?+?"\n");??
- }??
- reader.close();??
- }?catch?(Exception?e)?{??
- e.printStackTrace();??
- }??
- String?contentcontentString?=?content.toString();??
- String?htmlStr?=?contentString;?//?含html标签的字符串??
- String?textStr?=?"";??
- java.util.regex.Pattern?p_script;??
- java.util.regex.Matcher?m_script;??
- java.util.regex.Pattern?p_style;??
- java.util.regex.Matcher?m_style;??
- java.util.regex.Pattern?p_html;??
- java.util.regex.Matcher?m_html;??
- try?{??
- String?regEx_script?=?"<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\??
- String?regEx_style?=?"<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*??
- String?regEx_html?=?"<[^>]+>";?//?定义HTML标签的正则表达式??
- p_script?=?Pattern.compile(regEx_script,?Pattern.CASE_INSENSITIVE);??
- m_script?=?p_script.matcher(htmlStr);??
- htmlStr?=?m_script.replaceAll("");?//?过滤script标签??
- p_style?=?Pattern.compile(regEx_style,?Pattern.CASE_INSENSITIVE);??
- m_style?=?p_style.matcher(htmlStr);??
- htmlStr?=?m_style.replaceAll("");?//?过滤style标签??
- p_html?=?Pattern.compile(regEx_html,?Pattern.CASE_INSENSITIVE);??
- m_html?=?p_html.matcher(htmlStr);??
- htmlStr?=?m_html.replaceAll("");?//?过滤html标签??
- textStr?=?htmlStr;??
- }?catch?(Exception?e)?{??
- System.err.println("Html2Text:?"?+?e.getMessage());??
- }??
- return?textStr;//?返回文本字符串??
- }</span>??
?
?
第六、TXT的文档。给TXT文本建立索引时要注意编码问题
(本项目实现了组合查询的功能)
读取时如果不把编码设置为GBK,TXT内容将全部乱码(前提是文件本身是以GBK编码保存的):
BufferedReader reader = new BufferedReader(new InputStreamReader(is, "GBK"));
具体代码如下:

Java代码

- <span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readTxt(String?path)?throws?IOException?{??
- StringBuffer?sb?=?new?StringBuffer("");??
- InputStream?is?=?new?FileInputStream(path);??
- //?必须设置成GBK,否则将出现乱码??
- BufferedReader?reader?=?new?BufferedReader(new?InputStreamReader(is,??
- "GBK"));??
- try?{??
- String?line?=?"";??
- while?((line?=?reader.readLine())?!=?null)?{??
- sb.append(line?+?"\r");??
- }??
- }?catch?(FileNotFoundException?e)?{??
- e.printStackTrace();??
- }??
- return?sb.toString().trim();??
- }</span></span>??
?
??
?
第七、RTF文档。rtf的转换不需要额外的jar包,JDK自带的javax.swing.text.rtf包(RTFEditorKit)中就有

Java代码

- <span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readRtf(String?path)?{??
- String?result?=?null;??
- File?file?=?new?File(path);??
- try?{??
- DefaultStyledDocument?styledDoc?=?new?DefaultStyledDocument();??
- InputStream?is?=?new?FileInputStream(file);??
- new?RTFEditorKit().read(is,?styledDoc,?0);??
- result?=?new?String(styledDoc.getText(0,?styledDoc.getLength())??
- .getBytes("iso8859-1"),?"gbk");??
- //?提取文本,读取中文需要使用gbk编码,否则会出现乱码??
- }?catch?(IOException?e)?{??
- e.printStackTrace();??
- }?catch?(BadLocationException?e)?{??
- e.printStackTrace();??
- }??
- return?result;??
- }</span></span>??
?
?