开发中相关读取pdf,html,word,rtf,txt,powerpoint,excel等文档的操作

2012-10-25

开发中有关读取pdf、html、word、rtf、txt、powerpoint、excel等文档的操作。关于这七种文档，我相信应该是最常用的文档了。

开发中有关读取pdf,html,word,rtf,txt,powerpoint,excel等文档的操作

关于这七种文档，我相信应该是最常用的文档了

在以下的介绍中会提到POI，现介绍下POI吧

POI处理WORD、EXCEL比较好（项目原地址 http://jakarta.apache.org/poi/ 已迁移至 https://poi.apache.org/ ）

poi处理至少需要如下几个JAR包

开发中相关读取pdf,html,word,rtf,txt,powerpoint,excel等文档的操作

PDFBox处理PDF比较好：http://pdfbox.apache.org/download.html

下面一一介绍了

下面第一、第二部分的代码只支持03版（.doc / .xls）的Word和Excel文档；07版（.docx / .xlsx）的读取方法见其后的单独说明。

第一、首先来看WORD文档：
我这里用的是POI，相关jar包请自行下载，然后加到工程中（以下所要用到的jar包也是如此，不再重复说明）

Java代码??

开发中相关读取pdf,html,word,rtf,txt,powerpoint,excel等文档的操作

<span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readWord(String?path)?throws?Exception?{??
String?bodyText?=?null;??
try?{??
FileInputStream?is?=?new?FileInputStream(path);??
bodyText?=?new?WordExtractor(is).getText();??
}?catch?(Exception?e)?{??
System.out.println("=======");??
}??
return?bodyText;??
}??

第二、Excel的文档

Java代码??

<span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?ReadExcel(String?path)?throws?IOException?{??
InputStream?inputStream?=?null;??
String?content?=?null;??
try?{??
inputStream?=?new?FileInputStream(path);??
HSSFWorkbook?wb?=?new?HSSFWorkbook(inputStream);??
ExcelExtractor?extractor?=?new?ExcelExtractor(wb);??
extractor.setFormulasNotResults(true);??
extractor.setIncludeSheetNames(false);??
content?=?extractor.getText();??
}?catch?(FileNotFoundException?e)?{??
e.printStackTrace();??
}??
return?content;??
}??

针对07版（.docx / .xlsx）的Word和Excel的操作

???package com.test;

Java代码??

<span?style="font-size:?large;">?????
/**????
?*?需要的jar包：????
?*?poi-3.0.2-FINAL-20080204.jar????
?*?poi-contrib-3.0.2-FINAL-20080204.jar????
?*?poi-scratchpad-3.0.2-FINAL-20080204.jar????
?*?poi-3.5-beta6-20090622.jar????
?*?geronimo-stax-api_1.0_spec-1.0.jar????
?*?ooxml-schemas-1.0.jar????
?*?openxml4j-bin-beta.jar????
?*?poi-ooxml-3.5-beta6-20090622.jar????
?*?xmlbeans-2.3.0.jar????
?*?dom4j-1.6.1.jar????
?*/?????
?????
import?java.io.FileInputStream;??????
import?java.io.IOException;??????
import?java.io.InputStream;??????
?????
import?org.apache.poi.POIXMLDocument;??????
import?org.apache.poi.POIXMLTextExtractor;??????
import?org.apache.poi.hssf.usermodel.HSSFCell;??????
import?org.apache.poi.hssf.usermodel.HSSFRow;??????
import?org.apache.poi.hssf.usermodel.HSSFSheet;??????
import?org.apache.poi.hssf.usermodel.HSSFWorkbook;??????
import?org.apache.poi.hwpf.extractor.WordExtractor;??????
import?org.apache.poi.openxml4j.exceptions.OpenXML4JException;??????
import?org.apache.poi.openxml4j.opc.OPCPackage;??????
import?org.apache.poi.xssf.usermodel.XSSFCell;??????
import?org.apache.poi.xssf.usermodel.XSSFRow;??????
import?org.apache.poi.xssf.usermodel.XSSFSheet;??????
import?org.apache.poi.xssf.usermodel.XSSFWorkbook;??????
import?org.apache.poi.xwpf.extractor.XWPFWordExtractor;??????
import?org.apache.xmlbeans.XmlException;??????
?????
public?class?WordAndExcelExtractor?{??????
?public?static?void?main(String[]?args){??????
??try{??????
???String?wordFile?=?"D:/松山血战.docx";??????
???String?wordText2007?=?WordAndExcelExtractor.extractTextFromDOC2007(wordFile);??????
???System.out.println("wordText2007======="+wordText2007);??????
?????????
???InputStream?is?=?new?FileInputStream("D:/XXX研发中心技术岗位职位需求.xls");?????????
???String?excelText?=?WordAndExcelExtractor.extractTextFromXLS(is);?????????
???System.out.println("text2003=========="?+?excelText);??????
?????????
???String?excelFile?=?"D:/Hello2007.xlsx";?????????
???String?excelText2007?=?WordAndExcelExtractor.extractTextFromXLS2007(excelFile);??????
???System.out.println("excelText2007=========="?+?excelText2007);??????
?????
?????????
??}catch(Exception?e?){??????
???e.printStackTrace();??????
??}??????
?}??????
???????
?/**????
??*?@Method:?extractTextFromDOCX????
??*?@Description:?从word?2003文档中提取纯文本????
??*????
??*?@param?????
??*?@return?String????
??*?@throws????
??*/?????
????public?static?String?extractTextFromDOC(InputStream?is)?throws?IOException?{??????
????????WordExtractor?ex?=?new?WordExtractor(is);?//is是WORD文件的InputStream???????
?????
????????return?ex.getText();??????
????}??????
???????
?/**????
??*?@Method:?extractTextFromDOCX????
??*?@Description:?从word?2007文档中提取纯文本????
??*????
??*?@param?????
??*?@return?String????
??*?@throws????
??*/?????
????public?static?String?extractTextFromDOC2007(String?fileName)?throws?IOException,?OpenXML4JException,?XmlException?{??????
?????OPCPackage?opcPackage?=?POIXMLDocument.openPackage(fileName);??????
?????POIXMLTextExtractor?ex?=?new?XWPFWordExtractor(opcPackage);?????????
?????
????????return?ex.getText();??????
????}??????
???????
?/**????
??*?@Method:?extractTextFromXLS????
??*?@Description:?从excel?2003文档中提取纯文本????
??*????
??*?@param?????
??*?@return?String????
??*?@throws????
??*/?????
????@SuppressWarnings("deprecation")??????
?private?static?String?extractTextFromXLS(InputStream?is)??????
????????throws?IOException?{??????
????????StringBuffer?content??=?new?StringBuffer();??????
????????HSSFWorkbook?workbook?=?new?HSSFWorkbook(is);?//创建对Excel工作簿文件的引用???????
?????
????????for?(int?numSheets?=?0;?numSheets?<?workbook.getNumberOfSheets();?numSheets++)?{??????
????????????if?(null?!=?workbook.getSheetAt(numSheets))?{??????
????????????????HSSFSheet?aSheet?=?workbook.getSheetAt(numSheets);?//获得一个sheet??????
?????
????????????????for?(int?rowNumOfSheet?=?0;?rowNumOfSheet?<=?aSheet.getLastRowNum();?rowNumOfSheet++)?{??????
????????????????????if?(null?!=?aSheet.getRow(rowNumOfSheet))?{??????
????????????????????????HSSFRow?aRow?=?aSheet.getRow(rowNumOfSheet);?//获得一行??????
?????
????????????????????????for?(short?cellNumOfRow?=?0;?cellNumOfRow?<=?aRow.getLastCellNum();?cellNumOfRow++)?{??????
????????????????????????????if?(null?!=?aRow.getCell(cellNumOfRow))?{??????
????????????????????????????????HSSFCell?aCell?=?aRow.getCell(cellNumOfRow);?//获得列值??????
??????????????????????????????????????????????????????????????????????
????????????????????????????????if(aCell.getCellType()?==?HSSFCell.CELL_TYPE_NUMERIC){??????
?????????????????????????????????content.append(aCell.getNumericCellValue());??????
????????????????????????????????}else?if(aCell.getCellType()?==?HSSFCell.CELL_TYPE_BOOLEAN){??????
?????????????????????????????????content.append(aCell.getBooleanCellValue());??????
????????????????????????????????}else?{??????
?????????????????????????????????content.append(aCell.getStringCellValue());??????
????????????????????????????????}??????
????????????????????????????}??????
????????????????????????}??????
????????????????????}??????
????????????????}??????
????????????}??????
????????}??????
?????
????????return?content.toString();??????
????}??????
??????????
????/**????
?????*?@Method:?extractTextFromXLS2007????
?????*?@Description:?从excel?2007文档中提取纯文本????
?????*????
?????*?@param?????
?????*?@return?String????
?????*?@throws????
?????*/?????
????private?static?String?extractTextFromXLS2007(String?fileName)?throws?Exception{??????
?????StringBuffer?content?=?new?StringBuffer();??????
???????????
?????//构造?XSSFWorkbook?对象，strPath?传入文件路径??????????
??XSSFWorkbook?xwb?=?new?XSSFWorkbook(fileName);??????
????????
??//循环工作表Sheet??????
??for(int?numSheet?=?0;?numSheet?<?xwb.getNumberOfSheets();?numSheet++){??????
???XSSFSheet?xSheet?=?xwb.getSheetAt(numSheet);???????
???if(xSheet?==?null){??????
????continue;??????
???}??????
?????????
???//循环行Row??????
???for(int?rowNum?=?0;?rowNum?<=?xSheet.getLastRowNum();?rowNum++){??????
????XSSFRow?xRow?=?xSheet.getRow(rowNum);??????
????if(xRow?==?null){??????
?????continue;??????
????}??????
??????????
????//循环列Cell??????
????for(int?cellNum?=?0;?cellNum?<=?xRow.getLastCellNum();?cellNum++){??????
?????XSSFCell?xCell?=?xRow.getCell(cellNum);??????
?????if(xCell?==?null){??????
??????continue;??????
?????}??????
???????????
?????if(xCell.getCellType()?==?XSSFCell.CELL_TYPE_BOOLEAN){??????
??????content.append(xCell.getBooleanCellValue());??????
?????}else?if(xCell.getCellType()?==?XSSFCell.CELL_TYPE_NUMERIC){??????
??????content.append(xCell.getNumericCellValue());??????
?????}else{??????
??????content.append(xCell.getStringCellValue());??????
?????}??????
????}??????
???}??????
??}??????
????????
??return?content.toString();??????
????}??????
??????????
}??????
??

第三、PowerPoint的文档

Java代码??

<span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readPowerPoint(String?path)?{??
StringBuffer?content?=?new?StringBuffer("");??
try?{??
SlideShow?ss?=?new?SlideShow(new?HSLFSlideShow(new?FileInputStream(??
path)));//?is??
//?为文件的InputStream，建立SlideShow??
Slide[]?slides?=?ss.getSlides();//?获得每一张幻灯片??
for?(int?i?=?0;?i?<?slides.length;?i++)?{??
TextRun[]?t?=?slides[i].getTextRuns();//?为了取得幻灯片的文字内容，建立TextRun??
for?(int?j?=?0;?j?<?t.length;?j++)?{??
content.append(t[j].getText());//?这里会将文字内容加到content中去??
}??
}??
}?catch?(Exception?ex)?{??
System.out.println(ex.toString());??
}??
return?content.toString();??
}??

第四、PDF的文档

Java代码??

<span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readPdf(String?path)?throws?Exception?{??
StringBuffer?content?=?new?StringBuffer("");??
FileInputStream?fis?=?new?FileInputStream(path);??
PDFParser?p?=?new?PDFParser(fis);??
p.parse();??
PDFTextStripper?ts?=?new?PDFTextStripper();??
content.append(ts.getText(p.getPDDocument()));??
fis.close();??
return?content.toString().trim();??
}??

第五、HTML的文档。要说明的是，要获取HTML文档中TITLE、BODY里的文字内容，需要先读取源文件，然后用正则表达式过滤掉script、style以及其余所有标签，比较麻烦

Java代码

<span?style="font-size:?large;">public?static?String?readHtml(String?urlString)?{??
StringBuffer?content?=?new?StringBuffer("");??
File?file?=?new?File(urlString);??
FileInputStream?fis?=?null;??
try?{??
fis?=?new?FileInputStream(file);??
BufferedReader?reader?=?new?BufferedReader(new?InputStreamReader(??
fis,?"utf-8"));??
String?line?=?null;??
while?((line?=?reader.readLine())?!=?null)?{??
content.append(line?+?"\n");??
}??
reader.close();??
}?catch?(Exception?e)?{??
e.printStackTrace();??
}??
String?contentcontentString?=?content.toString();??
String?htmlStr?=?contentString;?//?含html标签的字符串??
String?textStr?=?"";??
java.util.regex.Pattern?p_script;??
java.util.regex.Matcher?m_script;??
java.util.regex.Pattern?p_style;??
java.util.regex.Matcher?m_style;??
java.util.regex.Pattern?p_html;??
java.util.regex.Matcher?m_html;??
try?{??
String?regEx_script?=?"<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\??
String?regEx_style?=?"<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*??
String?regEx_html?=?"<[^>]+>";?//?定义HTML标签的正则表达式??
p_script?=?Pattern.compile(regEx_script,?Pattern.CASE_INSENSITIVE);??
m_script?=?p_script.matcher(htmlStr);??
htmlStr?=?m_script.replaceAll("");?//?过滤script标签??
p_style?=?Pattern.compile(regEx_style,?Pattern.CASE_INSENSITIVE);??
m_style?=?p_style.matcher(htmlStr);??
htmlStr?=?m_style.replaceAll("");?//?过滤style标签??
p_html?=?Pattern.compile(regEx_html,?Pattern.CASE_INSENSITIVE);??
m_html?=?p_html.matcher(htmlStr);??
htmlStr?=?m_html.replaceAll("");?//?过滤html标签??
textStr?=?htmlStr;??
}?catch?(Exception?e)?{??
System.err.println("Html2Text:?"?+?e.getMessage());??
}??
return?textStr;//?返回文本字符串??
}??

第六、TXT的文档，给TXT文本建立索引时要注意编码问题
本项目实现了组合查询的功能
注意：如果TXT文件是GBK编码，读取时必须指定GBK，否则内容将全部乱码（其他编码的文件应指定其实际编码）：BufferedReader reader = new BufferedReader(new InputStreamReader(is, "GBK")); 具体代码如下：

Java代码??

<span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readTxt(String?path)?throws?IOException?{??
StringBuffer?sb?=?new?StringBuffer("");??
InputStream?is?=?new?FileInputStream(path);??
//?必须设置成GBK，否则将出现乱码??
BufferedReader?reader?=?new?BufferedReader(new?InputStreamReader(is,??
"GBK"));??
try?{??
String?line?=?"";??
while?((line?=?reader.readLine())?!=?null)?{??
sb.append(line?+?"\r");??
}??
}?catch?(FileNotFoundException?e)?{??
e.printStackTrace();??
}??
return?sb.toString().trim();??
}??

第七、RTF文档，RTF的读取在JDK自带的 javax.swing.text.rtf 包中就有（RTFEditorKit）

Java代码??

<span?style="font-size:?medium;"><span?style="font-size:?large;">public?static?String?readRtf(String?path)?{??
String?result?=?null;??
File?file?=?new?File(path);??
try?{??
DefaultStyledDocument?styledDoc?=?new?DefaultStyledDocument();??
InputStream?is?=?new?FileInputStream(file);??
new?RTFEditorKit().read(is,?styledDoc,?0);??
result?=?new?String(styledDoc.getText(0,?styledDoc.getLength())??
.getBytes("iso8859-1"),?"gbk");??
//?提取文本，读取中文需要使用gbk编码，否则会出现乱码??
}?catch?(IOException?e)?{??
e.printStackTrace();??
}?catch?(BadLocationException?e)?{??
e.printStackTrace();??
}??
return?result;??
}??

热点排行

PowerDesigner

开发中相关读取pdf,html,word,rtf,txt,powerpoint,excel等文档的操作