有关pdfbox-1.3.1中Identity-H编码为乱码的解决办法

2012-12-26

有关pdfbox-1.3.1中Identity-H编码为乱码的解决方法：最近用lucene做一个搜索文档的小程序，其中索引pdf文件……

有关pdfbox-1.3.1中Identity-H编码为乱码的解决方法

最近用lucene做一个搜索文档的小程序，其中索引pdf文件时使用pdfbox1.3时出现乱码。

索引pdf的函数如下：（使用pdfbox-1.3.1.jar以及fontbox-1.3.1.jar）

package luceneTest;

import java.io.File;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

public class LucenePdf {
? public static Document getDocument(File pdf){
???????????? String pdfpath = pdf.getAbsolutePath();
???????????? PDDocument pdDocument = null;
???????????? Document document = new Document();
???????????? String title = pdf.getName();
???????????? try{

??????????????? pdDocument = PDDocument.load(pdf);
??????????????? PDFTextStripper stripper = new PDFTextStripper();
??????????????? String s1 = stripper.getText(pdDocument);
??????????????? System.out.println(s1);
???????????????? Reader contents = new StringReader(s1);
???????????????? document.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
????????? document.add(new Field("contents",contents));
????????? document.add(new Field("path", pdfpath, Field.Store.YES, Field.Index.NO));
?? pdDocument.close();
?????? }catch(Exception e){
?? e.printStackTrace();
?????? }
?????? return document;
? }
}

结果出现了乱码，调试时发现pdf文档的编码格式为Identity-H。我又用了pdfbox-1.2.1.jar来替换pdfbox-1.3.1.jar，结果该文档可以正常显示。通过比较两个不同版本中的org.apache.pdfbox.pdmodel.font.PDFont源代码，我发现1.2.1中有一段代码专门用来处理Identity-H编码，而在1.3.1中则没有。于是将这段代码放入1.3.1版本中的PDFont中。

下面是pdfbox-1.3.1.jar中的org.apache.pdfbox.pdmodel.font.PDFont中有关编码的函数

? ?private void determineEncoding() throws IOException
??? {
??????? String cmapName = null;
??????? COSName encodingName = null;
??????? COSBase toUnicode = font.getDictionaryObject( COSName.TO_UNICODE );
??????? COSBase encoding = getEncodingObject();
??????? if( toUnicode != null )
??????? {
??????????? if ( toUnicode instanceof COSStream )
??????????? {
??????????????? try {
??????????????????? parseCmap(null, ((COSStream)toUnicode).getUnfilteredStream(), null);
??????????????? }
??????????????? catch(IOException exception)
??????????????? {
??????????????????? log.error("Error: Could not load embedded CMAP" );
??????????????? }
??????????? }
??????????? else if ( toUnicode instanceof COSName)
??????????? {
??????????????? encodingName = (COSName)toUnicode;
??????????????? cmap = cmapObjects.get( encodingName.getName() );
??????????????? if (cmap == null)
??????????????? {
??????????????????? cmapName = encodingName.getName();
??????????????? }
??????????? }
??????? }
??????? if (encoding != null)
??????? {
??????????? if (encoding instanceof COSName)
??????????? {
??????????????? if (cmap == null)
??????????????? {
??????????????????? encodingName = (COSName)encoding;
??????????????????? cmap = cmapObjects.get( encodingName.getName() );
??????????????????? if (cmap == null)
??????????????????? {
??????????????????????? cmapName = encodingName.getName();

??????????????????????? //其中红色部分为我后加的为解决idefntity-H编码的代码
???????????????????????? if (encodingName.getName().equals( COSName.IDENTITY_H.getName() ))
??????????????????????????? {
??????????????????????????????? COSArray descendantFontArray =
??????????????????????????????????? (COSArray)font.getDictionaryObject( COSName.DESCENDANT_FONTS );
??????????????????????????????? if (descendantFontArray != null)
??????????????????????????????? {
??????????????????????????????????? COSDictionary descendantFontDictionary =
??????????????????????????????????????? (COSDictionary)descendantFontArray.getObject( 0 );
??????????????????????????????????? PDFont descendentFont = PDFontFactory.createFont( descendantFontDictionary );
??????????????????????????????????? COSDictionary cidsysteminfo =
??????????????????????????????????????? (COSDictionary)descendentFont.font.getDictionaryObject(COSName.CIDSYSTEMINFO);
??????????????????????????????????? if (cidsysteminfo != null)
??????????????????????????????????? {
??????????????????????????????????????? String ordering = cidsysteminfo.getString(COSName.ORDERING);
??????????????????????????????????????? String registry = cidsysteminfo.getString(COSName.REGISTRY);
??????????????????????????????????????? cmapName = registry + "-" + ordering+"-UCS2";
??????????????????????????????????? }
??????????????????????????????? }
??????????????????????????? }
??????????????????? }
??????????????? }
??????????????? if (cmap == null && cmapName != null)
??????????????? {
??????????????????? try
??????????????????? {
??????????????????????? fontEncoding =
??????????????????????????? EncodingManager.INSTANCE.getEncoding(encodingName);
??????????????????? }
??????????????????? catch(IOException exception)
??????????????????? {
??????????????????????? log.debug("Debug: Could not find encoding for " + encodingName );
??????????????????? }
??????????????? }
??????????? }
??????????? else if (encoding instanceof COSDictionary)
??????????? {
??????????????? try
??????????????? {
??????????????????? fontEncoding = new DictionaryEncoding((COSDictionary)encoding);
??????????????? }
??????????????? catch(IOException exception)
??????????????? {
??????????????????? log.error("Error: Could not create the DictionaryEncoding" );
??????????????? }
??????????? }
??????????? else if(encoding instanceof COSStream )
??????????? {
??????????????? if (cmap == null)
??????????????? {
??????????????????? COSStream encodingStream = (COSStream)encoding;
??????????????????? try
??????????????????? {
??????????????????????? parseCmap( null, encodingStream.getUnfilteredStream(), null );
??????????????????? }
??????????????????? catch(IOException exception)
??????????????????? {
??????????????????????? log.error("Error: Could not parse the embedded CMAP" );
??????????????????? }
??????????????? }
??????????? }
??????? }
??????? COSDictionary cidsysteminfo = (COSDictionary)font.getDictionaryObject(COSName.CIDSYSTEMINFO);
??????? if (cidsysteminfo != null)
??????? {
??????????? String ordering = cidsysteminfo.getString(COSName.ORDERING);
??????????? String registry = cidsysteminfo.getString(COSName.REGISTRY);
??????????? int supplement = cidsysteminfo.getInt(COSName.SUPPLEMENT);
??????????? cmapName = registry + "-" + ordering+ "-" + supplement;
??????????? cmapName = CMapSubstitution.substituteCMap( cmapName );
??????????? cmap = cmapObjects.get( cmapName );
??????? }
??????? FontMetric metric = getAFM();
??????? if( metric != null )
??????? {
??????????? fontEncoding = new AFMEncoding( metric );
??????? }
???????
??????? if (cmap == null && cmapName != null)
??????? {
??????????? String resourceName = resourceRootCMAP + cmapName;
??????????? try {
??????????????? parseCmap( resourceRootCMAP, ResourceLoader.loadResource( resourceName ), encodingName );
??????????????? if( cmap == null && encodingName == null)
??????????????? {
??????????????????? log.error("Error: Could not parse predefined CMAP file for '" + cmapName + "'" );
??????????????? }
??????????? }
??????????? catch(IOException exception)
??????????? {
??????????????? log.error("Error: Could not find predefined CMAP file for '" + cmapName + "'" );
??????????? }
??????? }
??????? getEncodingFromFont();
??? }

这样还是不行，原因是1.3.1版本jar包的cmap中没有Adobe-GB1-UCS2的转码表（即上面代码中由registry、ordering拼接"-UCS2"得到的cmap名称），于是我又将pdfbox-1.2.1.jar解压缩，将其中\org\apache\pdfbox\resources\cmap目录下的Adobe-GB1-UCS2表复制到pdfbox-1.3.1.jar中相同目录下的cmap中，这样再将修改后的pdfbox-1.3.1.jar放入工程中，Identity-H编码问题就解决了。

热点排行

PowerDesigner

有关pdfbox-1.3.1中Identity-H编码为乱码的解决办法