星期三, 3月 19, 2008

PDFBox 讀中文備忘記

這裡記錄如何用 PDFBox 讀取使用 UniCNS-UCS2-H 及 UniGB-UCS2-H 等中文編碼 (CMap) 的 PDF,
內容是從網上的資料簡單整理而成的備忘. 只適用於 PDFBox 0.7.2, 不適用於其他版本.

下載 PDFBox-0.7.2.zip
http://sourceforge.net/project/showfiles.php?group_id=78314

在 eclipse 裡建立 project PDFExtractor-0.7.2

然後下載 pdfbox-0.7.2-cjk.patch
http://sourceforge.net/tracker/index.php?func=detail&aid=1640071&group_id=78314&atid=552834

原本打算用 cygwin 的 patch 指令來套用這個 patch, 但不成功,
出現 "patch: **** malformed patch at line 9:" 的錯誤信息.

最後只好用 notepad 打開 pdfbox-0.7.2-cjk.patch, 然後用手工修改對應的檔案
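打開後如下所示:
(注意: 貼到網頁時, CodespaceRange.java 的 isInRange() 裡有兩行含 "<" 和 ">" 的程式碼被當成 HTML 標籤而損毀, 這部份請以上面連結下載的原始 patch 檔為準.)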
@@@@@@@@ pdfbox-0.7.2-cjk.patch 開始 @@@@@@@@@@
diff -NurEb PDFBox-0.7.2/src/org/pdfbox/cmaptypes/CMap.java PDFBox-0.7.2.work/src/org/pdfbox/cmaptypes/CMap.java
--- PDFBox-0.7.2/src/org/pdfbox/cmaptypes/CMap.java 2005-02-20 19:55:10.000000000 +0800
+++ PDFBox-0.7.2.work/src/org/pdfbox/cmaptypes/CMap.java 2006-09-02 11:19:06.000000000 +0800
@@ -36,6 +36,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Iterator;

/**
* This class represents a CMap file.
@@ -158,4 +159,31 @@
return codeSpaceRanges;
}

+ /**
+ * Check whether a byte array is in codespace ranges
+ */
+ public boolean isInCodeSpaceRanges(byte [] code)
+ {
+ return isInCodeSpaceRanges(code, 0, code.length);
+ }
+
+ /**
+ * Check whether a byte array is in codespace ranges
+ */
+ public boolean isInCodeSpaceRanges(byte [] code, int offset, int length)
+ {
+ Iterator it = codeSpaceRanges.iterator();
+
+ while ( it.hasNext() ) {
+ CodespaceRange range = (CodespaceRange)it.next();
+
+ if ( range != null && range.isInRange(code, offset, length) )
+ return true;
+
+ } // while
+
+ return false;
+ }
+
+
}
\ No newline at end of file
diff -NurEb PDFBox-0.7.2/src/org/pdfbox/cmaptypes/CodespaceRange.java PDFBox-0.7.2.work/src/org/pdfbox/cmaptypes/CodespaceRange.java
--- PDFBox-0.7.2/src/org/pdfbox/cmaptypes/CodespaceRange.java 2004-08-28 18:58:22.000000000 +0800
+++ PDFBox-0.7.2.work/src/org/pdfbox/cmaptypes/CodespaceRange.java 2006-09-02 12:04:22.000000000 +0800
@@ -30,6 +30,8 @@
*/
package org.pdfbox.cmaptypes;

+import org.apache.log4j.Logger;
+
/**
* This represents a single entry in the codespace range.
*
@@ -42,6 +44,8 @@
private byte[] start;
private byte[] end;

+ private static Logger log = Logger.getLogger(CodespaceRange.class);
+
/**
* Creates a new instance of CodespaceRange.
*/
@@ -85,4 +89,57 @@
start = startBytes;
}

+ /**
+ * Check whether a byte array is in this codespace range
+ */
+ public boolean isInRange(byte [] code, int offset, int length)
+ {
+ if ( log.isDebugEnabled() ) {
+
+ log.debug("code: offset = " + offset + ", length = " + length);
+
+ String s = new String();
+ for ( int i = 0; i < s = "" length = " + start.length); + for ( int i = 0; i < start.length; i++ ) + s += " s = "" length = " + end.length); + for ( int i = 0; i < end.length; i++ ) + s += "> end.length )
+ return false;
+
+ if ( end.length == length ) {
+ for ( int i = 0; i < endint =" ((int)end[i])" codeint =" ((int)code[offset" length ="=" i =" 0;" startint =" ((int)start[i])" codeint =" ((int)code[offset"> codeInt )
+ return false;
+ }
+ }
+
+ return true;
+
+ }
+
}
\ No newline at end of file
diff -NurEb PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/CJKConverter.java PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/CJKConverter.java
--- PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/CJKConverter.java 1970-01-01 08:00:00.000000000 +0800
+++ PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/CJKConverter.java 2006-09-02 12:29:16.000000000 +0800
@@ -0,0 +1,119 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.encoding.conversion;
+
+import org.pdfbox.cmaptypes.CMap;
+
+import java.io.UnsupportedEncodingException;
+
+import org.apache.log4j.Logger;
+
+/**
+ * CJKConverter converts encodings defined in CJKEncodings
+ *
+ * @auther pinxue , Holly Lee
+ */
+class CJKConverter implements EncodingConverter
+{
+ /** The encoding */
+ private String _encoding = null;
+ /** The java charset name */
+ private String _charset = null;
+
+ /** The logger */
+ private static Logger log = Logger.getLogger(CJKConverter.class);
+
+ /**
+ * Constructs a CJKConverter from a PDF encoding name
+ */
+ public CJKConverter(String encoding)
+ {
+ _encoding = encoding;
+ _charset = CJKEncodings.getCharset(encoding);
+ }
+
+ /**
+ * Convert a string. It occurs when a cmap lookup returned
+ * converted bytes successfully, but we still need to convert its
+ * encoding. The parameter s is constructs as one byte or a UTF-16BE
+ * encoded string.
+ *
+ * Note: pdfbox set string to UTF-16BE charset before calling into
+ * this.
+ */
+ public String convertString(String s)
+ {
+ if ( s.length() == 1 )
+ return s;
+
+ if ( _charset.equalsIgnoreCase("UTF-16BE") )
+ return s;
+
+ try {
+ return new String(s.getBytes("UTF-16BE"), _charset);
+ }
+ catch ( UnsupportedEncodingException uee ) {
+ return s;
+ }
+ }
+
+ /**
+ * Convert bytes to a string. We just convert bytes within
+ * coderange defined in CMap.
+ *
+ * @return Converted string.
+ */
+ public String convertBytes(byte [] c, int offset, int length, CMap cmap)
+ {
+ log.debug("offset = " + offset + ", length = " + length + ", CMap = " + cmap +
+ (cmap == null ? "" : ", codespaceRanges size = " + cmap.getCodeSpaceRanges().size()));
+
+ if ( cmap != null ) {
+
+ try {
+ if ( cmap.isInCodeSpaceRanges(c, offset, length) )
+ return new String(c, offset, length, _charset);
+ else
+ return null;
+
+ }
+ catch ( UnsupportedEncodingException uee ) {
+ return new String(c, offset, length);
+ }
+
+ }
+
+ // No cmap?
+ return null;
+ }
+
+}
+
diff -NurEb PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/CJKEncodings.java PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/CJKEncodings.java
--- PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/CJKEncodings.java 1970-01-01 08:00:00.000000000 +0800
+++ PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/CJKEncodings.java 2006-09-02 11:12:14.000000000 +0800
@@ -0,0 +1,140 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.encoding.conversion;
+
+import java.util.HashMap;
+import java.util.Iterator;
+
+/**
+ * This class represents PDF encoding name to Java charset name mapping
+ *
+ * @author Pin Xue (http://www.pinxue.net), Holly Lee (holly.lee (at) gmail.com)
+ * @version $Revision: 1.0 $
+ */
+class CJKEncodings
+{
+ // Mapping: PDF encoding name -> Java (IANA) charset name
+ private static HashMap _mapping = new HashMap();
+
+ static
+ {
+ // Chinese (Simplified)
+ _mapping.put("GB-EUC-H", "GB2312"); // Microsoft Code Page 936 (lfCharSet 0x86), GB 2312-80 character set, EUC-CN encoding
+ _mapping.put("GB-EUC-V", "GB2312"); // Vertical version of GB-EUC-H
+ _mapping.put("GBpc-EUC-H", "GB2312"); // Mac OS, GB 2312-80 character set, EUC-CN encoding, Script Manager code 19
+ _mapping.put("GBpc-EUC-V", "GB2312"); // Vertical version of GBpc-EUC-H
+ _mapping.put("GBK-EUC-H", "GBK"); // Microsoft Code Page 936 (lfCharSet 0x86), GBK character set, GBK encoding
+ _mapping.put("GBK-EUC-V", "GBK"); // Vertical version of GBK-EUC-H
+ _mapping.put("GBKp-EUC-H", "GBK"); // Same as GBK-EUC-H but replaces half-width Latin characters with proportional forms and maps character code 0x24 to a dollar sign ($) instead of a yuan symbol (¥)
+ _mapping.put("GBKp-EUC-V", "GBK"); // Vertical version of GBKp-EUC-H
+ _mapping.put("GBK2K-H", "GB18030"); // GB 18030-2000 character set, mixed 1-, 2-, and 4-byte encoding
+ _mapping.put("GBK2K-V", "GB18030"); // Vertical version of GBK2K-H
+ _mapping.put("UniGB-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-GB1 character collection
+ _mapping.put("UniGB-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniGB-UCS2-H
+ _mapping.put("UniGB-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-GB1 character collection; contains mappings for all characters in the GB18030-2000 character set
+ _mapping.put("UniGB-UTF16-V", "UTF-16BE"); // Vertical version of UniGB-UTF16-H
+
+ // Chinese (Traditional)
+ _mapping.put("B5pc-H", "BIG5"); // Mac OS, Big Five character set, Big Five encoding, Script Manager code 2
+ _mapping.put("B5pc-V", "BIG5"); // Vertical version of B5pc-H
+ _mapping.put("HKscs-B5-H", "Big5-HKSCS"); // Hong Kong SCS, an extension to the Big Five character set and encoding
+ _mapping.put("HKscs-B5-V", "Big5-HKSCS"); // Vertical version of HKscs-B5-H
+ _mapping.put("ETen-B5-H", "BIG5"); // Microsoft Code Page 950 (lfCharSet 0x88), Big Five character set with ETen extensions
+ _mapping.put("ETen-B5-V", "BIG5"); // Vertical version of ETen-B5-H
+ _mapping.put("ETenms-B5-H", "BIG5"); // Same as ETen-B5-H but replaces half-width Latin characters with proportional forms
+ _mapping.put("ETenms-B5-V", "BIG5"); // Vertical version of ETenms-B5-H
+ _mapping.put("CNS-EUC-H", "HZ"); // CNS 11643-1992 character set, EUC-TW encoding
+ _mapping.put("CNS-EUC-V", "HZ"); // Vertical version of CNS-EUC-H
+ _mapping.put("UniCNS-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-CNS1 character collection
+ _mapping.put("UniCNS-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniCNS-UCS2-H
+ _mapping.put("UniCNS-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-CNS1 character collection; contains mappings for all the characters in the HKSCS-2001 character set and contains both 2- and 4- byte character codes
+ _mapping.put("UniCNS-UTF16-V", "UTF-16BE"); // Vertical version of UniCNS-UTF16-H
+
+ //Japanese
+ _mapping.put("83pv-RKSJ-H", "JIS"); // Mac OS, JIS X 0208 character set with KanjiTalk6 extensions, Shift-JIS encoding, Script Manager code 1
+ _mapping.put("90ms-RKSJ-H", "JIS"); // Microsoft Code Page 932 (lfCharSet 0x80), JIS X 0208 character set with NEC and IBM- extensions
+ _mapping.put("90ms-RKSJ-V", "JIS"); // Vertical version of 90ms-RKSJ-H
+ _mapping.put("90msp-RKSJ-H", "JIS"); // Same as 90ms-RKSJ-H but replaces half-width Latin characters with proportional forms
+ _mapping.put("90msp-RKSJ-V", "JIS"); // Vertical version of 90msp-RKSJ-H
+ _mapping.put("90pv-RKSJ-H", "JIS"); // Mac OS, JIS X 0208 character set with KanjiTalk7 extensions, Shift-JIS encoding, Script Manager code 1
+ _mapping.put("Add-RKSJ-H", "JIS"); // JIS X 0208 character set with Fujitsu FMR extensions, Shift-JIS encoding
+ _mapping.put("Add-RKSJ-V", "JIS"); // Vertical version of Add-RKSJ-H
+ _mapping.put("EUC-H", "JIS"); // JIS X 0208 character set, EUC-JP encoding
+ _mapping.put("EUC-V", "JIS"); // Vertical version of EUC-H
+ _mapping.put("Ext-RKSJ-H", "JIS"); // JIS C 6226 (JIS78) character set with NEC extensions, Shift-JIS encoding
+ _mapping.put("Ext-RKSJ-V", "JIS"); // Vertical version of Ext-RKSJ-H
+ _mapping.put("H", "JIS"); // JIS X 0208 character set, ISO-2022-JP encoding
+ _mapping.put("V", "JIS"); // Vertical version of H
+ _mapping.put("UniJIS-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-Japan1 character collection
+ _mapping.put("UniJIS-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniJIS-UCS2-H
+ _mapping.put("UniJIS-UCS2-HW-H", "ISO-10646-UCS-2"); // Same as UniJIS-UCS2-H but replaces proportional Latin characters with half-width forms
+ _mapping.put("UniJIS-UCS2-HW-V", "ISO-10646-UCS-2"); // Vertical version of UniJIS-UCS2-HW-H
+ _mapping.put("UniJIS-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-Japan1 character collection; contains mappings for all characters in the JIS X 0213:1000 character set
+ _mapping.put("UniJIS-UTF16-V", "UTF-16BE"); // Vertical version of UniJIS-UTF16-H
+
+ //Korean
+ _mapping.put("KSC-EUC-H", "KSC"); // KS X 1001:1992 character set, EUC-KR encoding
+ _mapping.put("KSC-EUC-V", "KSC"); // Vertical version of KSC-EUC-H
+ _mapping.put("KSCms-UHC-H", "KSC"); // Microsoft Code Page 949 (lfCharSet 0x81), KS X 1001:1992 character set plus 8822 additional hangul, Unified Hangul Code (UHC) encoding
+ _mapping.put("KSCms-UHC-V", "KSC"); // Vertical version of KSCms-UHC-H
+ _mapping.put("KSCms-UHC-HW-H", "KSC"); // Same as KSCms-UHC-H but replaces proportional Latin characters with half-width forms
+ _mapping.put("KSCms-UHC-HW-V", "KSC"); // Vertical version of KSCms-UHC-HW-H
+ _mapping.put("KSCpc-EUC-H", "KSC"); // Mac OS, KS X 1001:1992 character set with Mac OS KH extensions, Script Manager Code 3
+ _mapping.put("UniKS-UCS2-H", "ISO-10646-UCS-2"); // Unicode (UCS-2) encoding for the Adobe-Korea1 character collection
+ _mapping.put("UniKS-UCS2-V", "ISO-10646-UCS-2"); // Vertical version of UniKS-UCS2-H
+ _mapping.put("UniKS-UTF16-H", "UTF-16BE"); // Unicode (UTF-16BE) encoding for the Adobe-Korea1 character collection
+ _mapping.put("UniKS-UTF16-V", "UTF-16BE"); // Vertical version of UniKS-UTF16-H
+ }
+
+
+ /**
+ * Get respective Java charset name from given PDF encoding name.
+ *
+ * @param encoding PDF encoding name
+ * @return Java charset name, or null if not found
+ */
+ public static final String getCharset( String encoding )
+ {
+ if ( encoding.startsWith("COSName"))
+ encoding = encoding.substring(8, encoding.length()-1);
+
+ return (String)(_mapping.get(encoding));
+ }
+
+ /**
+ * Return an iterator to iterate through all encodings
+ */
+ public static final Iterator getEncodingIterator()
+ {
+ return _mapping.keySet().iterator();
+ }
+
+}
diff -NurEb PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/EncodingConversionManager.java PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/EncodingConversionManager.java
--- PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/EncodingConversionManager.java 1970-01-01 08:00:00.000000000 +0800
+++ PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/EncodingConversionManager.java 2006-09-02 11:29:40.000000000 +0800
@@ -0,0 +1,76 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.encoding.conversion;
+
+import java.util.Iterator;
+import java.util.HashMap;
+
+/**
+ * EncodingConversionManager maintains relationship between PDF encoding name
+ * and respective EncodingConverter instance. Those PDF encoding name like
+ * GBK-EUC-H should be converted to java charset name before constructing a
+ * java string instance
+ */
+public class EncodingConversionManager
+{
+ /**
+ * Mapping from PDF encoding name to EncodingConverter instance
+ */
+ private static HashMap _encodingMap = new HashMap();
+
+ /**
+ * Initialize the encodingMap before anything calls us
+ */
+ static {
+
+ // Add CJK encodings to map
+ Iterator it = CJKEncodings.getEncodingIterator();
+
+ while ( it.hasNext() ) {
+ String encodingName = (String)(it.next());
+ _encodingMap.put(encodingName, new CJKConverter(encodingName));
+ }
+
+ // If there is any other encoding conversions, please add it here.
+
+ }
+
+ /**
+ * Get converter from given encoding name. If no converted defined,
+ * a null is returned
+ */
+ public static final EncodingConverter getConverter(String encoding)
+ {
+ return (EncodingConverter)(_encodingMap.get(encoding));
+ }
+
+
+}
diff -NurEb PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/EncodingConverter.java PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/EncodingConverter.java
--- PDFBox-0.7.2/src/org/pdfbox/encoding/conversion/EncodingConverter.java 1970-01-01 08:00:00.000000000 +0800
+++ PDFBox-0.7.2.work/src/org/pdfbox/encoding/conversion/EncodingConverter.java 2006-09-02 11:14:24.000000000 +0800
@@ -0,0 +1,52 @@
+/**
+ * Copyright (c) 2003-2005, www.pdfbox.org
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ * 3. Neither the name of pdfbox; nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * http://www.pdfbox.org
+ *
+ */
+package org.pdfbox.encoding.conversion;
+
+import org.pdfbox.cmaptypes.CMap;
+
+/**
+ * EncodingConverter converts string or characters in one encoding, which is specified in PDF
+ * file, to another string with respective java charset. The mapping from
+ * PDF encoding name to java charset name is maintained by EncodingConversionManager
+ */
+public interface EncodingConverter
+{
+ /**
+ * Convert a string
+ */
+ public String convertString(String s);
+
+ /**
+ * Convert bytes to a string
+ */
+ public String convertBytes(byte [] c, int offset, int length, CMap cmap);
+}
+
diff -NurEb PDFBox-0.7.2/src/org/pdfbox/pdmodel/font/PDFont.java PDFBox-0.7.2.work/src/org/pdfbox/pdmodel/font/PDFont.java
--- PDFBox-0.7.2/src/org/pdfbox/pdmodel/font/PDFont.java 2005-09-05 19:58:40.000000000 +0800
+++ PDFBox-0.7.2.work/src/org/pdfbox/pdmodel/font/PDFont.java 2006-09-02 12:20:04.000000000 +0800
@@ -43,6 +43,9 @@
import org.pdfbox.encoding.Encoding;
import org.pdfbox.encoding.EncodingManager;

+import org.pdfbox.encoding.conversion.EncodingConversionManager;
+import org.pdfbox.encoding.conversion.EncodingConverter;
+
import org.pdfbox.cos.COSArray;
import org.pdfbox.cos.COSBase;
import org.pdfbox.cos.COSDictionary;
@@ -461,6 +465,28 @@
log.debug( "cmap.lookup(" +c + ")='" +retval + "'" );
}
}
+
+ // Regardless cmap setting, we need to do encoding conversion for those
+ // encoding specified in font information, but not the encoding types
+ // defined in EncodingManager. Handle COSName type encoding only?
+ //
+ // We use an EncodingConversionManager to handle this case
+ COSBase encoding_COS = font.getDictionaryObject(COSName.ENCODING);
+
+ if ( encoding_COS instanceof COSName ) {
+ EncodingConverter converter = EncodingConversionManager.getConverter(((COSName)encoding_COS).getName());
+
+ if ( converter != null ) {
+ if ( retval != null )
+ retval = converter.convertString(retval);
+ else
+ retval = converter.convertBytes(c, offset, length, cmap);
+
+ // If we are the encoding, we handle all.
+ return retval;
+ }
+ }
+
//if we havn't found a value yet and
//we are still on the first byte and
//there is no cmap or the cmap does not have 2 byte mappings then try to encode
@@@@@@@@ pdfbox-0.7.2-cjk.patch 結束 @@@@@@@@@@

使用手工修改對應的檔案時, 行首的 "+" 表示新加的行. 這份 patch 是 unified diff 格式, 被刪除的行會以 "-" 表示 (修改會顯示成一行 "-" 加一行 "+"), 這裡沒有出現; "!" 只會在 context diff 格式裡用來表示修改的行.

修改的java檔案:
org.pdfbox.cmaptypes.CMap.java
org.pdfbox.cmaptypes.CodespaceRange.java
org.pdfbox.pdmodel.font.PDFont.java

新增的java檔案:
org.pdfbox.encoding.conversion.EncodingConverter.java
org.pdfbox.encoding.conversion.CJKConverter.java
org.pdfbox.encoding.conversion.CJKEncodings.java
org.pdfbox.encoding.conversion.EncodingConversionManager.java

用來測試讀取中文 pdf 的 java 程式如下所示:
/************ PdfExtracter 開始 *************/
package test.read.pdf;

import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.OutputStreamWriter;

import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * Small command-line check that extracts the text of a PDF with PDFBox and
 * prints it, used to verify that Chinese text comes out readable.
 */
public class PdfExtracter {

    /**
     * Charset used both to encode the stripper output into bytes and to decode
     * those bytes back into a String. It is fixed instead of relying on the
     * platform default, because a default charset that cannot represent
     * Chinese characters would silently replace them during the round trip.
     */
    private static final String TEXT_ENCODING = "UTF-8";

    public PdfExtracter() {
    }

    /**
     * Parses the given PDF file and returns all text written by
     * {@code PDFTextStripper}.
     *
     * @param filename path of the PDF file to read
     * @return the extracted text
     * @throws Exception if the file cannot be opened, parsed or stripped
     */
    public String GetTextFromPdf(String filename) throws Exception {
        FileInputStream is = new FileInputStream(filename);
        try {
            PDFParser parser = new PDFParser(is);
            parser.parse();
            PDDocument pdDocument = parser.getPDDocument();
            try {
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                OutputStreamWriter writer = new OutputStreamWriter(out, TEXT_ENCODING);
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.writeText(pdDocument, writer);
                } finally {
                    // Closing also flushes the encoder so no trailing bytes are lost.
                    writer.close();
                }
                byte[] contents = out.toByteArray();

                // Decode with the same charset the writer encoded with.
                String ts = new String(contents, TEXT_ENCODING);
                // The original listing printed a literal "n"; the backslash of the
                // newline escape had been lost.
                System.out.println("the string length is" + contents.length + "\n");
                return ts;
            } finally {
                // Release the document even when text extraction fails.
                pdDocument.close();
            }
        } finally {
            // Release the file handle even when parsing fails.
            is.close();
        }
    }

    /**
     * Extracts and prints the text of {@code test_pdf.pdf} from the working
     * directory; any failure is reported as a stack trace.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        PdfExtracter pf = new PdfExtracter();

        try {
            String ts = pf.GetTextFromPdf("test_pdf.pdf");

            System.out.println(ts);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}
/************ PdfExtracter 結束 *************/


參考資料:
http://www.jsfsoft.com:8080/beyond-pebble/pinxue/2006/08/22/1156185680727.html
http://www.pinxue.net/java/PDFBox_String_Charset_analyze.html
http://sourceforge.net/tracker/index.php?func=detail&aid=1640071&group_id=78314&atid=552834
http://www.yuanma.org/data/2006/0911/article_1511.htm