From 40b9be137f276ab54598aad8726c982ed6d58075 Mon Sep 17 00:00:00 2001 From: Nils Maier Date: Sun, 27 May 2012 22:49:28 +0200 Subject: [PATCH 1/4] Decode XML metadata as UTF-8 XML uses UTF-8 by default, which needs to be decoded to a Javascript String prior to feeding it to the DOMParser. In an ideal world, the XML would actually be analyzed and the specified charset would be used, however that does not seem feasible unless JS engines get iconv bindings. Fixes GH-1692 --- src/obj.js | 7 ++++++- src/util.js | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/obj.js b/src/obj.js index 3432ac68d..acc9e1284 100644 --- a/src/obj.js +++ b/src/obj.js @@ -140,7 +140,12 @@ var Catalog = (function CatalogClosure() { if (isName(type) && isName(subtype) && type.name === 'Metadata' && subtype.name === 'XML') { - metadata = stringToPDFString(bytesToString(stream.getBytes())); + // XXX: This should examine the charset the XML document defines, + // however since there are currently no real means to decode + // arbitrary charsets, let's just hope that the author of the PDF + // was reasonable enough to stick with the XML default charset, + // which is UTF-8. + metadata = stringToUTF8String(bytesToString(stream.getBytes())); } } diff --git a/src/util.js b/src/util.js index 90e6cee5d..fe5d895e3 100644 --- a/src/util.js +++ b/src/util.js @@ -302,6 +302,10 @@ function stringToPDFString(str) { return str2; } +function stringToUTF8String(str) { + return decodeURIComponent(escape(str)); +} + function isBool(v) { return typeof v == 'boolean'; } From 413e5357b9c948a9a90a69150b49afaf2b258c04 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Sun, 27 May 2012 18:03:04 -0500 Subject: [PATCH 2/4] Suppress metadata decryption --- src/crypto.js | 4 +++- src/obj.js | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/crypto.js b/src/crypto.js index c86551f36..cd0cf74e4 100644 --- a/src/crypto.js +++ b/src/crypto.js @@ -546,8 +546,10 @@ var CipherTransformFactory = (function CipherTransformFactoryClosure() { var userPassword = stringToBytes(dict.get('U')); var flags = dict.get('P'); var revision = dict.get('R'); - var encryptMetadata = + var encryptMetadata = algorithm == 4 && // meaningful when V is 4 dict.get('EncryptMetadata') !== false; // makes true as default value + this.encryptMetadata = encryptMetadata; + var fileIdBytes = stringToBytes(fileId); var passwordBytes; if (password) diff --git a/src/obj.js b/src/obj.js index acc9e1284..3b7eb563b 100644 --- a/src/obj.js +++ b/src/obj.js @@ -132,7 +132,14 @@ var Catalog = (function CatalogClosure() { Catalog.prototype = { get metadata() { - var stream = this.catDict.get('Metadata'); + var streamRef = this.catDict.getRaw('Metadata'); + if (!isRef(streamRef)) + return shadow(this, 'metadata', null); + + var encryptMetadata = !this.xref.encrypt ? false : + this.xref.encrypt.encryptMetadata; + + var stream = this.xref.fetch(streamRef, !encryptMetadata); var metadata; if (stream && isDict(stream.dict)) { var type = stream.dict.get('Type'); From 48811f362b1fe3035808720483507c896e39c477 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Sun, 27 May 2012 19:00:13 -0500 Subject: [PATCH 3/4] Skipping incorrectly encoded metadata --- src/obj.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/obj.js b/src/obj.js index 3b7eb563b..43ec84723 100644 --- a/src/obj.js +++ b/src/obj.js @@ -152,7 +152,11 @@ var Catalog = (function CatalogClosure() { // arbitrary charsets, let's just hope that the author of the PDF // was reasonable enough to stick with the XML default charset, // which is UTF-8. - metadata = stringToUTF8String(bytesToString(stream.getBytes())); + try { + metadata = stringToUTF8String(bytesToString(stream.getBytes())); + } catch (e) { + log('Skipping invalid metadata.'); + } } } From 1fb02300a424d79802a2b3d479bda005c6210b31 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Tue, 29 May 2012 11:01:46 -0500 Subject: [PATCH 4/4] Removing log --- src/obj.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/obj.js b/src/obj.js index 43ec84723..c01ffab58 100644 --- a/src/obj.js +++ b/src/obj.js @@ -155,7 +155,7 @@ var Catalog = (function CatalogClosure() { try { metadata = stringToUTF8String(bytesToString(stream.getBytes())); } catch (e) { - log('Skipping invalid metadata.'); + info('Skipping invalid metadata.'); } } }