diff --git a/src/core/metadata_parser.js b/src/core/metadata_parser.js new file mode 100644 index 000000000..5d60f5c8c --- /dev/null +++ b/src/core/metadata_parser.js @@ -0,0 +1,146 @@ +/* Copyright 2012 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import { SimpleXMLParser } from "../shared/xml_parser.js"; + +class MetadataParser { + constructor(data) { + // Ghostscript may produce invalid metadata, so try to repair that first. + data = this._repair(data); + + // Convert the string to an XML document. + const parser = new SimpleXMLParser({ lowerCaseName: true }); + const xmlDocument = parser.parseFromString(data); + + this._metadataMap = new Map(); + this._data = data; + + if (xmlDocument) { + this._parse(xmlDocument); + } + } + + _repair(data) { + // Start by removing any "junk" before the first tag (see issue 10395). 
+ return data + .replace(/^[^<]+/, "") + .replace(/>\\376\\377([^<]+)/g, function (all, codes) { + const bytes = codes + .replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) { + return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1); + }) + .replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) { + switch (name) { + case "amp": + return "&"; + case "apos": + return "'"; + case "gt": + return ">"; + case "lt": + return "<"; + case "quot": + return '"'; + } + throw new Error(`_repair: ${name} isn't defined.`); + }); + + const charBuf = []; + for (let i = 0, ii = bytes.length; i < ii; i += 2) { + const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1); + if ( + code >= /* Space = */ 32 && + code < /* Delete = */ 127 && + code !== /* '<' = */ 60 && + code !== /* '>' = */ 62 && + code !== /* '&' = */ 38 + ) { + charBuf.push(String.fromCharCode(code)); + } else { + charBuf.push( + "&#x" + (0x10000 + code).toString(16).substring(1) + ";" + ); + } + } + return ">" + charBuf.join(""); + }); + } + + _getSequence(entry) { + const name = entry.nodeName; + if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") { + return null; + } + return entry.childNodes.filter(node => node.nodeName === "rdf:li"); + } + + _parseArray(entry) { + if (!entry.hasChildNodes()) { + return; + } + // Child must be a Bag (unordered array) or a Seq. 
+ const [seqNode] = entry.childNodes; + const sequence = this._getSequence(seqNode) || []; + + this._metadataMap.set( + entry.nodeName, + sequence.map(node => node.textContent.trim()) + ); + } + + _parse(xmlDocument) { + let rdf = xmlDocument.documentElement; + + if (rdf.nodeName !== "rdf:rdf") { + // Wrapped in <xmpmeta> + rdf = rdf.firstChild; + while (rdf && rdf.nodeName !== "rdf:rdf") { + rdf = rdf.nextSibling; + } + } + + if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) { + return; + } + + for (const desc of rdf.childNodes) { + if (desc.nodeName !== "rdf:description") { + continue; + } + + for (const entry of desc.childNodes) { + const name = entry.nodeName; + switch (name) { + case "#text": + continue; + case "dc:creator": + case "dc:subject": + this._parseArray(entry); + continue; + } + this._metadataMap.set(name, entry.textContent.trim()); + } + } + } + + get serializable() { + return { + parsedData: this._metadataMap, + rawData: this._data, + }; + } +} + +export { MetadataParser }; diff --git a/src/core/obj.js b/src/core/obj.js index 8352c23fa..3d4226d79 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -59,6 +59,7 @@ import { Lexer, Parser } from "./parser.js"; import { CipherTransformFactory } from "./crypto.js"; import { ColorSpace } from "./colorspace.js"; import { GlobalImageCache } from "./image_utils.js"; +import { MetadataParser } from "./metadata_parser.js"; function fetchDestination(dest) { return isDict(dest) ? 
dest.get("D") : dest; @@ -131,20 +132,22 @@ class Catalog { this.xref.encrypt && this.xref.encrypt.encryptMetadata ); const stream = this.xref.fetch(streamRef, suppressEncryption); - let metadata; + let metadata = null; - if (stream && isDict(stream.dict)) { + if (isStream(stream) && isDict(stream.dict)) { const type = stream.dict.get("Type"); const subtype = stream.dict.get("Subtype"); if (isName(type, "Metadata") && isName(subtype, "XML")) { // XXX: This should examine the charset the XML document defines, - // however since there are currently no real means to decode - // arbitrary charsets, let's just hope that the author of the PDF - // was reasonable enough to stick with the XML default charset, - // which is UTF-8. + // however since there are currently no real means to decode arbitrary + // charsets, let's just hope that the author of the PDF was reasonable + // enough to stick with the XML default charset, which is UTF-8. try { - metadata = stringToUTF8String(bytesToString(stream.getBytes())); + const data = stringToUTF8String(bytesToString(stream.getBytes())); + if (data) { + metadata = new MetadataParser(data).serializable; + } } catch (e) { if (e instanceof MissingDataException) { throw e; diff --git a/src/display/metadata.js b/src/display/metadata.js index 4b6c4b74f..3c7da06f7 100644 --- a/src/display/metadata.js +++ b/src/display/metadata.js @@ -13,129 +13,12 @@ * limitations under the License. */ -import { assert, objectFromEntries } from "../shared/util.js"; -import { SimpleXMLParser } from "../shared/xml_parser.js"; +import { objectFromEntries } from "../shared/util.js"; class Metadata { - constructor(data) { - assert(typeof data === "string", "Metadata: input is not a string"); - - // Ghostscript may produce invalid metadata, so try to repair that first. - data = this._repair(data); - - // Convert the string to an XML document. 
- const parser = new SimpleXMLParser({ lowerCaseName: true }); - const xmlDocument = parser.parseFromString(data); - - this._metadataMap = new Map(); - - if (xmlDocument) { - this._parse(xmlDocument); - } - this._data = data; - } - - _repair(data) { - // Start by removing any "junk" before the first tag (see issue 10395). - return data - .replace(/^[^<]+/, "") - .replace(/>\\376\\377([^<]+)/g, function (all, codes) { - const bytes = codes - .replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) { - return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1); - }) - .replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) { - switch (name) { - case "amp": - return "&"; - case "apos": - return "'"; - case "gt": - return ">"; - case "lt": - return "<"; - case "quot": - return '"'; - } - throw new Error(`_repair: ${name} isn't defined.`); - }); - - let chars = ""; - for (let i = 0, ii = bytes.length; i < ii; i += 2) { - const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1); - if ( - code >= /* Space = */ 32 && - code < /* Delete = */ 127 && - code !== /* '<' = */ 60 && - code !== /* '>' = */ 62 && - code !== /* '&' = */ 38 - ) { - chars += String.fromCharCode(code); - } else { - chars += "&#x" + (0x10000 + code).toString(16).substring(1) + ";"; - } - } - - return ">" + chars; - }); - } - - _getSequence(entry) { - const name = entry.nodeName; - if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") { - return null; - } - - return entry.childNodes.filter(node => node.nodeName === "rdf:li"); - } - - _parseArray(entry) { - if (!entry.hasChildNodes()) { - return; - } - // Child must be a Bag (unordered array) or a Seq. 
- const [seqNode] = entry.childNodes; - const sequence = this._getSequence(seqNode) || []; - - this._metadataMap.set( - entry.nodeName, - sequence.map(node => node.textContent.trim()) - ); - } - - _parse(xmlDocument) { - let rdf = xmlDocument.documentElement; - - if (rdf.nodeName !== "rdf:rdf") { - // Wrapped in <xmpmeta> - rdf = rdf.firstChild; - while (rdf && rdf.nodeName !== "rdf:rdf") { - rdf = rdf.nextSibling; - } - } - - if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) { - return; - } - - for (const desc of rdf.childNodes) { - if (desc.nodeName !== "rdf:description") { - continue; - } - - for (const entry of desc.childNodes) { - const name = entry.nodeName; - switch (name) { - case "#text": - continue; - case "dc:creator": - case "dc:subject": - this._parseArray(entry); - continue; - } - this._metadataMap.set(name, entry.textContent.trim()); - } - } + constructor({ parsedData, rawData }) { + this._metadataMap = parsedData; + this._data = rawData; } getRaw() { diff --git a/src/shared/xml_parser.js b/src/shared/xml_parser.js index 6f3c94b80..96bf6d3df 100644 --- a/src/shared/xml_parser.js +++ b/src/shared/xml_parser.js @@ -16,7 +16,7 @@ // The code for XMLParserBase copied from // https://github.com/mozilla/shumway/blob/16451d8836fa85f4b16eeda8b4bda2fa9e2b22b0/src/avm2/natives/xml.ts -import { encodeToXmlString } from "./util.js"; +import { encodeToXmlString } from "../shared/util.js"; const XMLParserErrorCode = { NoError: 0, diff --git a/test/unit/metadata_spec.js b/test/unit/metadata_spec.js index 557e957bb..bbc8e6760 100644 --- a/test/unit/metadata_spec.js +++ b/test/unit/metadata_spec.js @@ -15,6 +15,12 @@ import { isEmptyObj } from "./test_utils.js"; import { Metadata } from "../../src/display/metadata.js"; +import { MetadataParser } from "../../src/core/metadata_parser.js"; + +function createMetadata(data) { + const metadataParser = new MetadataParser(data); + return new Metadata(metadataParser.serializable); +} describe("metadata", function () { 
it("should handle valid metadata", function () { @@ -24,7 +30,7 @@ describe("metadata", function () { "" + 'Foo bar baz' + ""; - const metadata = new Metadata(data); + const metadata = createMetadata(data); expect(metadata.has("dc:title")).toBeTruthy(); expect(metadata.has("dc:qux")).toBeFalsy(); @@ -42,7 +48,7 @@ describe("metadata", function () { "" + "\\376\\377\\000P\\000D\\000F\\000&" + ""; - const metadata = new Metadata(data); + const metadata = createMetadata(data); expect(metadata.has("dc:title")).toBeTruthy(); expect(metadata.has("dc:qux")).toBeFalsy(); @@ -85,7 +91,7 @@ describe("metadata", function () { "\\376\\377\\000O\\000D\\000I\\000S" + "" + ""; - const metadata = new Metadata(data); + const metadata = createMetadata(data); expect(metadata.has("dc:title")).toBeTruthy(); expect(metadata.has("dc:qux")).toBeFalsy(); @@ -128,7 +134,7 @@ describe("metadata", function () { "" + "" + ''; - const metadata = new Metadata(data); + const metadata = createMetadata(data); expect(isEmptyObj(metadata.getAll())).toEqual(true); }); @@ -159,7 +165,7 @@ describe("metadata", function () { '' + "application/pdf" + ''; - const metadata = new Metadata(data); + const metadata = createMetadata(data); expect(metadata.has("dc:title")).toBeTruthy(); expect(metadata.has("dc:qux")).toBeFalsy(); @@ -191,7 +197,7 @@ describe("metadata", function () { "" + ''Foo bar baz'' + ""; - const metadata = new Metadata(data); + const metadata = createMetadata(data); expect(metadata.has("dc:title")).toBeTruthy(); expect(metadata.has("dc:qux")).toBeFalsy(); @@ -220,7 +226,7 @@ describe("metadata", function () { "uuid:00000000-1c84-3cf9-89ba-bef0e729c831" + "" + ''; - const metadata = new Metadata(data); + const metadata = createMetadata(data); expect(isEmptyObj(metadata.getAll())).toEqual(true); }); @@ -249,7 +255,7 @@ describe("metadata", function () { " " + " " + ""; - const metadata = new Metadata(data); + const metadata = createMetadata(data); 
expect(metadata.has("dc:title")).toBeTruthy(); expect(metadata.has("dc:qux")).toBeFalsy(); diff --git a/web/app.js b/web/app.js index eb3c0abc6..e849e113d 100644 --- a/web/app.js +++ b/web/app.js @@ -1755,11 +1755,8 @@ const PDFViewerApplication = { `${this.pdfViewer.enableWebGL ? " [WebGL]" : ""})` ); - let pdfTitle; - const infoTitle = info?.Title; - if (infoTitle) { - pdfTitle = infoTitle; - } + let pdfTitle = info?.Title; + const metadataTitle = metadata?.get("dc:title"); if (metadataTitle) { // Ghostscript can produce invalid 'dc:title' Metadata entries: