diff --git a/examples/node/domparsermock.js b/examples/node/domparsermock.js deleted file mode 100644 index 1dde248c3..000000000 --- a/examples/node/domparsermock.js +++ /dev/null @@ -1,105 +0,0 @@ -/* Any copyright is dedicated to the Public Domain. - * http://creativecommons.org/publicdomain/zero/1.0/ */ - -// Dummy XML Parser - -function DOMNodeMock(nodeName, nodeValue) { - this.nodeName = nodeName; - this.nodeValue = nodeValue; - Object.defineProperty(this, 'parentNode', {value: null, writable: true}); -} -DOMNodeMock.prototype = { - get firstChild() { - return this.childNodes[0]; - }, - get nextSibling() { - var index = this.parentNode.childNodes.indexOf(this); - return this.parentNode.childNodes[index + 1]; - }, - get textContent() { - if (!this.childNodes) { - return this.nodeValue || ''; - } - return this.childNodes.map(function (child) { - return child.textContent; - }).join(''); - }, - hasChildNodes: function () { - return this.childNodes && this.childNodes.length > 0; - } -}; - -function decodeXML(text) { - if (text.indexOf('&') < 0) { - return text; - } - return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, function (all, entityName, number) { - if (number) { - return String.fromCharCode(number[0] === 'x' ? parseInt(number.substring(1), 16) : +number); - } - switch (entityName) { - case 'amp': - return '&'; - case 'lt': - return '<'; - case 'gt': - return '>'; - case 'quot': - return '\"'; - case 'apos': - return '\''; - } - return '&' + entityName + ';'; - }); -} - -function DOMParserMock() {}; -DOMParserMock.prototype = { - parseFromString: function (content) { - content = content.replace(/<\?[\s\S]*?\?>|/g, '').trim(); - var nodes = []; - content = content.replace(/>([\s\S]+?)<'; // ignoring whitespaces - } - return '>' + i + ',<'; - }); - content = content.replace(//g, function (all, text) { - var i = nodes.length; - var node = new DOMNodeMock('#text', text); - nodes.push(node); - return i + ','; - }); - var lastLength; - do { - lastLength = nodes.length; - content = content.replace(/<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g, - function (all, name, attrs, content) { - var i = nodes.length; - var node = new DOMNodeMock(name); - var children = []; - if (content) { - content = content.split(','); - content.pop(); - content.forEach(function (child) { - var childNode = nodes[+child]; - childNode.parentNode = node; - children.push(childNode); - }) - } - node.childNodes = children; - nodes.push(node); - return i + ','; - - }); - } while(lastLength < nodes.length); - return { - documentElement: nodes.pop() - }; - } -}; - -exports.DOMParserMock = DOMParserMock; diff --git a/examples/node/getinfo.js b/examples/node/getinfo.js index 3dce2e20a..61034cfa3 100644 --- a/examples/node/getinfo.js +++ b/examples/node/getinfo.js @@ -9,9 +9,6 @@ var fs = require('fs'); -// HACK adding DOMParser to read XMP metadata. -global.DOMParser = require('./domparsermock.js').DOMParserMock; - // Run `gulp dist-install` to generate 'pdfjs-dist' npm package files. var pdfjsLib = require('pdfjs-dist'); @@ -34,7 +31,7 @@ pdfjsLib.getDocument(pdfPath).then(function (doc) { console.log(); if (data.metadata) { console.log('## Metadata'); - console.log(JSON.stringify(data.metadata.metadata, null, 2)); + console.log(JSON.stringify(data.metadata.getAll(), null, 2)); console.log(); } }); diff --git a/src/display/dom_utils.js b/src/display/dom_utils.js index 4d0116849..21cfb0425 100644 --- a/src/display/dom_utils.js +++ b/src/display/dom_utils.js @@ -131,6 +131,132 @@ class DOMSVGFactory { } } +class SimpleDOMNode { + constructor(nodeName, nodeValue) { + this.nodeName = nodeName; + this.nodeValue = nodeValue; + + Object.defineProperty(this, 'parentNode', { value: null, writable: true, }); + } + + get firstChild() { + return this.childNodes[0]; + } + + get nextSibling() { + let index = this.parentNode.childNodes.indexOf(this); + return this.parentNode.childNodes[index + 1]; + } + + get textContent() { + if (!this.childNodes) { + return this.nodeValue || ''; + } + return this.childNodes.map(function(child) { + return child.textContent; + }).join(''); + } + + hasChildNodes() { + return this.childNodes && this.childNodes.length > 0; + } +} + +class SimpleXMLParser { + parseFromString(data) { + let nodes = []; + + // Remove all comments and processing instructions. + data = data.replace(/<\?[\s\S]*?\?>|/g, '').trim(); + data = data.replace(/\[]+(\[[^\]]+)?[^>]+>/g, '').trim(); + + // Extract all text nodes and replace them with a numeric index in + // the nodes. + data = data.replace(/>([^<][\s\S]*?) { + let length = nodes.length; + let node = new SimpleDOMNode('#text', this._decodeXML(text)); + nodes.push(node); + if (node.textContent.trim().length === 0) { + return '><'; // Ignore whitespace. + } + return '>' + length + ',<'; + }); + + // Extract all CDATA nodes. + data = data.replace(//g, + function(all, text) { + let length = nodes.length; + let node = new SimpleDOMNode('#text', text); + nodes.push(node); + return length + ','; + }); + + // Until nodes without '<' and '>' content are present, replace them + // with a numeric index in the nodes. + let regex = + /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g; + let lastLength; + do { + lastLength = nodes.length; + data = data.replace(regex, function(all, name, attrs, data) { + let length = nodes.length; + let node = new SimpleDOMNode(name); + let children = []; + if (data) { + data = data.split(','); + data.pop(); + data.forEach(function(child) { + let childNode = nodes[+child]; + childNode.parentNode = node; + children.push(childNode); + }); + } + + node.childNodes = children; + nodes.push(node); + return length + ','; + }); + } while (lastLength < nodes.length); + + // We should only have one root index left, which will be last in the nodes. + return { + documentElement: nodes.pop(), + }; + } + + _decodeXML(text) { + if (text.indexOf('&') < 0) { + return text; + } + + return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi, + function(all, entityName, number) { + if (number) { + if (number[0] === 'x') { + number = parseInt(number.substring(1), 16); + } else { + number = +number; + } + return String.fromCharCode(number); + } + + switch (entityName) { + case 'amp': + return '&'; + case 'lt': + return '<'; + case 'gt': + return '>'; + case 'quot': + return '\"'; + case 'apos': + return '\''; + } + return '&' + entityName + ';'; + }); + } +} + /** * Optimised CSS custom property getter/setter. * @class @@ -353,4 +479,5 @@ export { DOMCanvasFactory, DOMCMapReaderFactory, DOMSVGFactory, + SimpleXMLParser, }; diff --git a/src/display/metadata.js b/src/display/metadata.js index 0b2fe2a20..7878ec838 100644 --- a/src/display/metadata.js +++ b/src/display/metadata.js @@ -13,18 +13,19 @@ * limitations under the License. */ +import { assert, deprecated } from '../shared/util'; +import { SimpleXMLParser } from './dom_utils'; + class Metadata { constructor(data) { - if (typeof data === 'string') { - // Ghostscript may produce invalid metadata, so try to repair that first. - data = this._repair(data); + assert(typeof data === 'string', 'Metadata: input is not a string'); - // Convert the string to a DOM `Document`. - let parser = new DOMParser(); - data = parser.parseFromString(data, 'application/xml'); - } else if (!(data instanceof Document)) { - throw new Error('Metadata: input is not a string or `Document`'); - } + // Ghostscript may produce invalid metadata, so try to repair that first. + data = this._repair(data); + + // Convert the string to a DOM `Document`. + let parser = new SimpleXMLParser(); + data = parser.parseFromString(data); this._metadata = Object.create(null); @@ -90,9 +91,18 @@ class Metadata { return this._metadata[name] || null; } + getAll() { + return this._metadata; + } + has(name) { return typeof this._metadata[name] !== 'undefined'; } + + get metadata() { + deprecated('`metadata` getter; use `getAll()` instead.'); + return this.getAll(); + } } export { diff --git a/test/unit/metadata_spec.js b/test/unit/metadata_spec.js index 548bf4318..f7fa947fa 100644 --- a/test/unit/metadata_spec.js +++ b/test/unit/metadata_spec.js @@ -16,15 +16,37 @@ import { Metadata } from '../../src/display/metadata'; describe('metadata', function() { - describe('incorrect_xmp', function() { - it('should fix the incorrect XMP data', function() { - var invalidXMP = '' + - '' + - '' + - '\\376\\377\\000P\\000D\\000F\\000&' + - ''; - var meta = new Metadata(invalidXMP); - expect(meta.get('dc:title')).toEqual('PDF&'); - }); + it('should handle valid metadata', function() { + var validData = '' + + '' + + '' + + 'Foo bar baz' + + ''; + var metadata = new Metadata(validData); + + expect(metadata.has('dc:title')).toBeTruthy(); + expect(metadata.has('dc:qux')).toBeFalsy(); + + expect(metadata.get('dc:title')).toEqual('Foo bar baz'); + expect(metadata.get('dc:qux')).toEqual(null); + + expect(metadata.getAll()).toEqual({ 'dc:title': 'Foo bar baz', }); + }); + + it('should repair and handle invalid metadata', function() { + var invalidData = '' + + '' + + '' + + '\\376\\377\\000P\\000D\\000F\\000&' + + ''; + var metadata = new Metadata(invalidData); + + expect(metadata.has('dc:title')).toBeTruthy(); + expect(metadata.has('dc:qux')).toBeFalsy(); + + expect(metadata.get('dc:title')).toEqual('PDF&'); + expect(metadata.get('dc:qux')).toEqual(null); + + expect(metadata.getAll()).toEqual({ 'dc:title': 'PDF&', }); }); });