mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
Replace DOMParser
with SimpleXMLParser
The `DOMParser` is most likely overkill and may be less secure. Moreover, it is not supported in Node.js environments. This patch replaces the `DOMParser` with a simple XML parser. This should be faster and gives us Node.js support for free. The simple XML parser is a port of the one that existed in the examples folder with a small regex fix to make the parsing work correctly. The unit tests are extended for increased test coverage of the metadata code. The new method `getAll` is provided so the example does not have to access internal properties of the object anymore.
This commit is contained in:
parent
bc9afdf3c4
commit
d4309614f9
5 changed files with 179 additions and 128 deletions
|
@ -131,6 +131,132 @@ class DOMSVGFactory {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Minimal DOM-like node produced by `SimpleXMLParser`.
 *
 * Element nodes get a `childNodes` array assigned by the parser; text nodes
 * (`nodeName === '#text'`) carry their content in `nodeValue` and have no
 * `childNodes` property at all, so every accessor below must cope with
 * `childNodes` being absent.
 */
class SimpleDOMNode {
  /**
   * @param {string} nodeName - Tag name, or `'#text'` for text nodes.
   * @param {string} [nodeValue] - Text content; only set for text nodes.
   */
  constructor(nodeName, nodeValue) {
    this.nodeName = nodeName;
    this.nodeValue = nodeValue;

    // `Object.defineProperty` leaves `parentNode` non-enumerable, so the
    // back-reference does not show up when enumerating a node's own keys.
    Object.defineProperty(this, 'parentNode', { value: null, writable: true, });
  }

  /**
   * @returns {SimpleDOMNode|undefined} The first child, or `undefined` when
   *   the node has no children (including text nodes).
   */
  get firstChild() {
    return this.childNodes ? this.childNodes[0] : undefined;
  }

  /**
   * @returns {SimpleDOMNode|undefined} The node following this one in its
   *   parent's `childNodes`, or `undefined` when there is no parent, the
   *   node is not listed in its parent, or it is the last child.
   */
  get nextSibling() {
    const siblings = this.parentNode && this.parentNode.childNodes;
    if (!siblings) {
      return undefined;
    }
    const index = siblings.indexOf(this);
    // Without this guard an unlisted node (-1) would yield `siblings[0]`.
    return index < 0 ? undefined : siblings[index + 1];
  }

  /**
   * @returns {string} The node's own value for text nodes, otherwise the
   *   concatenated `textContent` of all descendants.
   */
  get textContent() {
    if (!this.childNodes) {
      return this.nodeValue || '';
    }
    return this.childNodes.map((child) => child.textContent).join('');
  }

  /**
   * @returns {boolean} `true` only when `childNodes` exists and is non-empty.
   */
  hasChildNodes() {
    return !!this.childNodes && this.childNodes.length > 0;
  }
}
|
||||
|
||||
/**
 * Small regex-based XML parser that builds a tree of `SimpleDOMNode`s.
 *
 * It works by repeatedly collapsing the innermost constructs of the input
 * into `"<index>,"` placeholders that refer to entries of a flat node list,
 * until no further element can be collapsed. Attributes are matched by the
 * element regex but are not stored on the resulting nodes.
 */
class SimpleXMLParser {
  /**
   * Parses an XML string.
   *
   * @param {string} data - The XML source.
   * @returns {{documentElement: (SimpleDOMNode|undefined)}} An object whose
   *   `documentElement` is the last node that was created (the root for
   *   well-formed input), or `undefined` if no node was created at all.
   */
  parseFromString(data) {
    const nodes = [];

    // Remove all comments and processing instructions.
    let xml = data.replace(/<\?[\s\S]*?\?>|<!--[\s\S]*?-->/g, '').trim();
    xml = xml.replace(/<!DOCTYPE[^>\[]+(\[[^\]]+)?[^>]+>/g, '').trim();

    // Extract all text nodes and replace them with a numeric index in
    // the nodes.
    xml = xml.replace(/>([^<][\s\S]*?)</g, (all, text) => {
      const index = nodes.length;
      const decoded = this._decodeXML(text);
      nodes.push(new SimpleDOMNode('#text', decoded));
      if (decoded.trim().length === 0) {
        return '><'; // Ignore whitespace.
      }
      return '>' + index + ',<';
    });

    // Extract all CDATA nodes; their content is taken verbatim (no entity
    // decoding).
    xml = xml.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, (all, text) => {
      const index = nodes.length;
      nodes.push(new SimpleDOMNode('#text', text));
      return index + ',';
    });

    // Collapse every element whose content is empty or consists only of
    // node indices into an index of its own, and repeat until a pass
    // creates no new node.
    const elementRegex =
      /<([\w\:]+)((?:[\s\w:=]|'[^']*'|"[^"]*")*)(?:\/>|>([\d,]*)<\/[^>]+>)/g;
    let lastLength;
    do {
      lastLength = nodes.length;
      xml = xml.replace(elementRegex, (all, name, attrs, childIndices) => {
        const index = nodes.length;
        const node = new SimpleDOMNode(name);
        const children = [];
        if (childIndices) {
          const parts = childIndices.split(',');
          parts.pop(); // The trailing ',' leaves an empty last entry.
          parts.forEach((part) => {
            const childNode = nodes[+part];
            childNode.parentNode = node;
            children.push(childNode);
          });
        }

        node.childNodes = children;
        nodes.push(node);
        return index + ',';
      });
    } while (lastLength < nodes.length);

    // We should only have one root index left, which will be last in the
    // nodes.
    return {
      documentElement: nodes.pop(),
    };
  }

  /**
   * Decodes numeric character references and the five predefined XML
   * entities in `text`.
   *
   * Unknown named entities, and numeric references outside the Unicode
   * range (above U+10FFFF), are left in the output unchanged.
   *
   * @param {string} text
   * @returns {string}
   */
  _decodeXML(text) {
    if (text.indexOf('&') < 0) {
      return text;
    }

    return text.replace(/&(#(x[0-9a-f]+|\d+)|\w+);/gi,
                        (all, entityName, number) => {
      if (number) {
        // The regex is case-insensitive, so the hex marker may be 'x' or 'X'.
        const isHex = number[0] === 'x' || number[0] === 'X';
        const codePoint = isHex ? parseInt(number.substring(1), 16)
                                : parseInt(number, 10);
        if (!(codePoint <= 0x10FFFF)) {
          return all; // Not a valid code point; keep the raw reference.
        }
        // `fromCodePoint` (unlike `fromCharCode`) handles code points
        // above U+FFFF by emitting a surrogate pair.
        return String.fromCodePoint(codePoint);
      }

      switch (entityName) {
        case 'amp':
          return '&';
        case 'lt':
          return '<';
        case 'gt':
          return '>';
        case 'quot':
          return '\"';
        case 'apos':
          return '\'';
      }
      return '&' + entityName + ';';
    });
  }
}
|
||||
|
||||
/**
|
||||
* Optimised CSS custom property getter/setter.
|
||||
* @class
|
||||
|
@ -353,4 +479,5 @@ export {
|
|||
DOMCanvasFactory,
|
||||
DOMCMapReaderFactory,
|
||||
DOMSVGFactory,
|
||||
SimpleXMLParser,
|
||||
};
|
||||
|
|
|
@ -13,18 +13,19 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import { assert, deprecated } from '../shared/util';
|
||||
import { SimpleXMLParser } from './dom_utils';
|
||||
|
||||
class Metadata {
|
||||
constructor(data) {
|
||||
if (typeof data === 'string') {
|
||||
// Ghostscript may produce invalid metadata, so try to repair that first.
|
||||
data = this._repair(data);
|
||||
assert(typeof data === 'string', 'Metadata: input is not a string');
|
||||
|
||||
// Convert the string to a DOM `Document`.
|
||||
let parser = new DOMParser();
|
||||
data = parser.parseFromString(data, 'application/xml');
|
||||
} else if (!(data instanceof Document)) {
|
||||
throw new Error('Metadata: input is not a string or `Document`');
|
||||
}
|
||||
// Ghostscript may produce invalid metadata, so try to repair that first.
|
||||
data = this._repair(data);
|
||||
|
||||
// Convert the string to a DOM `Document`.
|
||||
let parser = new SimpleXMLParser();
|
||||
data = parser.parseFromString(data);
|
||||
|
||||
this._metadata = Object.create(null);
|
||||
|
||||
|
@ -90,9 +91,18 @@ class Metadata {
|
|||
return this._metadata[name] || null;
|
||||
}
|
||||
|
||||
getAll() {
|
||||
return this._metadata;
|
||||
}
|
||||
|
||||
has(name) {
|
||||
return typeof this._metadata[name] !== 'undefined';
|
||||
}
|
||||
|
||||
get metadata() {
|
||||
deprecated('`metadata` getter; use `getAll()` instead.');
|
||||
return this.getAll();
|
||||
}
|
||||
}
|
||||
|
||||
export {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue