mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-25 09:38:06 +02:00
Move the Metadata parsing to the worker-thread
The only reason, as far as I can tell, for parsing the Metadata on the main-thread is how it was originally implemented. When Metadata support was first implemented, it utilized the [`DOMParser`](https://developer.mozilla.org/en-US/docs/Web/API/DOMParser) which isn't available in workers. Today, with the custom XML-parser being used, that's no longer an issue and it seems reasonable to move the Metadata parsing to the worker-thread[1], since that's where all parsing should happen (for performance reasons). Based on these changes, we'll be able to reduce the now unnecessary duplication of the XML-parser (and related code) in both of the *built* `pdf.js`/`pdf.worker.js` files. Finally, this patch changes the `_repair` method to use "Array + join" rather than string concatenation. --- [1] This needed the previous patch, to enable sending of `Map`s between threads with workers disabled.
This commit is contained in:
parent
73bf45e64b
commit
cc3a6563ee
6 changed files with 177 additions and 142 deletions
146
src/core/metadata_parser.js
Normal file
146
src/core/metadata_parser.js
Normal file
|
@ -0,0 +1,146 @@
|
|||
/* Copyright 2012 Mozilla Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import { SimpleXMLParser } from "../shared/xml_parser.js";
|
||||
|
||||
/**
 * Parses an XML metadata string into a `Map` of entries.
 *
 * The input is first passed through `_repair`, then parsed with
 * `SimpleXMLParser` (created with `lowerCaseName: true`, which is why all node
 * names below are compared in lower case). The entries found inside the
 * `rdf:description` elements of the `rdf:rdf` element are collected, and the
 * result is exposed through the `serializable` getter.
 */
class MetadataParser {
  /**
   * @param {string} data - The raw XML metadata string.
   */
  constructor(data) {
    // Ghostscript may produce invalid metadata, so try to repair that first.
    data = this._repair(data);

    // Convert the string to an XML document.
    const parser = new SimpleXMLParser({ lowerCaseName: true });
    const xmlDocument = parser.parseFromString(data);

    this._metadataMap = new Map();
    // Note that this is the *repaired* string, not the original input.
    this._data = data;

    // A falsy document leaves the map empty.
    if (xmlDocument) {
      this._parse(xmlDocument);
    }
  }

  /**
   * Repairs known forms of broken metadata:
   *  - anything before the first `<` is removed;
   *  - text runs that start with the literal characters `\376\377` directly
   *    after a `>` are decoded: `\ddd` octal escapes and the five predefined
   *    XML entities become single characters, which are then read as pairs
   *    of (high, low) bytes forming one 16-bit code each.
   *
   * @param {string} data - The raw XML metadata string.
   * @returns {string} The repaired string.
   */
  _repair(data) {
    // Start by removing any "junk" before the first tag (see issue 10395).
    return data
      .replace(/^[^<]+/, "")
      .replace(/>\\376\\377([^<]+)/g, function (all, codes) {
        const bytes = codes
          // Turn each `\ddd` octal escape into the character with that value.
          .replace(/\\([0-3])([0-7])([0-7])/g, function (code, d1, d2, d3) {
            return String.fromCharCode(d1 * 64 + d2 * 8 + d3 * 1);
          })
          // Undo entity escaping, so that every character is one byte.
          .replace(/&(amp|apos|gt|lt|quot);/g, function (str, name) {
            switch (name) {
              case "amp":
                return "&";
              case "apos":
                return "'";
              case "gt":
                return ">";
              case "lt":
                return "<";
              case "quot":
                return '"';
            }
            throw new Error(`_repair: ${name} isn't defined.`);
          });

        const charBuf = [];
        // Only complete (high, low) byte pairs can be decoded. Stopping at
        // `length - 1` drops a dangling final byte; reading past the end
        // would give `NaN` and emit the malformed reference "&#xaN;".
        for (let i = 0, ii = bytes.length - 1; i < ii; i += 2) {
          const code = bytes.charCodeAt(i) * 256 + bytes.charCodeAt(i + 1);
          if (
            code >= /* Space = */ 32 &&
            code < /* Delete = */ 127 &&
            code !== /* '<' = */ 60 &&
            code !== /* '>' = */ 62 &&
            code !== /* '&' = */ 38
          ) {
            charBuf.push(String.fromCharCode(code));
          } else {
            // Everything else becomes a four-digit hexadecimal character
            // reference; adding 0x10000 and dropping the leading "1"
            // zero-pads the value.
            charBuf.push(`&#x${(0x10000 + code).toString(16).substring(1)};`);
          }
        }
        // Restore the ">" that was consumed by the match.
        return ">" + charBuf.join("");
      });
  }

  /**
   * @param {Object} entry - A node with `nodeName` and `childNodes`.
   * @returns {Array|null} The `rdf:li` children of `entry` when it is an
   *   `rdf:bag`, `rdf:seq` or `rdf:alt` node, and `null` otherwise.
   */
  _getSequence(entry) {
    const name = entry.nodeName;
    if (name !== "rdf:bag" && name !== "rdf:seq" && name !== "rdf:alt") {
      return null;
    }
    return entry.childNodes.filter(node => node.nodeName === "rdf:li");
  }

  /**
   * Stores, under `entry.nodeName`, the trimmed text of every `rdf:li` item
   * found in the first child of `entry`. An entry without child nodes is
   * skipped; a first child that isn't a sequence gives an empty array.
   *
   * @param {Object} entry - The node to read the list from.
   */
  _parseArray(entry) {
    if (!entry.hasChildNodes()) {
      return;
    }
    // Child must be a Bag (unordered array) or a Seq.
    const [seqNode] = entry.childNodes;
    const sequence = this._getSequence(seqNode) || [];

    this._metadataMap.set(
      entry.nodeName,
      sequence.map(node => node.textContent.trim())
    );
  }

  /**
   * Fills `_metadataMap` from the `rdf:description` children of the
   * `rdf:rdf` element. `dc:creator` and `dc:subject` are stored as arrays
   * (see `_parseArray`), all other entries as their trimmed text content.
   *
   * @param {Object} xmlDocument - An object exposing `documentElement`.
   */
  _parse(xmlDocument) {
    let rdf = xmlDocument.documentElement;

    if (rdf.nodeName !== "rdf:rdf") {
      // Wrapped in <xmpmeta>
      rdf = rdf.firstChild;
      while (rdf && rdf.nodeName !== "rdf:rdf") {
        rdf = rdf.nextSibling;
      }
    }

    if (!rdf || rdf.nodeName !== "rdf:rdf" || !rdf.hasChildNodes()) {
      return;
    }

    for (const desc of rdf.childNodes) {
      if (desc.nodeName !== "rdf:description") {
        continue;
      }

      for (const entry of desc.childNodes) {
        const name = entry.nodeName;
        switch (name) {
          case "#text":
            continue;
          case "dc:creator":
          case "dc:subject":
            this._parseArray(entry);
            continue;
        }
        this._metadataMap.set(name, entry.textContent.trim());
      }
    }
  }

  /**
   * @returns {{parsedData: Map, rawData: string}} The parsed entries together
   *   with the (repaired) metadata string.
   */
  get serializable() {
    return {
      parsedData: this._metadataMap,
      rawData: this._data,
    };
  }
}
|
||||
|
||||
export { MetadataParser };
|
Loading…
Add table
Add a link
Reference in a new issue