From 680e0efb9de1c9e347edf3c72a82a0b0aa9b05f2 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 26 Nov 2021 14:11:29 +0100 Subject: [PATCH 1/3] Use Array-destructuring in the `XRef.readXRefStream`-method --- src/core/xref.js | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/core/xref.js b/src/core/xref.js index 568c44cad..3f7dd61a0 100644 --- a/src/core/xref.js +++ b/src/core/xref.js @@ -289,19 +289,15 @@ class XRef { } readXRefStream(stream) { - let i, j; const streamState = this.streamState; stream.pos = streamState.streamPos; - const byteWidths = streamState.byteWidths; - const typeFieldWidth = byteWidths[0]; - const offsetFieldWidth = byteWidths[1]; - const generationFieldWidth = byteWidths[2]; + const [typeFieldWidth, offsetFieldWidth, generationFieldWidth] = + streamState.byteWidths; const entryRanges = streamState.entryRanges; while (entryRanges.length > 0) { - const first = entryRanges[0]; - const n = entryRanges[1]; + const [first, n] = entryRanges; if (!Number.isInteger(first) || !Number.isInteger(n)) { throw new FormatError(`Invalid XRef range fields: ${first}, ${n}`); @@ -315,14 +311,14 @@ class XRef { `Invalid XRef entry fields length: ${first}, ${n}` ); } - for (i = streamState.entryNum; i < n; ++i) { + for (let i = streamState.entryNum; i < n; ++i) { streamState.entryNum = i; streamState.streamPos = stream.pos; let type = 0, offset = 0, generation = 0; - for (j = 0; j < typeFieldWidth; ++j) { + for (let j = 0; j < typeFieldWidth; ++j) { const typeByte = stream.getByte(); if (typeByte === -1) { throw new FormatError("Invalid XRef byteWidths 'type'."); @@ -333,14 +329,14 @@ class XRef { if (typeFieldWidth === 0) { type = 1; } - for (j = 0; j < offsetFieldWidth; ++j) { + for (let j = 0; j < offsetFieldWidth; ++j) { const offsetByte = stream.getByte(); if (offsetByte === -1) { throw new FormatError("Invalid XRef byteWidths 'offset'."); } offset = (offset << 8) | offsetByte; } - for (j = 0; j < generationFieldWidth; ++j) { + for (let j = 0; j < generationFieldWidth; ++j) { const generationByte = stream.getByte(); if (generationByte === -1) { throw new FormatError("Invalid XRef byteWidths 'generation'."); From a669fce762818c29b78d0a3665e207bf31801d88 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 26 Nov 2021 14:11:34 +0100 Subject: [PATCH 2/3] Inline the `isDict`, `isRef`, and `isStream` checks in the `src/core/xref.js` file --- src/core/xref.js | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/src/core/xref.js b/src/core/xref.js index 3f7dd61a0..5473c335c 100644 --- a/src/core/xref.js +++ b/src/core/xref.js @@ -21,15 +21,7 @@ import { InvalidPDFException, warn, } from "../shared/util.js"; -import { - Cmd, - Dict, - isCmd, - isDict, - isRef, - isStream, - Ref, -} from "./primitives.js"; +import { Cmd, Dict, isCmd, Ref } from "./primitives.js"; import { DocStats, MissingDataException, @@ -38,6 +30,7 @@ import { XRefParseException, } from "./core_utils.js"; import { Lexer, Parser } from "./parser.js"; +import { BaseStream } from "./base_stream.js"; import { CipherTransformFactory } from "./crypto.js"; class XRef { @@ -88,7 +81,7 @@ class XRef { } warn(`XRef.parse - Invalid "Encrypt" reference: "${ex}".`); } - if (isDict(encrypt)) { + if (encrypt instanceof Dict) { const ids = trailerDict.get("ID"); const fileId = ids && ids.length ? ids[0] : ""; // The 'Encrypt' dictionary itself should not be encrypted, and by @@ -113,7 +106,7 @@ class XRef { } warn(`XRef.parse - Invalid "Root" reference: "${ex}".`); } - if (isDict(root) && root.has("Pages")) { + if (root instanceof Dict && root.has("Pages")) { this.root = root; } else { if (!recoveryMode) { @@ -155,10 +148,10 @@ class XRef { let dict = parser.getObj(); // The pdflib PDF generator can generate a nested trailer dictionary - if (!isDict(dict) && dict.dict) { + if (!(dict instanceof Dict) && dict.dict) { dict = dict.dict; } - if (!isDict(dict)) { + if (!(dict instanceof Dict)) { throw new FormatError( "Invalid XRef table: could not parse trailer dictionary" ); @@ -564,7 +557,7 @@ class XRef { } // read the trailer dictionary const dict = parser.getObj(); - if (!isDict(dict)) { + if (!(dict instanceof Dict)) { continue; } // Do some basic validation of the trailer/root dictionary candidate. @@ -656,7 +649,7 @@ class XRef { if ( !Number.isInteger(parser.getObj()) || !isCmd(parser.getObj(), "obj") || - !isStream((obj = parser.getObj())) + !((obj = parser.getObj()) instanceof BaseStream) ) { throw new FormatError("Invalid XRef stream"); } @@ -675,7 +668,7 @@ class XRef { obj = dict.get("Prev"); if (Number.isInteger(obj)) { this.startXRefQueue.push(obj); - } else if (isRef(obj)) { + } else if (obj instanceof Ref) { // The spec says Prev must not be a reference, i.e. "/Prev NNN" // This is a fallback for non-compliant PDFs, i.e. "/Prev NNN 0 R" this.startXRefQueue.push(obj.num); @@ -746,9 +739,9 @@ class XRef { } else { xrefEntry = this.fetchCompressed(ref, xrefEntry, suppressEncryption); } - if (isDict(xrefEntry)) { + if (xrefEntry instanceof Dict) { xrefEntry.objId = ref.toString(); - } else if (isStream(xrefEntry)) { + } else if (xrefEntry instanceof BaseStream) { xrefEntry.dict.objId = ref.toString(); } return xrefEntry; @@ -790,7 +783,7 @@ class XRef { } else { xrefEntry = parser.getObj(); } - if (!isStream(xrefEntry)) { + if (!(xrefEntry instanceof BaseStream)) { if ( typeof PDFJSDev === "undefined" || PDFJSDev.test("!PRODUCTION || TESTING") @@ -808,7 +801,7 @@ class XRef { fetchCompressed(ref, xrefEntry, suppressEncryption = false) { const tableOffset = xrefEntry.offset; const stream = this.fetch(Ref.get(tableOffset, 0)); - if (!isStream(stream)) { + if (!(stream instanceof BaseStream)) { throw new FormatError("bad ObjStm stream"); } const first = stream.dict.get("First"); @@ -859,7 +852,7 @@ class XRef { const obj = parser.getObj(); entries[i] = obj; - if (isStream(obj)) { + if (obj instanceof BaseStream) { continue; } const num = nums[i], From a807ffe907a417f27a26e93a0348fe6465aacad3 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 26 Nov 2021 14:11:39 +0100 Subject: [PATCH 3/3] Prevent circular references in XRef tables from hanging the worker-thread (issue 14303) *Please note:* While this patch on its own is sufficient to prevent the worker-thread from hanging, however in combination with PR 14311 these PDF documents will both load *and* render correctly. Rather than focusing on the particular structure of these PDF documents, it seemed (at least to me) to make sense to try and prevent all circular references when fetching/looking-up data using the XRef table. To avoid a solution that required tracking the references manually everywhere, the implementation settled on here instead handles that internally in the `XRef.fetch`-method. This should work, since that method *and* the `Parser`/`Lexer`-implementations are completely synchronous. Note also that the existing `XRef`-caching, used for all data-types *except* Streams, should hopefully help to lessen the performance impact of these changes. One *potential* problem with these changes could be certain *browser* exceptions, since those are generally not catchable in JavaScript code, however those would most likely "stop" worker-thread parsing anyway (at least I hope so). Finally, note that I settled on returning dummy-data rather than throwing an exception. This was done to allow parsing, for the rest of the document, to continue such that *one* bad reference doesn't prevent an entire document from loading. Fixes two of the issues listed in issue 14303, namely the `poppler-91414-0.zip-2.gz-53.pdf` and `poppler-91414-0.zip-2.gz-54.pdf` documents. --- src/core/parser.js | 2 +- src/core/primitives.js | 2 + src/core/xref.js | 26 ++++++++--- test/pdfs/.gitignore | 2 + test/pdfs/poppler-91414-0-53.pdf | 70 +++++++++++++++++++++++++++++ test/pdfs/poppler-91414-0-54.pdf | 77 ++++++++++++++++++++++++++++++++ test/unit/api_spec.js | 34 ++++++++++++++ 7 files changed, 207 insertions(+), 6 deletions(-) create mode 100644 test/pdfs/poppler-91414-0-53.pdf create mode 100644 test/pdfs/poppler-91414-0-54.pdf diff --git a/src/core/parser.js b/src/core/parser.js index 4d9a4267c..c5e141f9c 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -632,7 +632,7 @@ class Parser { // Get the length. let length = dict.get("Length"); if (!Number.isInteger(length)) { - info(`Bad length "${length}" in stream`); + info(`Bad length "${length && length.toString()}" in stream.`); length = 0; } diff --git a/src/core/primitives.js b/src/core/primitives.js index cc37a5f68..4b5c53221 100644 --- a/src/core/primitives.js +++ b/src/core/primitives.js @@ -16,6 +16,7 @@ import { assert, shadow, unreachable } from "../shared/util.js"; import { BaseStream } from "./base_stream.js"; +const CIRCULAR_REF = Symbol("CIRCULAR_REF"); const EOF = Symbol("EOF"); const Name = (function NameClosure() { @@ -422,6 +423,7 @@ function clearPrimitiveCaches() { } export { + CIRCULAR_REF, clearPrimitiveCaches, Cmd, Dict, diff --git a/src/core/xref.js b/src/core/xref.js index 5473c335c..7c2c2172a 100644 --- a/src/core/xref.js +++ b/src/core/xref.js @@ -21,7 +21,7 @@ import { InvalidPDFException, warn, } from "../shared/util.js"; -import { Cmd, Dict, isCmd, Ref } from "./primitives.js"; +import { CIRCULAR_REF, Cmd, Dict, isCmd, Ref, RefSet } from "./primitives.js"; import { DocStats, MissingDataException, @@ -40,6 +40,7 @@ class XRef { this.entries = []; this.xrefstms = Object.create(null); this._cacheMap = new Map(); // Prepare the XRef cache. + this._pendingRefs = new RefSet(); this.stats = new DocStats(pdfManager.msgHandler); this._newRefNum = null; } @@ -733,11 +734,26 @@ class XRef { this._cacheMap.set(num, xrefEntry); return xrefEntry; } + // Prevent circular references, in corrupt PDF documents, from hanging the + // worker-thread. This relies, implicitly, on the parsing being synchronous. + if (this._pendingRefs.has(ref)) { + this._pendingRefs.remove(ref); - if (xrefEntry.uncompressed) { - xrefEntry = this.fetchUncompressed(ref, xrefEntry, suppressEncryption); - } else { - xrefEntry = this.fetchCompressed(ref, xrefEntry, suppressEncryption); + warn(`Ignoring circular reference: ${ref}.`); + return CIRCULAR_REF; + } + this._pendingRefs.put(ref); + + try { + if (xrefEntry.uncompressed) { + xrefEntry = this.fetchUncompressed(ref, xrefEntry, suppressEncryption); + } else { + xrefEntry = this.fetchCompressed(ref, xrefEntry, suppressEncryption); + } + this._pendingRefs.remove(ref); + } catch (ex) { + this._pendingRefs.remove(ref); + throw ex; } if (xrefEntry instanceof Dict) { xrefEntry.objId = ref.toString(); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index c4fcf9d30..d8681a1e7 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -492,3 +492,5 @@ !xfa_issue14315.pdf !poppler-67295-0.pdf !poppler-85140-0.pdf +!poppler-91414-0-53.pdf +!poppler-91414-0-54.pdf diff --git a/test/pdfs/poppler-91414-0-53.pdf b/test/pdfs/poppler-91414-0-53.pdf new file mode 100644 index 000000000..3d9305e76 --- /dev/null +++ b/test/pdfs/poppler-91414-0-53.pdf @@ -0,0 +1,70 @@ +%PDF-1.5 +%€€€€ +1 0 obj +<< + /Type /Catalog + /Pages 2 0 R +>> +endobj +2 0 obj +<< + /Count 6 0 R + /Kids [3 0 R] + /Type /Pages +>> +endobj +3 0 obj +<< + /Resources << + /Font << + /F1 5 0 R + >> + >> + /MediaBox [0 0 795 842] + /Parent 2 0 R + /Contents 4 0 R + /Type /Page +>> +endobj +4 0 obj +<< /Length 43 >> +stream +BT 1 Tr /F1 30 Tf 350 750 Td (foobar) Tj ET +endstream +endobj +5 0 obj +<< + /Name /F1 + /BaseFont /Helvetica + /Type /Font + /Subtype /Type1 +>> +endobj +6 0 obj +<< /Length 6 0 R >> +stream +2 +endstream +endobj +7 0 obj +<<>> +endobj +xref +0 8 +0000000000 65535 f +0000000015 00000 n +0000000066 00000 n +0000000130 00000 n +0000000269 00000 n +0000000362 00000 n +0000000446 00000 n +0000000500 00000 n +trailer +<< + /Size 8 + /Root 1 0 R + /Info 7 0 R +>> +startxref +520 +%%EOF \ No newline at end of file diff --git a/test/pdfs/poppler-91414-0-54.pdf b/test/pdfs/poppler-91414-0-54.pdf new file mode 100644 index 000000000..c6ef3a691 --- /dev/null +++ b/test/pdfs/poppler-91414-0-54.pdf @@ -0,0 +1,77 @@ +%PDF-1.5 +%€€€€ +1 0 obj +<< + /Type /Catalog + /Pages 2 0 R +>> +endobj +2 0 obj +<< + /Count 6 0 R + /Kids [3 0 R] + /Type /Pages +>> +endobj +3 0 obj +<< + /Resources << + /Font << + /F1 5 0 R + >> + >> + /MediaBox [0 0 795 842] + /Parent 2 0 R + /Contents 4 0 R + /Type /Page +>> +endobj +4 0 obj +<< /Length 43 >> +stream +BT 1 Tr /F1 30 Tf 350 750 Td (foobar) Tj ET +endstream +endobj +5 0 obj +<< + /Name /F1 + /BaseFont /Helvetica + /Type /Font + /Subtype /Type1 +>> +endobj +6 0 obj +<< /Length 7 0 R >> +stream + foobar +endstream +endobj +7 0 obj +<< /Length 6 0 R >> +stream + foobar +endstream +endobj +8 0 obj +<<>> +endobj +xref +0 9 +0000000000 65535 f +0000000015 00000 n +0000000066 00000 n +0000000130 00000 n +0000000269 00000 n +0000000362 00000 n +0000000446 00000 n +0000000506 00000 n +0000000566 00000 n +trailer +<< + /Size 9 + /Root 1 0 R + /Info 8 0 R +>> +startxref +586 +%%EOF \ No newline at end of file diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index c8d98acf3..6b8c1c4f2 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -511,6 +511,40 @@ describe("api", function () { await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); }); + + it("creates pdf doc from PDF files, with circular references", async function () { + const loadingTask1 = getDocument( + buildGetDocumentParams("poppler-91414-0-53.pdf") + ); + const loadingTask2 = getDocument( + buildGetDocumentParams("poppler-91414-0-54.pdf") + ); + expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true); + expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true); + + const pdfDocument1 = await loadingTask1.promise; + const pdfDocument2 = await loadingTask2.promise; + + expect(pdfDocument1.numPages).toEqual(1); + expect(pdfDocument2.numPages).toEqual(1); + + const pageA = await pdfDocument1.getPage(1); + const pageB = await pdfDocument2.getPage(1); + + expect(pageA instanceof PDFPageProxy).toEqual(true); + expect(pageB instanceof PDFPageProxy).toEqual(true); + + for (const opList of [ + await pageA.getOperatorList(), + await pageB.getOperatorList(), + ]) { + expect(opList.fnArray.length).toBeGreaterThan(5); + expect(opList.argsArray.length).toBeGreaterThan(5); + expect(opList.lastChunk).toEqual(true); + } + + await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); + }); }); describe("PDFWorker", function () {