From 2516ffa78e829c1cf80b564860e3b47862eb4e42 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Thu, 20 Oct 2022 16:24:17 +0200 Subject: [PATCH 1/2] Fallback to finding the first "obj" occurrence, when the trailer-dictionary is incomplete (issue 15590) Note that the "trailer"-case is already a fallback, since normally we're able to use the "xref"-operator even in corrupt documents. However, when a "trailer"-operator is found we still expect "startxref" to exist and be usable in order to advance the stream position. When that's not the case, as happens in the referenced issue, we use a simple fallback to find the first "obj" occurrence instead. This *partially* fixes issue 15590, since without this patch we fail to find any objects at all during `XRef.indexObjects`. However, note that the PDF document is still corrupt and won't render since there's no actual /Pages-dictionary and the /Root-entry simply points to the /OpenAction-dictionary instead. --- src/core/xref.js | 26 ++++++++++++++++++++++++-- test/pdfs/.gitignore | 1 + test/pdfs/issue15590.pdf | 23 +++++++++++++++++++++++ test/unit/api_spec.js | 27 +++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 test/pdfs/issue15590.pdf diff --git a/src/core/xref.js b/src/core/xref.js index e273eeeba..0f1e957ea 100644 --- a/src/core/xref.js +++ b/src/core/xref.js @@ -503,7 +503,7 @@ class XRef { // Find the next "obj" string, rather than "endobj", to ensure that // we won't skip over a new 'obj' operator in corrupt files where // 'endobj' operators are missing (fixes issue9105_reduced.pdf). - while (startPos < buffer.length) { + while (startPos < length) { const endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4; contentLength = endPos - position; @@ -545,7 +545,29 @@ class XRef { (token.length === 7 || /\s/.test(token[7])) ) { trailers.push(position); - position += skipUntil(buffer, position, startxrefBytes); + + const contentLength = skipUntil(buffer, position, startxrefBytes); + // Attempt to handle (some) corrupt documents, where no 'startxref' + // operators are present (fixes issue15590.pdf). + if (position + contentLength >= length) { + const endPos = position + skipUntil(buffer, position, objBytes) + 4; + + const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, position); + const tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); + + // Find the first "obj" occurrence after the 'trailer' operator. + const objToken = nestedObjRegExp.exec(tokenStr); + + if (objToken && objToken[1]) { + warn( + 'indexObjects: Found first "obj" after "trailer", ' + + 'caused by missing "startxref" -- trying to recover.' + ); + position = endPos - objToken[1].length; + continue; + } + } + position += contentLength; } else { position += token.length + 1; } diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 87d4c0278..c0ef6527d 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -351,6 +351,7 @@ !issue9534_reduced.pdf !attachment.pdf !basicapi.pdf +!issue15590.pdf !issue15594_reduced.pdf !issue2884_reduced.pdf !mixedfonts.pdf diff --git a/test/pdfs/issue15590.pdf b/test/pdfs/issue15590.pdf new file mode 100644 index 000000000..7af8ce482 --- /dev/null +++ b/test/pdfs/issue15590.pdf @@ -0,0 +1,23 @@ +%PDF-1.7 + +trailer +<< +/Root 1 0 R +>> + +1 0 obj +<< +/Type /Catalog +/Pages 2 0 R +/OpenAction 2 0 R +>> +endobj + +2 0 obj +<< +/S /JavaScript +/JS(func=function(){app.alert(1)};func();) +>> +endobj + +%%EOF diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index cea08e235..1cb6f10cb 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -738,6 +738,33 @@ describe("api", function () { await loadingTask.destroy(); }); + + it("creates pdf doc from PDF file, with incomplete trailer", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue15590.pdf")); + expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true); + + const pdfDocument = await loadingTask.promise; + expect(pdfDocument.numPages).toEqual(1); + + const jsActions = await pdfDocument.getJSActions(); + expect(jsActions).toEqual({ + OpenAction: ["func=function(){app.alert(1)};func();"], + }); + + try { + await pdfDocument.getPage(1); + + // Shouldn't get here. + expect(false).toEqual(true); + } catch (reason) { + expect(reason instanceof UnknownErrorException).toEqual(true); + expect(reason.message).toEqual( + "Page dictionary kids object is not an array." + ); + } + + await loadingTask.destroy(); + }); }); describe("PDFWorker", function () { From 23930a249e1a135d2cc00c489179aac293d2742a Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 21 Oct 2022 15:49:45 +0200 Subject: [PATCH 2/2] [api-minor] Let `Catalog.getAllPageDicts` return an *empty* dictionary when loading the first /Page fails (issue 15590) In order to support opening certain corrupt PDF documents, particularly hand-edited ones, this patch adds support for letting the `Catalog.getAllPageDicts` method fallback to returning an *empty* dictionary to replace (only) the first /Page of the document. Given that the viewer cannot initialize/load without access to the first page, this will thus allow e.g. document-level scripting to run as expected. Note that by effectively replacing a corrupt or missing first /Page in this way[1], we'll now render nothing but a *blank* page for certain cases of broken/corrupt PDF documents which may look weird. *Please note:* This functionality is controlled via the existing `stopAtErrors` option, that can be passed to `getDocument`, since it's easy to imagine use-cases where this sort of fallback behaviour isn't desirable. --- [1] Currently we still require that a /Pages-dictionary is found though, however it *may* be possible to relax even that assumption if that becomes absolutely necessary in future corrupt documents. --- src/core/catalog.js | 7 ++++++ test/unit/api_spec.js | 50 ++++++++++++++++++++++++++----------------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index c3911c352..2dff6d2d0 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -1191,6 +1191,8 @@ class Catalog { * @returns {Promise} */ async getAllPageDicts(recoveryMode = false) { + const { ignoreErrors } = this.pdfManager.evaluatorOptions; + const queue = [{ currentNode: this.toplevelPagesDict, posInKids: 0 }]; const visitedNodes = new RefSet(); @@ -1215,6 +1217,11 @@ class Catalog { if (error instanceof XRefEntryException && !recoveryMode) { throw error; } + if (recoveryMode && ignoreErrors && pageIndex === 0) { + // Ensure that the viewer will always load (fixes issue15590.pdf). + warn(`getAllPageDicts - Skipping invalid first page: "${error}".`); + error = Dict.empty; + } map.set(pageIndex++, [error, null]); } diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 1cb6f10cb..16c12be66 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -602,27 +602,42 @@ describe("api", function () { const loadingTask2 = getDocument( buildGetDocumentParams("poppler-85140-0.pdf") ); + const loadingTask3 = getDocument( + buildGetDocumentParams("poppler-85140-0.pdf", { stopAtErrors: true }) + ); expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true); expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true); + expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true); const pdfDocument1 = await loadingTask1.promise; const pdfDocument2 = await loadingTask2.promise; + const pdfDocument3 = await loadingTask3.promise; expect(pdfDocument1.numPages).toEqual(1); expect(pdfDocument2.numPages).toEqual(1); + expect(pdfDocument3.numPages).toEqual(1); - const page = await pdfDocument1.getPage(1); - expect(page instanceof PDFPageProxy).toEqual(true); + const pageA = await pdfDocument1.getPage(1); + expect(pageA instanceof PDFPageProxy).toEqual(true); - const opList = await page.getOperatorList(); - expect(opList.fnArray.length).toBeGreaterThan(5); - expect(opList.argsArray.length).toBeGreaterThan(5); - expect(opList.lastChunk).toEqual(true); - expect(opList.separateAnnots).toEqual(null); + const opListA = await pageA.getOperatorList(); + expect(opListA.fnArray.length).toBeGreaterThan(5); + expect(opListA.argsArray.length).toBeGreaterThan(5); + expect(opListA.lastChunk).toEqual(true); + expect(opListA.separateAnnots).toEqual(null); + + const pageB = await pdfDocument2.getPage(1); + expect(pageB instanceof PDFPageProxy).toEqual(true); + + const opListB = await pageB.getOperatorList(); + expect(opListB.fnArray.length).toBe(0); + expect(opListB.argsArray.length).toBe(0); + expect(opListB.lastChunk).toEqual(true); + expect(opListB.separateAnnots).toEqual(null); try { - await pdfDocument2.getPage(1); + await pdfDocument3.getPage(1); // Shouldn't get here. expect(false).toEqual(true); @@ -631,7 +646,11 @@ describe("api", function () { expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R"); } - await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); + await Promise.all([ + loadingTask1.destroy(), + loadingTask2.destroy(), + loadingTask3.destroy(), + ]); }); it("creates pdf doc from PDF files, with circular references", async function () { @@ -751,17 +770,8 @@ describe("api", function () { OpenAction: ["func=function(){app.alert(1)};func();"], }); - try { - await pdfDocument.getPage(1); - - // Shouldn't get here. - expect(false).toEqual(true); - } catch (reason) { - expect(reason instanceof UnknownErrorException).toEqual(true); - expect(reason.message).toEqual( - "Page dictionary kids object is not an array." - ); - } + const page = await pdfDocument.getPage(1); + expect(page instanceof PDFPageProxy).toEqual(true); await loadingTask.destroy(); });