[api-minor] Validate the /Pages-tree /Count entry during document initialization (issue 14303)

*This patch basically extends the approach from PR 10392, by also checking the last page.* Currently, in e.g. the `Catalog.numPages`-getter, we're simply assuming that if the /Pages-tree has an *integer* /Count entry it must also be correct/valid. As can be seen in the referenced PDF documents, that entry may be completely bogus, which causes general parsing to break down elsewhere in the worker-thread (and hangs the browser). Rather than hoping that the /Count entry is correct, similar to all other data found in PDF documents, we obviously need to validate it. This turns out to be a little less straightforward than one would like, since the only way to do this (as far as I know) is to parse the *entire* /Pages-tree and essentially count the pages. To avoid doing that for all documents, this patch tries to take a short-cut by checking if the last page (based on the /Count entry) can be successfully fetched. If so, we assume that the /Count entry is correct and use it as-is, otherwise we'll iterate through (potentially) the *entire* /Pages-tree to determine the number of pages. Unfortunately these changes will have a number of *somewhat* negative side-effects (please see a possibly incomplete list below); however, I cannot see a better way to address this bug. - This will slow down initial loading/rendering of all documents, at least by some amount, since we now need to fetch/parse more of the /Pages-tree in order to be able to access the *last* page of the PDF documents. - For poorly generated PDF documents, where the entire /Pages-tree only has *one* level, we'll unfortunately need to fetch/parse the *entire* /Pages-tree to get to the last page. While there's a cache to help reduce repeated data lookups, this will affect initial loading/rendering of *some* long PDF documents. - This will affect the `disableAutoFetch = true` mode negatively, since we now need to fetch/parse more data during document initialization. 
While the `disableAutoFetch = true` mode should still be helpful in larger/longer PDF documents, for smaller ones the effect/usefulness may unfortunately be lost. As one *small* additional bonus, we should now also be able to support opening PDF documents where the /Pages-tree /Count entry is completely invalid (e.g. contains a non-integer value). Fixes two of the issues listed in issue 14303, namely the `poppler-67295-0.pdf` and `poppler-85140-0.pdf` documents.
2025-04-20 15:18:08 +02:00 · 2021-11-25 18:34:11 +01:00 · 2021-11-25 18:34:11 +01:00 · d0c4bbd828
commit d0c4bbd828
parent 9a1e27efc5
8 changed files with 215 additions and 16 deletions
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -490,3 +490,5 @@
 !PDFBOX-4352-0.pdf
 !REDHAT-1531897-0.pdf
 !xfa_issue14315.pdf
+!poppler-67295-0.pdf
+!poppler-85140-0.pdf
--- a/test/pdfs/poppler-67295-0.pdf
+++ b/test/pdfs/poppler-67295-0.pdf
@ -0,0 +1,40 @@
+%PDF-1.2
+1 0 obj
+<</Type /Catalog /Outlines 2 0 R /Pages 6 0 R>>
+endobj
+2 0 obj
+<</Type /Outlines /Count 0>>
+endobj
+3 0 obj
+<</Length 44>>
+stream
+BT /F1 24 Tf 20 750 Td (TestString123) Tj ET
+endstream
+endobj
+4 0 obj
+[/PDF /Text]
+endobj
+5 0 obj
+<</Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica /Encoding /MacRomanEncoding>>
+endobj
+6 0 obj
+<</Type /Pages /Kids [7 0 R] /Count 9999999999>>
+endobj
+7 0 obj
+<</Type /Page /Parent 6 0 R /MediaBox [0 0 612 792] /Contents 3 0 R /Resources <</ProcSet 4 0 R /Font <</F1 5 0 R>>>>>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000010 00000 n
+0000000076 00000 n
+0000000123 00000 n
+0000000221 00000 n
+0000000252 00000 n
+0000000361 00000 n
+0000000428 00000 n
+trailer
+<</Size 8 /Root 1 0 R>>
+startxref
+566
+%%EOF
--- a/test/pdfs/poppler-85140-0.pdf
+++ b/test/pdfs/poppler-85140-0.pdf
@ -0,0 +1,36 @@
+%PDF-1.4
+1 0 obj
+<<
+	/Type /Catalog
+	/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+	/Type /Pages
+	/Kids [3 0 R]
+	/Count 213804087
+>>
+endobj
+3 18446744073709551616 obj
+<<
+	/Type /Page
+	/Parent 2 0 R
+	/MediaBox [0 0 595 2147483647]
+	/Contents 4 0 R
+>>
+endobj
+4 233245 obj
+<<
+>>
+stream
+endstream
+endobj
+5 0 obj
+>>
+endobj
+trailer
+<< 
+	/Root 1 0 R
+>>
+%%EOF
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -25,6 +25,7 @@ import {
  PasswordResponses,
  PermissionFlag,
  StreamType,
+  UnknownErrorException,
 } from "../../src/shared/util.js";
 import {
  buildGetDocumentParams,
@ -478,6 +479,38 @@ describe("api", function () {

      await loadingTask.destroy();
    });
+
+    it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () {
+      const loadingTask1 = getDocument(
+        buildGetDocumentParams("poppler-67295-0.pdf")
+      );
+      const loadingTask2 = getDocument(
+        buildGetDocumentParams("poppler-85140-0.pdf")
+      );
+      expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
+      expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
+
+      const pdfDocument1 = await loadingTask1.promise;
+      const pdfDocument2 = await loadingTask2.promise;
+
+      expect(pdfDocument1.numPages).toEqual(1);
+      expect(pdfDocument2.numPages).toEqual(1);
+
+      const pageA = await pdfDocument1.getPage(1);
+      expect(pageA instanceof PDFPageProxy).toEqual(true);
+
+      try {
+        await pdfDocument2.getPage(1);
+
+        // Shouldn't get here.
+        expect(false).toEqual(true);
+      } catch (reason) {
+        expect(reason instanceof UnknownErrorException).toEqual(true);
+        expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
+      }
+
+      await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
+    });
  });

  describe("PDFWorker", function () {
@ -683,7 +716,7 @@ describe("api", function () {
            throw new Error("shall fail for invalid page");
          },
          function (reason) {
-            expect(reason instanceof Error).toEqual(true);
+            expect(reason instanceof UnknownErrorException).toEqual(true);
            expect(reason.message).toEqual(
              "Pages tree contains circular reference."
            );
@ -724,7 +757,10 @@ describe("api", function () {
        // Shouldn't get here.
        expect(false).toEqual(true);
      } catch (reason) {
-        expect(reason instanceof Error).toEqual(true);
+        expect(reason instanceof UnknownErrorException).toEqual(true);
+        expect(reason.message).toEqual(
+          "The reference does not point to a /Page dictionary."
+        );
      }
    });