[Regression] Eagerly fetch/parse the entire /Pages-tree in corrupt documents (issue 14303, PR 14311 follow-up)

*Please note:* This is similar to the method that existed prior to PR 3848, but the new method will *only* be used as a fallback when parsing of corrupt PDF documents. The implementation in PR 14311 unfortunately turned out to be *way* too simplistic, as evidenced by the recently added test-files in issue 14303, since it may *cause* infinite loops in `PDFDocument.checkLastPage` for some corrupt PDF documents.[1] To avoid this, the easiest solution that I could come up with was to fall back to eagerly parsing the *entire* /Pages-tree when the /Count-entry validation fails during document initialization. Fixes *at least* two of the issues listed in issue 14303, namely the `poppler-395-0.pdf...` and `GHOSTSCRIPT-698804-1.pdf...` documents. --- [1] The whole point of PR 14311 was obviously to *get rid of* infinite loops during document initialization, not to introduce any more of those.
2025-04-20 15:18:08 +02:00 · 2021-12-02 01:40:52 +01:00 · 2021-12-02 01:40:52 +01:00 · 1fac6371d3
commit 1fac6371d3
parent f61b74e38e
7 changed files with 504 additions and 35 deletions
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -492,6 +492,8 @@
 !xfa_issue14315.pdf
 !poppler-67295-0.pdf
 !poppler-85140-0.pdf
+!poppler-395-0-fuzzed.pdf
+!GHOSTSCRIPT-698804-1-fuzzed.pdf
 !poppler-91414-0-53.pdf
 !poppler-91414-0-54.pdf
 !poppler-742-0-fuzzed.pdf
--- a/test/pdfs/GHOSTSCRIPT-698804-1-fuzzed.pdf
+++ b/test/pdfs/GHOSTSCRIPT-698804-1-fuzzed.pdf
@ -0,0 +1,69 @@
+%PDF-1.4
+%âãÏÓ
+
+1 0 obj
+<<
+  /Type /Catalog
+  /Outline 2 0 R
+  /Pages 3 0 R
+>>
+endobj
+
+2 0 obj
+<<
+  /Type /Outlines
+  /Count 0
+>>
+endobj
+
+3 0 obj
+<<
+  /Type /Pages
+  /Kids [ 4 0 R ]
+  /Count 1
+>>
+endobj
+
+4 0 obj
+<<
+  /Type /Page
+  /Parent 3 0 R
+  /MediaBox [ 0 0 612 792 ]
+  /Contents 5 0 R
+  /Resources <<
+    /ProcSet 6 0 R
+  >>
+>>
+endobj
+
+5 0 obj
+<<
+  /Length 0
+>>
+stream
+endstream
+endobj
+
+6 0 obj
+[ /PDF ]
+endobj
+
+xref
+0 2
+0000000000 65536 f 
+0000000016 00000 n 
+00000004294967296 3
+0000000138 00000 n 
+0000000204 00000 n 
+0000000342 00000 n 
+                   
+
+trailer
+<<
+  /Size 7
+  /Root 1 0 R
+>>
+
+startxref
+418
+%%EOF
--- a/test/pdfs/poppler-395-0-fuzzed.pdf
+++ b/test/pdfs/poppler-395-0-fuzzed.pdf
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@ -495,14 +495,27 @@ describe("api", function () {
      const loadingTask2 = getDocument(
        buildGetDocumentParams("poppler-85140-0.pdf")
      );
+      const loadingTask3 = getDocument(
+        buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
+      );
+      const loadingTask4 = getDocument(
+        buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
+      );
+
      expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
      expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
+      expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true);
+      expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true);

      const pdfDocument1 = await loadingTask1.promise;
      const pdfDocument2 = await loadingTask2.promise;
+      const pdfDocument3 = await loadingTask3.promise;
+      const pdfDocument4 = await loadingTask4.promise;

      expect(pdfDocument1.numPages).toEqual(1);
      expect(pdfDocument2.numPages).toEqual(1);
+      expect(pdfDocument3.numPages).toEqual(1);
+      expect(pdfDocument4.numPages).toEqual(1);

      const pageA = await pdfDocument1.getPage(1);
      expect(pageA instanceof PDFPageProxy).toEqual(true);
@ -516,6 +529,28 @@ describe("api", function () {
        expect(reason instanceof UnknownErrorException).toEqual(true);
        expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
      }
+      try {
+        await pdfDocument3.getPage(1);
+
+        // Shouldn't get here.
+        expect(false).toEqual(true);
+      } catch (reason) {
+        expect(reason instanceof UnknownErrorException).toEqual(true);
+        expect(reason.message).toEqual(
+          "Page dictionary kid reference points to wrong type of object."
+        );
+      }
+      try {
+        await pdfDocument4.getPage(1);
+
+        // Shouldn't get here.
+        expect(false).toEqual(true);
+      } catch (reason) {
+        expect(reason instanceof UnknownErrorException).toEqual(true);
+        expect(reason.message).toEqual(
+          "Page dictionary kid reference points to wrong type of object."
+        );
+      }

      await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
    });