mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-20 15:18:08 +02:00
[api-minor] Validate the /Pages-tree /Count entry during document initialization (issue 14303)
*This patch basically extends the approach from PR 10392, by also checking the last page.* Currently, in e.g. the `Catalog.numPages`-getter, we're simply assuming that if the /Pages-tree has an *integer* /Count entry it must also be correct/valid. As can be seen in the referenced PDF documents, that entry may be completely bogus, which causes general parsing to break down elsewhere in the worker-thread (and hangs the browser). Rather than hoping that the /Count entry is correct, we obviously need to validate it, just like all other data found in PDF documents. This turns out to be a little less straightforward than one would like, since the only way to do this (as far as I know) is to parse the *entire* /Pages-tree and essentially count the pages. To avoid doing that for all documents, this patch tries to take a short-cut by checking if the last page (based on the /Count entry) can be successfully fetched. If so, we assume that the /Count entry is correct and use it as-is; otherwise we'll iterate through (potentially) the *entire* /Pages-tree to determine the number of pages. Unfortunately, these changes will have a number of *somewhat* negative side-effects (please see a possibly incomplete list below); however, I cannot see a better way to address this bug. - This will slow down initial loading/rendering of all documents, at least by some amount, since we now need to fetch/parse more of the /Pages-tree in order to be able to access the *last* page of the PDF documents. - For poorly generated PDF documents, where the entire /Pages-tree only has *one* level, we'll unfortunately need to fetch/parse the *entire* /Pages-tree to get to the last page. While there's a cache to help reduce repeated data lookups, this will affect initial loading/rendering of *some* long PDF documents. - This will affect the `disableAutoFetch = true` mode negatively, since we now need to fetch/parse more data during document initialization. 
While the `disableAutoFetch = true` mode should still be helpful in larger/longer PDF documents, for smaller ones the effect/usefulness may unfortunately be lost. As one *small* additional bonus, we should now also be able to support opening PDF documents where the /Pages-tree /Count entry is completely invalid (e.g. contains a non-integer value). Fixes two of the issues listed in issue 14303, namely the `poppler-67295-0.pdf` and `poppler-85140-0.pdf` documents.
This commit is contained in:
parent
9a1e27efc5
commit
d0c4bbd828
8 changed files with 215 additions and 16 deletions
2
test/pdfs/.gitignore
vendored
2
test/pdfs/.gitignore
vendored
|
@ -490,3 +490,5 @@
|
|||
!PDFBOX-4352-0.pdf
|
||||
!REDHAT-1531897-0.pdf
|
||||
!xfa_issue14315.pdf
|
||||
!poppler-67295-0.pdf
|
||||
!poppler-85140-0.pdf
|
||||
|
|
40
test/pdfs/poppler-67295-0.pdf
Normal file
40
test/pdfs/poppler-67295-0.pdf
Normal file
|
@ -0,0 +1,40 @@
|
|||
%PDF-1.2
|
||||
1 0 obj
|
||||
<</Type /Catalog /Outlines 2 0 R /Pages 6 0 R>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<</Type /Outlines /Count 0>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<</Length 44>>
|
||||
stream
|
||||
BT /F1 24 Tf 20 750 Td (TestString123) Tj ET
|
||||
endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
[/PDF /Text]
|
||||
endobj
|
||||
5 0 obj
|
||||
<</Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica /Encoding /MacRomanEncoding>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<</Type /Pages /Kids [7 0 R] /Count 9999999999>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<</Type /Page /Parent 6 0 R /MediaBox [0 0 612 792] /Contents 3 0 R /Resources <</ProcSet 4 0 R /Font <</F1 5 0 R>>>>>>
|
||||
endobj
|
||||
xref
|
||||
0 8
|
||||
0000000000 65535 f
|
||||
0000000010 00000 n
|
||||
0000000076 00000 n
|
||||
0000000123 00000 n
|
||||
0000000221 00000 n
|
||||
0000000252 00000 n
|
||||
0000000361 00000 n
|
||||
0000000428 00000 n
|
||||
trailer
|
||||
<</Size 8 /Root 1 0 R>>
|
||||
startxref
|
||||
566
|
||||
%%EOF
|
36
test/pdfs/poppler-85140-0.pdf
Normal file
36
test/pdfs/poppler-85140-0.pdf
Normal file
|
@ -0,0 +1,36 @@
|
|||
%PDF-1.4
|
||||
1 0 obj
|
||||
<<
|
||||
/Type /Catalog
|
||||
/Pages 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/Type /Pages
|
||||
/Kids [3 0 R]
|
||||
/Count 213804087
|
||||
>>
|
||||
endobj
|
||||
3 18446744073709551616 obj
|
||||
<<
|
||||
/Type /Page
|
||||
/Parent 2 0 R
|
||||
/MediaBox [0 0 595 2147483647]
|
||||
/Contents 4 0 R
|
||||
>>
|
||||
endobj
|
||||
4 233245 obj
|
||||
<<
|
||||
>>
|
||||
stream
|
||||
endstream
|
||||
endobj
|
||||
5 0 obj
|
||||
>>
|
||||
endobj
|
||||
trailer
|
||||
<<
|
||||
/Root 1 0 R
|
||||
>>
|
||||
%%EOF
|
|
@ -25,6 +25,7 @@ import {
|
|||
PasswordResponses,
|
||||
PermissionFlag,
|
||||
StreamType,
|
||||
UnknownErrorException,
|
||||
} from "../../src/shared/util.js";
|
||||
import {
|
||||
buildGetDocumentParams,
|
||||
|
@ -478,6 +479,38 @@ describe("api", function () {
|
|||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () {
|
||||
const loadingTask1 = getDocument(
|
||||
buildGetDocumentParams("poppler-67295-0.pdf")
|
||||
);
|
||||
const loadingTask2 = getDocument(
|
||||
buildGetDocumentParams("poppler-85140-0.pdf")
|
||||
);
|
||||
expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
|
||||
const pdfDocument1 = await loadingTask1.promise;
|
||||
const pdfDocument2 = await loadingTask2.promise;
|
||||
|
||||
expect(pdfDocument1.numPages).toEqual(1);
|
||||
expect(pdfDocument2.numPages).toEqual(1);
|
||||
|
||||
const pageA = await pdfDocument1.getPage(1);
|
||||
expect(pageA instanceof PDFPageProxy).toEqual(true);
|
||||
|
||||
try {
|
||||
await pdfDocument2.getPage(1);
|
||||
|
||||
// Shouldn't get here.
|
||||
expect(false).toEqual(true);
|
||||
} catch (reason) {
|
||||
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
|
||||
}
|
||||
|
||||
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
|
||||
});
|
||||
});
|
||||
|
||||
describe("PDFWorker", function () {
|
||||
|
@ -683,7 +716,7 @@ describe("api", function () {
|
|||
throw new Error("shall fail for invalid page");
|
||||
},
|
||||
function (reason) {
|
||||
expect(reason instanceof Error).toEqual(true);
|
||||
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||
expect(reason.message).toEqual(
|
||||
"Pages tree contains circular reference."
|
||||
);
|
||||
|
@ -724,7 +757,10 @@ describe("api", function () {
|
|||
// Shouldn't get here.
|
||||
expect(false).toEqual(true);
|
||||
} catch (reason) {
|
||||
expect(reason instanceof Error).toEqual(true);
|
||||
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||
expect(reason.message).toEqual(
|
||||
"The reference does not point to a /Page dictionary."
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue