From 280207c7402342ad7a1334dc40a077967f1080ef Mon Sep 17 00:00:00 2001
From: Tim van der Meij
Date: Sun, 23 Aug 2020 14:04:49 +0200
Subject: [PATCH] Redo the form type detection logic and include unit tests

Good form type detection is important to get reliable telemetry and to only
show the fallback bar if a form cannot be filled out by the user.

PDF.js only supports AcroForm data, so XFA data is explicitly unsupported
(tracked in issue #2373). However, the previous form type detection
couldn't separate AcroForm and XFA well enough, causing form type
telemetry to be incorrect sometimes and the fallback bar to be shown for
forms that could in fact be filled out by the user.

The solution in this commit is found by studying the specification and
the form documents that are available to us. In a nutshell the rules are:

- There is XFA data if the `XFA` entry is a non-empty array or stream.
- There is AcroForm data if the `Fields` entry is a non-empty array and it
  doesn't consist of only document signatures.

The document signatures part was not handled in the old code, causing a
document with only XFA data to also be marked as having AcroForm data.
Moreover, the old code didn't check all the data types.

Now that AcroForm and XFA can be distinguished, the viewer is configured
to only show the fallback bar for documents that only have XFA data. If a
document also has AcroForm data, the viewer can use that to render the
form. We have not found documents where the XFA data was necessary in that
case.

Finally, we include unit tests to ensure that all cases are covered and
move the form type detection out of the `parse` function so that it's
only executed if the document information is actually requested
(potentially making initial parsing a tiny bit faster).
---
 src/core/document.js       |  85 ++++++++++++++++++----
 test/unit/document_spec.js | 142 ++++++++++++++++++++++++++++++++++++-
 web/app.js                 |  12 ++--
 3 files changed, 221 insertions(+), 18 deletions(-)

diff --git a/src/core/document.js b/src/core/document.js
index 98ef34638..c970399da 100644
--- a/src/core/document.js
+++ b/src/core/document.js
@@ -582,16 +582,6 @@ class PDFDocument {
     if (this.catalog.version) {
       this._version = this.catalog.version;
     }
-
-    // Check if AcroForms are present in the document.
-    this._hasAcroForm = !!this.catalog.acroForm;
-    if (this._hasAcroForm) {
-      this.xfa = this.catalog.acroForm.get("XFA");
-      const fields = this.catalog.acroForm.get("Fields");
-      if ((!Array.isArray(fields) || fields.length === 0) && !this.xfa) {
-        this._hasAcroForm = false; // No fields and no XFA, so it's not a form.
-      }
-    }
   }
 
   get linearization() {
@@ -697,6 +687,77 @@ class PDFDocument {
     return shadow(this, "numPages", num);
   }
 
+  /**
+   * Check whether all of the given fields, including their `Kids`, are
+   * invisible document signatures.
+   *
+   * @private
+   */
+  _hasOnlyDocumentSignatures(fields, recursionDepth = 0) {
+    const RECURSION_LIMIT = 10;
+    return fields.every(field => {
+      field = this.xref.fetchIfRef(field);
+      if (field.has("Kids")) {
+        // Compute the depth per branch; incrementing the shared parameter
+        // would count sibling fields with `Kids` instead of nesting levels.
+        const kidsDepth = recursionDepth + 1;
+        if (kidsDepth > RECURSION_LIMIT) {
+          warn("_hasOnlyDocumentSignatures: maximum recursion depth reached");
+          return false;
+        }
+        return this._hasOnlyDocumentSignatures(field.get("Kids"), kidsDepth);
+      }
+      const isSignature = isName(field.get("FT"), "Sig");
+      const rectangle = field.get("Rect");
+      const isInvisible =
+        Array.isArray(rectangle) && rectangle.every(value => value === 0);
+      return isSignature && isInvisible;
+    });
+  }
+
+  /**
+   * Detect which kinds of form data (AcroForm and/or XFA) are present.
+   */
+  get formInfo() {
+    const formInfo = { hasAcroForm: false, hasXfa: false };
+    const acroForm = this.catalog.acroForm;
+    if (!acroForm) {
+      return shadow(this, "formInfo", formInfo);
+    }
+
+    try {
+      // The document contains XFA data if the `XFA` entry is a non-empty
+      // array or stream.
+      const xfa = acroForm.get("XFA");
+      const hasXfa =
+        (Array.isArray(xfa) && xfa.length > 0) ||
+        (isStream(xfa) && !xfa.isEmpty);
+      formInfo.hasXfa = hasXfa;
+
+      // The document contains AcroForm data if the `Fields` entry is a
+      // non-empty array and it doesn't consist of only document signatures.
+      // This second check is required for files that don't actually contain
+      // AcroForm data (only XFA data), but that use the `Fields` entry to
+      // store (invisible) document signatures. This can be detected using
+      // the first bit of the `SigFlags` integer (see Table 219 in the
+      // specification).
+      const fields = acroForm.get("Fields");
+      const hasFields = Array.isArray(fields) && fields.length > 0;
+      const sigFlags = acroForm.get("SigFlags");
+      const hasOnlyDocumentSignatures =
+        hasFields &&
+        !!(sigFlags & 0x1) &&
+        this._hasOnlyDocumentSignatures(fields);
+      formInfo.hasAcroForm = hasFields && !hasOnlyDocumentSignatures;
+    } catch (ex) {
+      if (ex instanceof MissingDataException) {
+        throw ex;
+      }
+      info("Cannot fetch form information.");
+    }
+    return shadow(this, "formInfo", formInfo);
+  }
+
   get documentInfo() {
     const DocumentInfoValidators = {
       Title: isString,
@@ -722,8 +783,8 @@ class PDFDocument {
     const docInfo = {
       PDFFormatVersion: version,
       IsLinearized: !!this.linearization,
-      IsAcroFormPresent: this._hasAcroForm,
-      IsXFAPresent: !!this.xfa,
+      IsAcroFormPresent: this.formInfo.hasAcroForm,
+      IsXFAPresent: this.formInfo.hasXfa,
       IsCollectionPresent: !!this.catalog.collection,
     };
 
diff --git a/test/unit/document_spec.js b/test/unit/document_spec.js
index 503a3ce95..0586898d7 100644
--- a/test/unit/document_spec.js
+++ b/test/unit/document_spec.js
@@ -13,7 +13,10 @@
  * limitations under the License.
  */
 
-import { createIdFactory } from "./test_utils.js";
+import { createIdFactory, XRefMock } from "./test_utils.js";
+import { Dict, Name, Ref } from "../../src/core/primitives.js";
+import { PDFDocument } from "../../src/core/document.js";
+import { StringStream } from "../../src/core/stream.js";
 
 describe("document", function () {
   describe("Page", function () {
@@ -40,4 +43,141 @@ describe("document", function () {
       expect(idFactory1.getDocId()).toEqual("g_d0");
     });
   });
+
+  describe("PDFDocument", function () {
+    const pdfManager = {
+      get docId() {
+        return "d0";
+      },
+    };
+    const stream = new StringStream("Dummy_PDF_data");
+
+    function getDocument(acroForm) {
+      const pdfDocument = new PDFDocument(pdfManager, stream);
+      pdfDocument.catalog = { acroForm };
+      return pdfDocument;
+    }
+
+    it("should get form info when no form data is present", function () {
+      const pdfDocument = getDocument(null);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: false,
+      });
+    });
+
+    it("should get form info when XFA is present", function () {
+      const acroForm = new Dict();
+
+      // The `XFA` entry can only be a non-empty array or stream.
+      acroForm.set("XFA", []);
+      let pdfDocument = getDocument(acroForm);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: false,
+      });
+
+      acroForm.set("XFA", ["foo", "bar"]);
+      pdfDocument = getDocument(acroForm);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: true,
+      });
+
+      acroForm.set("XFA", new StringStream(""));
+      pdfDocument = getDocument(acroForm);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: false,
+      });
+
+      acroForm.set("XFA", new StringStream("non-empty"));
+      pdfDocument = getDocument(acroForm);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: true,
+      });
+    });
+
+    it("should get form info when AcroForm is present", function () {
+      const acroForm = new Dict();
+
+      // The `Fields` entry can only be a non-empty array.
+      acroForm.set("Fields", []);
+      let pdfDocument = getDocument(acroForm);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: false,
+      });
+
+      acroForm.set("Fields", ["foo", "bar"]);
+      pdfDocument = getDocument(acroForm);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: true,
+        hasXfa: false,
+      });
+
+      // If the first bit of the `SigFlags` entry is set and the `Fields` array
+      // only contains document signatures, then there is no AcroForm data.
+      acroForm.set("Fields", ["foo", "bar"]);
+      acroForm.set("SigFlags", 2);
+      pdfDocument = getDocument(acroForm);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: true,
+        hasXfa: false,
+      });
+
+      const annotationDict = new Dict();
+      annotationDict.set("FT", Name.get("Sig"));
+      annotationDict.set("Rect", [0, 0, 0, 0]);
+      const annotationRef = Ref.get(11, 0);
+
+      const kidsDict = new Dict();
+      kidsDict.set("Kids", [annotationRef]);
+      const kidsRef = Ref.get(10, 0);
+
+      const xref = new XRefMock([
+        { ref: annotationRef, data: annotationDict },
+        { ref: kidsRef, data: kidsDict },
+      ]);
+
+      acroForm.set("Fields", [kidsRef]);
+      acroForm.set("SigFlags", 3);
+      pdfDocument = getDocument(acroForm);
+      // `getDocument` returns a fresh document, so attach the mock to it.
+      pdfDocument.xref = xref;
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: false,
+      });
+    });
+
+    it("should get form info for many sibling fields", function () {
+      // More siblings with `Kids` than the recursion limit allows levels.
+      const signatureDict = new Dict();
+      signatureDict.set("FT", Name.get("Sig"));
+      signatureDict.set("Rect", [0, 0, 0, 0]);
+      const signatureRef = Ref.get(20, 0);
+
+      const entries = [{ ref: signatureRef, data: signatureDict }];
+      const fields = [];
+      for (let i = 0; i < 12; i++) {
+        const parentDict = new Dict();
+        parentDict.set("Kids", [signatureRef]);
+        const parentRef = Ref.get(21 + i, 0);
+        entries.push({ ref: parentRef, data: parentDict });
+        fields.push(parentRef);
+      }
+
+      const acroForm = new Dict();
+      acroForm.set("Fields", fields);
+      acroForm.set("SigFlags", 3);
+      const pdfDocument = getDocument(acroForm);
+      pdfDocument.xref = new XRefMock(entries);
+      expect(pdfDocument.formInfo).toEqual({
+        hasAcroForm: false,
+        hasXfa: false,
+      });
+    });
+  });
 });
diff --git a/web/app.js b/web/app.js
index 9bbe26deb..1ba89133d 100644
--- a/web/app.js
+++ b/web/app.js
@@ -1426,14 +1426,14 @@ const PDFViewerApplication = {
           this.setTitle(contentDispositionFilename);
         }
 
-        if (info.IsXFAPresent) {
+        if (info.IsXFAPresent && !info.IsAcroFormPresent) {
           console.warn("Warning: XFA is not supported");
           this._delayedFallback(UNSUPPORTED_FEATURES.forms);
         } else if (
-          info.IsAcroFormPresent &&
+          (info.IsAcroFormPresent || info.IsXFAPresent) &&
           !this.pdfViewer.renderInteractiveForms
         ) {
-          console.warn("Warning: AcroForm support is not enabled");
+          console.warn("Warning: Interactive form support is not enabled");
           this._delayedFallback(UNSUPPORTED_FEATURES.forms);
         }
 
@@ -1454,8 +1454,10 @@ const PDFViewerApplication = {
             });
           }
           let formType = null;
-          if (info.IsAcroFormPresent) {
-            formType = info.IsXFAPresent ? "xfa" : "acroform";
+          if (info.IsXFAPresent) {
+            formType = "xfa";
+          } else if (info.IsAcroFormPresent) {
+            formType = "acroform";
           }
           this.externalServices.reportTelemetry({
             type: "documentInfo",