From 080996ac6860ff7ba9718a01dad0c277585ff3dd Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 26 Nov 2021 19:47:13 +0100 Subject: [PATCH 1/2] Change the `_pagePromises` cache, in the worker, from an Array to a Map Given that not all pages necessarily are being accessed, or that the pages may be accessed out of order, using a `Map` seems like a more appropriate data-structure here. Furthermore, this patch also adds (currently missing) caching for XFA-documents. Loading a couple of such documents in the viewer, with logging added, shows that we're currently re-creating `Page`-instances unnecessarily for XFA-documents. --- src/core/document.js | 50 +++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index c379a09ae..d8b91f387 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -655,7 +655,7 @@ class PDFDocument { this.pdfManager = pdfManager; this.stream = stream; this.xref = new XRef(stream, pdfManager); - this._pagePromises = []; + this._pagePromises = new Map(); this._version = null; const idCounters = { @@ -1299,36 +1299,21 @@ class PDFDocument { } getPage(pageIndex) { - if (this._pagePromises[pageIndex] !== undefined) { - return this._pagePromises[pageIndex]; + const cachedPromise = this._pagePromises.get(pageIndex); + if (cachedPromise) { + return cachedPromise; } - const { catalog, linearization } = this; + const { catalog, linearization, xfaFactory } = this; - if (this.xfaFactory) { - return Promise.resolve( - new Page({ - pdfManager: this.pdfManager, - xref: this.xref, - pageIndex, - pageDict: Dict.empty, - ref: null, - globalIdFactory: this._globalIdFactory, - fontCache: catalog.fontCache, - builtInCMapCache: catalog.builtInCMapCache, - standardFontDataCache: catalog.standardFontDataCache, - globalImageCache: catalog.globalImageCache, - nonBlendModesSet: catalog.nonBlendModesSet, - xfaFactory: this.xfaFactory, - }) - ); + let promise; + if (xfaFactory) { + promise = Promise.resolve([Dict.empty, null]); + } else if (linearization && linearization.pageFirst === pageIndex) { + promise = this._getLinearizationPage(pageIndex); + } else { + promise = catalog.getPageDict(pageIndex); } - - const promise = - linearization && linearization.pageFirst === pageIndex - ? this._getLinearizationPage(pageIndex) - : catalog.getPageDict(pageIndex); - - return (this._pagePromises[pageIndex] = promise.then(([pageDict, ref]) => { + promise = promise.then(([pageDict, ref]) => { return new Page({ pdfManager: this.pdfManager, xref: this.xref, @@ -1341,9 +1326,12 @@ class PDFDocument { standardFontDataCache: catalog.standardFontDataCache, globalImageCache: catalog.globalImageCache, nonBlendModesSet: catalog.nonBlendModesSet, - xfaFactory: null, + xfaFactory, }); - })); + }); + + this._pagePromises.set(pageIndex, promise); + return promise; } checkFirstPage() { @@ -1352,7 +1340,7 @@ class PDFDocument { // Clear out the various caches to ensure that we haven't stored any // inconsistent and/or incorrect state, since that could easily break // subsequent `this.getPage` calls. - this._pagePromises.length = 0; + this._pagePromises.clear(); await this.cleanup(); throw new XRefParseException(); From 4c56214ab48dc2b3b379c88ffd3952961526152a Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 26 Nov 2021 19:57:47 +0100 Subject: [PATCH 2/2] Convert `PDFDocument._getLinearizationPage` to an async method This, ever so slightly, simplifies the code and reduces overall indentation. --- src/core/document.js | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index d8b91f387..3cee33389 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -1261,7 +1261,7 @@ class PDFDocument { ]); } - _getLinearizationPage(pageIndex) { + async _getLinearizationPage(pageIndex) { const { catalog, linearization } = this; if ( typeof PDFJSDev === "undefined" || @@ -1274,28 +1274,25 @@ class PDFDocument { } const ref = Ref.get(linearization.objectNumberFirst, 0); - return this.xref - .fetchAsync(ref) - .then(obj => { - // Ensure that the object that was found is actually a Page dictionary. - if ( - isDict(obj, "Page") || - (isDict(obj) && !obj.has("Type") && obj.has("Contents")) - ) { - if (ref && !catalog.pageKidsCountCache.has(ref)) { - catalog.pageKidsCountCache.put(ref, 1); // Cache the Page reference. - } - return [obj, ref]; + try { + const obj = await this.xref.fetchAsync(ref); + // Ensure that the object that was found is actually a Page dictionary. + if ( + isDict(obj, "Page") || + (isDict(obj) && !obj.has("Type") && obj.has("Contents")) + ) { + if (ref && !catalog.pageKidsCountCache.has(ref)) { + catalog.pageKidsCountCache.put(ref, 1); // Cache the Page reference. } - throw new FormatError( - "The Linearization dictionary doesn't point " + - "to a valid Page dictionary." - ); - }) - .catch(reason => { - info(reason); - return catalog.getPageDict(pageIndex); - }); + return [obj, ref]; + } + throw new FormatError( + "The Linearization dictionary doesn't point to a valid Page dictionary." + ); + } catch (reason) { + info(reason); + return catalog.getPageDict(pageIndex); + } } getPage(pageIndex) {