From 512aa50fddc336552b5c5b63739f620e778451db Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 29 Jan 2023 12:21:21 +0100 Subject: [PATCH 1/3] Re-factor the parameter parsing/validation in `getDocument` This is very old code, where we loop through the user-provided options and build an internal parameter object. To prevent errors we also need to ensure that the parameters are correct/valid, which is especially important for the ones that are sent to the worker-thread such that structured cloning won't fail.[1] Over the years this has led to more and more code being added in `getDocument` to validate the user-provided options, and at this point *most* of them have at least basic validation. However the way that this is implemented feels slightly backwards, since we first build the internal parameter object and only *afterwards* validate those parameters.[2] Hence this patch changes the `getDocument` function to instead check/validate the supported options upfront, and then *explicitly* build the internal parameter object with only the needed properties. --- [1] Note the supported types at https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Structured_clone_algorithm#supported_types [2] The internal parameter object may also, because of the loop, end up with lots of unnecessary properties since anything that the user provides is being copied. --- src/display/api.js | 344 ++++++++++++++++++++++----------------------- web/app.js | 10 +- 2 files changed, 174 insertions(+), 180 deletions(-) diff --git a/src/display/api.js b/src/display/api.js index 5cf581c7b..2a84f2972 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -274,157 +274,82 @@ function getDocument(src) { } const task = new PDFDocumentLoadingTask(); - const params = Object.create(null); - let rangeTransport = null, - worker = null; + const url = src.url ? getUrlProp(src.url) : null; + const data = src.data ? getDataProp(src.data) : null; + const httpHeaders = src.httpHeaders || null; + const withCredentials = src.withCredentials === true; + const password = src.password ?? null; + const rangeTransport = + src.range instanceof PDFDataRangeTransport ? src.range : null; + const rangeChunkSize = + Number.isInteger(src.rangeChunkSize) && src.rangeChunkSize > 0 + ? src.rangeChunkSize + : DEFAULT_RANGE_CHUNK_SIZE; + let worker = src.worker instanceof PDFWorker ? src.worker : null; + const verbosity = src.verbosity; + // Ignore "data:"-URLs, since they can't be used to recover valid absolute + // URLs anyway. We want to avoid sending them to the worker-thread, since + // they contain the *entire* PDF document and can thus be arbitrarily long. + const docBaseUrl = + typeof src.docBaseUrl === "string" && !isDataScheme(src.docBaseUrl) + ? src.docBaseUrl + : null; + const cMapUrl = typeof src.cMapUrl === "string" ? src.cMapUrl : null; + const cMapPacked = src.cMapPacked !== false; + const CMapReaderFactory = src.CMapReaderFactory || DefaultCMapReaderFactory; + const standardFontDataUrl = + typeof src.standardFontDataUrl === "string" + ? src.standardFontDataUrl + : null; + const StandardFontDataFactory = + src.StandardFontDataFactory || DefaultStandardFontDataFactory; + const ignoreErrors = src.stopAtErrors !== true; + const maxImageSize = + Number.isInteger(src.maxImageSize) && src.maxImageSize > -1 + ? src.maxImageSize + : -1; + const isEvalSupported = src.isEvalSupported !== false; + const isOffscreenCanvasSupported = + typeof src.isOffscreenCanvasSupported === "boolean" + ? src.isOffscreenCanvasSupported + : !isNodeJS; + const disableFontFace = + typeof src.disableFontFace === "boolean" ? src.disableFontFace : isNodeJS; + const fontExtraProperties = src.fontExtraProperties === true; + const enableXfa = src.enableXfa === true; + const ownerDocument = src.ownerDocument || globalThis.document; + const disableRange = src.disableRange === true; + const disableStream = src.disableStream === true; + const disableAutoFetch = src.disableAutoFetch === true; + const pdfBug = src.pdfBug === true; - for (const key in src) { - const val = src[key]; + // Parameters whose default values depend on other parameters. + const length = rangeTransport ? rangeTransport.length : src.length ?? NaN; + const useSystemFonts = + typeof src.useSystemFonts === "boolean" + ? src.useSystemFonts + : !isNodeJS && !disableFontFace; + const useWorkerFetch = + typeof src.useWorkerFetch === "boolean" + ? src.useWorkerFetch + : (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) || + (CMapReaderFactory === DOMCMapReaderFactory && + StandardFontDataFactory === DOMStandardFontDataFactory && + isValidFetchUrl(cMapUrl, document.baseURI) && + isValidFetchUrl(standardFontDataUrl, document.baseURI)); - switch (key) { - case "url": - if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) { - continue; // The 'url' is unused with `PDFDataRangeTransport`. - } else { - if (val instanceof URL) { - params[key] = val.href; - continue; - } - try { - // The full path is required in the 'url' field. - params[key] = new URL(val, window.location).href; - continue; - } catch (ex) { - if ( - typeof PDFJSDev !== "undefined" && - PDFJSDev.test("GENERIC") && - isNodeJS && - typeof val === "string" - ) { - break; // Use the url as-is in Node.js environments. - } - } - throw new Error( - "Invalid PDF url data: " + - "either string or URL-object is expected in the url property." - ); - } - case "range": - rangeTransport = val; - continue; - case "worker": - worker = val; - continue; - case "data": - // Converting string or array-like data to Uint8Array. - if ( - typeof PDFJSDev !== "undefined" && - PDFJSDev.test("GENERIC") && - isNodeJS && - typeof Buffer !== "undefined" && // eslint-disable-line no-undef - val instanceof Buffer // eslint-disable-line no-undef - ) { - params[key] = new Uint8Array(val); - } else if ( - val instanceof Uint8Array && - val.byteLength === val.buffer.byteLength - ) { - // Use the data as-is when it's already a Uint8Array that completely - // "utilizes" its underlying ArrayBuffer, to prevent any possible - // issues when transferring it to the worker-thread. - break; - } else if (typeof val === "string") { - params[key] = stringToBytes(val); - } else if ( - (typeof val === "object" && val !== null && !isNaN(val.length)) || - isArrayBuffer(val) - ) { - params[key] = new Uint8Array(val); - } else { - throw new Error( - "Invalid PDF binary data: either TypedArray, " + - "string, or array-like object is expected in the data property." - ); - } - continue; - } - params[key] = val; - } - - params.cMapPacked = params.cMapPacked !== false; - params.CMapReaderFactory = - params.CMapReaderFactory || DefaultCMapReaderFactory; - params.StandardFontDataFactory = - params.StandardFontDataFactory || DefaultStandardFontDataFactory; - params.ignoreErrors = params.stopAtErrors !== true; - params.fontExtraProperties = params.fontExtraProperties === true; - params.pdfBug = params.pdfBug === true; - params.enableXfa = params.enableXfa === true; - - if (!Number.isInteger(params.rangeChunkSize) || params.rangeChunkSize < 1) { - params.rangeChunkSize = DEFAULT_RANGE_CHUNK_SIZE; - } - if ( - typeof params.docBaseUrl !== "string" || - isDataScheme(params.docBaseUrl) - ) { - // Ignore "data:"-URLs, since they can't be used to recover valid absolute - // URLs anyway. We want to avoid sending them to the worker-thread, since - // they contain the *entire* PDF document and can thus be arbitrarily long. - params.docBaseUrl = null; - } - if (!Number.isInteger(params.maxImageSize) || params.maxImageSize < -1) { - params.maxImageSize = -1; - } - if (typeof params.cMapUrl !== "string") { - params.cMapUrl = null; - } - if (typeof params.standardFontDataUrl !== "string") { - params.standardFontDataUrl = null; - } - if (typeof params.useWorkerFetch !== "boolean") { - params.useWorkerFetch = - (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) || - (params.CMapReaderFactory === DOMCMapReaderFactory && - params.StandardFontDataFactory === DOMStandardFontDataFactory && - isValidFetchUrl(params.cMapUrl, document.baseURI) && - isValidFetchUrl(params.standardFontDataUrl, document.baseURI)); - } - if (typeof params.isEvalSupported !== "boolean") { - params.isEvalSupported = true; - } - if (typeof params.isOffscreenCanvasSupported !== "boolean") { - params.isOffscreenCanvasSupported = !isNodeJS; - } - if (typeof params.disableFontFace !== "boolean") { - params.disableFontFace = isNodeJS; - } - if (typeof params.useSystemFonts !== "boolean") { - params.useSystemFonts = !isNodeJS && !params.disableFontFace; - } - if ( - typeof params.ownerDocument !== "object" || - params.ownerDocument === null - ) { - params.ownerDocument = globalThis.document; - } - - if (typeof params.disableRange !== "boolean") { - params.disableRange = false; - } - if (typeof params.disableStream !== "boolean") { - params.disableStream = false; - } - if (typeof params.disableAutoFetch !== "boolean") { - params.disableAutoFetch = false; - } + // Parameters only intended for development/testing purposes. + const styleElement = + typeof PDFJSDev === "undefined" || PDFJSDev.test("TESTING") + ? src.styleElement + : null; // Set the main-thread verbosity level. - setVerbosityLevel(params.verbosity); + setVerbosityLevel(verbosity); if (!worker) { const workerParams = { - verbosity: params.verbosity, + verbosity, port: GlobalWorkerOptions.workerPort, }; // Worker was not provided -- creating and owning our own. If message port @@ -435,44 +360,67 @@ function getDocument(src) { task._worker = worker; } const docId = task.docId; + + const params = { + data, + password, + rangeChunkSize, + length, + docBaseUrl, + cMapUrl, + cMapPacked, + CMapReaderFactory, + standardFontDataUrl, + StandardFontDataFactory, + ignoreErrors, + maxImageSize, + isEvalSupported, + isOffscreenCanvasSupported, + disableFontFace, + fontExtraProperties, + enableXfa, + ownerDocument, + disableAutoFetch, + pdfBug, + useSystemFonts, + useWorkerFetch, + styleElement, + }; + worker.promise .then(function () { if (task.destroyed) { throw new Error("Loading aborted"); } - const workerIdPromise = _fetchDocument( - worker, - params, - rangeTransport, - docId - ); + const workerIdPromise = _fetchDocument(worker, params, docId); const networkStreamPromise = new Promise(function (resolve) { let networkStream; if (rangeTransport) { networkStream = new PDFDataTransportStream( { - length: params.length, - initialData: params.initialData, - progressiveDone: params.progressiveDone, - contentDispositionFilename: params.contentDispositionFilename, - disableRange: params.disableRange, - disableStream: params.disableStream, + length, + initialData: rangeTransport.initialData, + progressiveDone: rangeTransport.progressiveDone, + contentDispositionFilename: + rangeTransport.contentDispositionFilename, + disableRange, + disableStream, }, rangeTransport ); - } else if (!params.data) { + } else if (!data) { if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) { throw new Error("Not implemented: createPDFNetworkStream"); } networkStream = createPDFNetworkStream({ - url: params.url, - length: params.length, - httpHeaders: params.httpHeaders, - withCredentials: params.withCredentials, - rangeChunkSize: params.rangeChunkSize, - disableRange: params.disableRange, - disableStream: params.disableStream, + url, + length, + httpHeaders, + withCredentials, + rangeChunkSize, + disableRange, + disableStream, }); } resolve(networkStream); @@ -510,24 +458,15 @@ function getDocument(src) { * * @param {PDFWorker} worker * @param {Object} source - * @param {PDFDataRangeTransport} pdfDataRangeTransport * @param {string} docId - Unique document ID, used in `MessageHandler`. * @returns {Promise} A promise that is resolved when the worker ID of * the `MessageHandler` is known. * @private */ -async function _fetchDocument(worker, source, pdfDataRangeTransport, docId) { +async function _fetchDocument(worker, source, docId) { if (worker.destroyed) { throw new Error("Worker was destroyed"); } - - if (pdfDataRangeTransport) { - source.length = pdfDataRangeTransport.length; - source.initialData = pdfDataRangeTransport.initialData; - source.progressiveDone = pdfDataRangeTransport.progressiveDone; - source.contentDispositionFilename = - pdfDataRangeTransport.contentDispositionFilename; - } const transfers = source.data ? [source.data.buffer] : null; const workerId = await worker.messageHandler.sendWithPromise( @@ -569,6 +508,63 @@ async function _fetchDocument(worker, source, pdfDataRangeTransport, docId) { return workerId; } +function getUrlProp(val) { + if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL")) { + return null; // The 'url' is unused with `PDFDataRangeTransport`. + } else if (val instanceof URL) { + return val.href; + } + try { + // The full path is required in the 'url' field. + return new URL(val, window.location).href; + } catch (ex) { + if ( + typeof PDFJSDev !== "undefined" && + PDFJSDev.test("GENERIC") && + isNodeJS && + typeof val === "string" + ) { + return val; // Use the url as-is in Node.js environments. + } + } + throw new Error( + "Invalid PDF url data: " + + "either string or URL-object is expected in the url property." + ); +} + +function getDataProp(val) { + // Converting string or array-like data to Uint8Array. + if ( + typeof PDFJSDev !== "undefined" && + PDFJSDev.test("GENERIC") && + isNodeJS && + typeof Buffer !== "undefined" && // eslint-disable-line no-undef + val instanceof Buffer // eslint-disable-line no-undef + ) { + return new Uint8Array(val); + } else if ( + val instanceof Uint8Array && + val.byteLength === val.buffer.byteLength + ) { + // Use the data as-is when it's already a Uint8Array that completely + // "utilizes" its underlying ArrayBuffer, to prevent any possible + // issues when transferring it to the worker-thread. + return val; + } else if (typeof val === "string") { + return stringToBytes(val); + } else if ( + (typeof val === "object" && !isNaN(val?.length)) || + isArrayBuffer(val) + ) { + return new Uint8Array(val); + } + throw new Error( + "Invalid PDF binary data: either TypedArray, " + + "string, or array-like object is expected in the data property." + ); +} + /** * @typedef {Object} OnProgressParameters * @property {number} loaded - Currently loaded number of bytes. diff --git a/web/app.js b/web/app.js index 58a3ea613..2cae5f515 100644 --- a/web/app.js +++ b/web/app.js @@ -932,12 +932,10 @@ const PDFViewerApplication = { ) { // The Firefox built-in viewer always calls `setTitleUsingUrl`, before // `initPassiveLoading`, and it never provides an `originalUrl` here. - if (args.originalUrl) { - this.setTitleUsingUrl(args.originalUrl, /* downloadUrl = */ args.url); - delete args.originalUrl; - } else { - this.setTitleUsingUrl(args.url, /* downloadUrl = */ args.url); - } + this.setTitleUsingUrl( + args.originalUrl || args.url, + /* downloadUrl = */ args.url + ); } // Set the necessary API parameters, using all the available options. const apiParams = AppOptions.getAll(OptionKind.API); From ce8ac6d96a46ad6602d61882c29be745e995ebd1 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 29 Jan 2023 18:20:53 +0100 Subject: [PATCH 2/3] Only pass the necessary parameters to `_fetchDocument` and `WorkerTransport` Currently we're passing all available parameters to this function respectively class, despite that not actually being necessary. By splitting the parameters we not only improve the structure, and basically "document" the code a little bit, but we can also simplify the `_fetchDocument` function considerably. --- src/display/api.js | 71 +++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 42 deletions(-) diff --git a/src/display/api.js b/src/display/api.js index 2a84f2972..d9b59e059 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -361,28 +361,45 @@ function getDocument(src) { } const docId = task.docId; - const params = { + const fetchDocParams = { + docId, + apiVersion: + typeof PDFJSDev !== "undefined" && !PDFJSDev.test("TESTING") + ? PDFJSDev.eval("BUNDLE_VERSION") + : null, data, password, + disableAutoFetch, rangeChunkSize, length, docBaseUrl, + enableXfa, + evaluatorOptions: { + maxImageSize, + disableFontFace, + ignoreErrors, + isEvalSupported, + isOffscreenCanvasSupported, + fontExtraProperties, + useSystemFonts, + cMapUrl: useWorkerFetch ? cMapUrl : null, + standardFontDataUrl: useWorkerFetch ? standardFontDataUrl : null, + }, + }; + const transportParams = { cMapUrl, cMapPacked, CMapReaderFactory, standardFontDataUrl, StandardFontDataFactory, ignoreErrors, - maxImageSize, isEvalSupported, - isOffscreenCanvasSupported, disableFontFace, fontExtraProperties, enableXfa, ownerDocument, disableAutoFetch, pdfBug, - useSystemFonts, useWorkerFetch, styleElement, }; @@ -393,7 +410,7 @@ function getDocument(src) { throw new Error("Loading aborted"); } - const workerIdPromise = _fetchDocument(worker, params, docId); + const workerIdPromise = _fetchDocument(worker, fetchDocParams); const networkStreamPromise = new Promise(function (resolve) { let networkStream; if (rangeTransport) { @@ -441,7 +458,7 @@ function getDocument(src) { messageHandler, task, networkStream, - params + transportParams ); task._transport = transport; messageHandler.send("Ready", null); @@ -458,48 +475,18 @@ function getDocument(src) { * * @param {PDFWorker} worker * @param {Object} source - * @param {string} docId - Unique document ID, used in `MessageHandler`. * @returns {Promise} A promise that is resolved when the worker ID of * the `MessageHandler` is known. * @private */ -async function _fetchDocument(worker, source, docId) { +async function _fetchDocument(worker, source) { if (worker.destroyed) { throw new Error("Worker was destroyed"); } - const transfers = source.data ? [source.data.buffer] : null; - const workerId = await worker.messageHandler.sendWithPromise( "GetDocRequest", - // Only send the required properties, and *not* the entire `source` object. - { - docId, - apiVersion: - typeof PDFJSDev !== "undefined" && !PDFJSDev.test("TESTING") - ? PDFJSDev.eval("BUNDLE_VERSION") - : null, - data: source.data, - password: source.password, - disableAutoFetch: source.disableAutoFetch, - rangeChunkSize: source.rangeChunkSize, - length: source.length, - docBaseUrl: source.docBaseUrl, - enableXfa: source.enableXfa, - evaluatorOptions: { - maxImageSize: source.maxImageSize, - disableFontFace: source.disableFontFace, - ignoreErrors: source.ignoreErrors, - isEvalSupported: source.isEvalSupported, - isOffscreenCanvasSupported: source.isOffscreenCanvasSupported, - fontExtraProperties: source.fontExtraProperties, - useSystemFonts: source.useSystemFonts, - cMapUrl: source.useWorkerFetch ? source.cMapUrl : null, - standardFontDataUrl: source.useWorkerFetch - ? source.standardFontDataUrl - : null, - }, - }, - transfers + source, + source.data ? [source.data.buffer] : null ); if (worker.destroyed) { @@ -3075,10 +3062,10 @@ class WorkerTransport { } get loadingParams() { - const params = this._params; + const { disableAutoFetch, enableXfa } = this._params; return shadow(this, "loadingParams", { - disableAutoFetch: params.disableAutoFetch, - enableXfa: params.enableXfa, + disableAutoFetch, + enableXfa, }); } } From 0a0f3fc733c38a94eb89e9ffdbc7f8064ae772f3 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 30 Jan 2023 17:33:00 +0100 Subject: [PATCH 3/3] Move the main-thread CMap/StandardFontData factory initialization to `getDocument` By default we're using worker-thread fetching (in browsers) of this data nowadays, however in Node.js environments or if the user provides custom factories we still fallback to main-thread fetching. Hence it makes sense, as far as I'm concerned, to move this initialization into the `getDocument` function to ensure that the factories can actually be initialized *before* attempting to load the document. Also, this further reduces the amount of `getDocument` parameters that we need to pass into into the `WorkerTransport` class. --- src/display/api.js | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/display/api.js b/src/display/api.js index d9b59e059..df89bcbfa 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -347,6 +347,20 @@ function getDocument(src) { // Set the main-thread verbosity level. setVerbosityLevel(verbosity); + // Ensure that the various factories can be initialized, when necessary, + // since the user may provide *custom* ones. + const transportFactory = useWorkerFetch + ? null + : { + cMapReaderFactory: new CMapReaderFactory({ + baseUrl: cMapUrl, + isCompressed: cMapPacked, + }), + standardFontDataFactory: new StandardFontDataFactory({ + baseUrl: standardFontDataUrl, + }), + }; + if (!worker) { const workerParams = { verbosity, @@ -387,11 +401,6 @@ function getDocument(src) { }, }; const transportParams = { - cMapUrl, - cMapPacked, - CMapReaderFactory, - standardFontDataUrl, - StandardFontDataFactory, ignoreErrors, isEvalSupported, disableFontFace, @@ -400,7 +409,6 @@ function getDocument(src) { ownerDocument, disableAutoFetch, pdfBug, - useWorkerFetch, styleElement, }; @@ -458,7 +466,8 @@ function getDocument(src) { messageHandler, task, networkStream, - transportParams + transportParams, + transportFactory ); task._transport = transport; messageHandler.send("Ready", null); @@ -2326,7 +2335,7 @@ class WorkerTransport { #pagePromises = new Map(); - constructor(messageHandler, loadingTask, networkStream, params) { + constructor(messageHandler, loadingTask, networkStream, params, factory) { this.messageHandler = messageHandler; this.loadingTask = loadingTask; this.commonObjs = new PDFObjects(); @@ -2337,15 +2346,8 @@ class WorkerTransport { }); this._params = params; - if (!params.useWorkerFetch) { - this.CMapReaderFactory = new params.CMapReaderFactory({ - baseUrl: params.cMapUrl, - isCompressed: params.cMapPacked, - }); - this.StandardFontDataFactory = new params.StandardFontDataFactory({ - baseUrl: params.standardFontDataUrl, - }); - } + this.cMapReaderFactory = factory?.cMapReaderFactory; + this.standardFontDataFactory = factory?.standardFontDataFactory; this.destroyed = false; this.destroyCapability = null; @@ -2813,28 +2815,28 @@ class WorkerTransport { if (this.destroyed) { return Promise.reject(new Error("Worker was destroyed.")); } - if (!this.CMapReaderFactory) { + if (!this.cMapReaderFactory) { return Promise.reject( new Error( "CMapReaderFactory not initialized, see the `useWorkerFetch` parameter." ) ); } - return this.CMapReaderFactory.fetch(data); + return this.cMapReaderFactory.fetch(data); }); messageHandler.on("FetchStandardFontData", data => { if (this.destroyed) { return Promise.reject(new Error("Worker was destroyed.")); } - if (!this.StandardFontDataFactory) { + if (!this.standardFontDataFactory) { return Promise.reject( new Error( "StandardFontDataFactory not initialized, see the `useWorkerFetch` parameter." ) ); } - return this.StandardFontDataFactory.fetch(data); + return this.standardFontDataFactory.fetch(data); }); }