Merge pull request #13105 from Snuffleupagus/BasePdfManager-parseDocBaseUrl

Improve memory usage around the `BasePdfManager.docBaseUrl` parameter (PR 7689 follow-up)
2025-04-22 16:18:08 +02:00 · 2021-03-19 23:03:20 +01:00 · 2021-03-19 23:03:20 +01:00 · 8269ddbd16
commit 8269ddbd16
parent e7e0ecf9c8 c4c7216171
10 changed files with 251 additions and 244 deletions
--- a/src/core/pdf_manager.js
+++ b/src/core/pdf_manager.js
@ -13,17 +13,23 @@
 * limitations under the License.
 */

-import {
-  createValidAbsoluteUrl,
-  shadow,
-  unreachable,
-  warn,
-} from "../shared/util.js";
+import { createValidAbsoluteUrl, unreachable, warn } from "../shared/util.js";
 import { ChunkedStreamManager } from "./chunked_stream.js";
 import { MissingDataException } from "./core_utils.js";
 import { PDFDocument } from "./document.js";
 import { Stream } from "./stream.js";

+/**
+ * Converts a caller-supplied document base URL into its absolute string form.
+ *
+ * @param {string} [url] - The candidate base URL; any falsy value is treated
+ *   as "no base URL".
+ * @returns {string|null} The `href` of the object returned by
+ *   `createValidAbsoluteUrl`, or `null` when `url` is falsy or when
+ *   `createValidAbsoluteUrl` rejects it (a warning is logged in that case).
+ */
+function parseDocBaseUrl(url) {
+  if (!url) {
+    return null;
+  }
+  const parsedUrl = createValidAbsoluteUrl(url);
+  if (!parsedUrl) {
+    warn(`Invalid absolute docBaseUrl: "${url}".`);
+    return null;
+  }
+  return parsedUrl.href;
+}
+
 class BasePdfManager {
  constructor() {
    if (this.constructor === BasePdfManager) {
@ -40,16 +46,7 @@ class BasePdfManager {
  }

  get docBaseUrl() {
-    let docBaseUrl = null;
-    if (this._docBaseUrl) {
-      const absoluteUrl = createValidAbsoluteUrl(this._docBaseUrl);
-      if (absoluteUrl) {
-        docBaseUrl = absoluteUrl.href;
-      } else {
-        warn(`Invalid absolute docBaseUrl: "${this._docBaseUrl}".`);
-      }
-    }
-    return shadow(this, "docBaseUrl", docBaseUrl);
+    // The subclasses visible in this diff store the result of
+    // `parseDocBaseUrl(...)` in `_docBaseUrl`, so it is returned as-is here.
+    return this._docBaseUrl;
  }

  onLoadedStream() {
@ -111,7 +108,7 @@ class LocalPdfManager extends BasePdfManager {

    this._docId = docId;
    this._password = password;
-    this._docBaseUrl = docBaseUrl;
+    this._docBaseUrl = parseDocBaseUrl(docBaseUrl);
    this.evaluatorOptions = evaluatorOptions;
    this.enableXfa = enableXfa;

@ -154,7 +151,7 @@ class NetworkPdfManager extends BasePdfManager {

    this._docId = docId;
    this._password = args.password;
-    this._docBaseUrl = docBaseUrl;
+    this._docBaseUrl = parseDocBaseUrl(docBaseUrl);
    this.msgHandler = args.msgHandler;
    this.evaluatorOptions = evaluatorOptions;
    this.enableXfa = enableXfa;
--- a/src/display/api.js
+++ b/src/display/api.js
@ -40,6 +40,7 @@ import {
  deprecated,
  DOMCanvasFactory,
  DOMCMapReaderFactory,
+  isDataScheme,
  loadScript,
  PageViewport,
  RenderingCancelledException,
@ -288,6 +289,15 @@ function getDocument(src) {
  params.pdfBug = params.pdfBug === true;
  params.enableXfa = params.enableXfa === true;

+  if (
+    typeof params.docBaseUrl !== "string" ||
+    isDataScheme(params.docBaseUrl)
+  ) {
+    // Ignore "data:"-URLs, since they can't be used to recover valid absolute
+    // URLs anyway. We want to avoid sending them to the worker-thread, since
+    // they contain the *entire* PDF document and can thus be arbitrarily long.
+    params.docBaseUrl = null;
+  }
  if (!Number.isInteger(params.maxImageSize)) {
    params.maxImageSize = -1;
  }
--- a/src/display/display_utils.js
+++ b/src/display/display_utils.js
@ -451,13 +451,23 @@ function addLinkAttributes(link, { url, target, rel, enabled = true } = {}) {
  link.rel = typeof rel === "string" ? rel : DEFAULT_LINK_REL;
 }

+/**
+ * Checks whether a URL uses the "data:" scheme, ignoring leading whitespace
+ * and letter case.
+ *
+ * Only the leading whitespace and the next five characters are inspected, so
+ * the cost does not grow with the length of the URL payload.
+ *
+ * @param {string} url
+ * @returns {boolean}
+ */
+function isDataScheme(url) {
+  const length = url.length;
+  let start = 0;
+  // Advance past every leading character that `trim()` reduces to nothing.
+  for (; start < length; start++) {
+    if (url[start].trim() !== "") {
+      break;
+    }
+  }
+  const scheme = url.substring(start, start + 5);
+  return scheme.toLowerCase() === "data:";
+}
+
 /**
  * Tells whether a filename ends with the ".pdf" extension (case-insensitive).
  *
  * @param {*} filename - Non-string values are never considered PDF files.
  * @returns {boolean}
  */
 function isPdfFile(filename) {
   if (typeof filename !== "string") {
     return false;
   }
   return /\.pdf$/i.test(filename);
 }

 /**
- * Gets the file name from a given URL.
+ * Gets the filename from a given URL.
 * @param {string} url
+ * @returns {string}
 */
 function getFilenameFromUrl(url) {
  const anchor = url.indexOf("#");
@ -469,6 +479,48 @@ function getFilenameFromUrl(url) {
  return url.substring(url.lastIndexOf("/", end) + 1, end);
 }

+/**
+ * Returns the filename or guessed filename from the url (see issue 3455).
+ *
+ * The path, query and fragment of the URL are searched, in that order, for
+ * the last `NAME.pdf` occurrence; a percent-encoded match is decoded when
+ * possible.
+ *
+ * @param {string} url - The original PDF location.
+ * @param {string} defaultFilename - The value returned if the filename is
+ *   unknown, the protocol is unsupported ("data:"-URLs are ignored), or the
+ *   URL cannot be split into its components.
+ * @returns {string} Guessed PDF filename.
+ */
+function getPdfFilenameFromUrl(url, defaultFilename = "document.pdf") {
+  if (typeof url !== "string") {
+    return defaultFilename;
+  }
+  if (isDataScheme(url)) {
+    warn('getPdfFilenameFromUrl: ignore "data:"-URL for performance reasons.');
+    return defaultFilename;
+  }
+  const reURI = /^(?:(?:[^:]+:)?\/\/[^/]+)?([^?#]*)(\?[^#]*)?(#.*)?$/;
+  //              SCHEME        HOST        1.PATH  2.QUERY   3.REF
+  // Pattern to get last matching NAME.pdf
+  const reFilename = /[^/?#=]+\.pdf\b(?!.*\.pdf\b)/i;
+  const splitURI = reURI.exec(url);
+  if (!splitURI) {
+    // `reURI` cannot match when the fragment contains a line terminator,
+    // since `.` does not match those; fall back rather than throwing.
+    return defaultFilename;
+  }
+  let suggestedFilename =
+    reFilename.exec(splitURI[1]) ||
+    reFilename.exec(splitURI[2]) ||
+    reFilename.exec(splitURI[3]);
+  if (suggestedFilename) {
+    suggestedFilename = suggestedFilename[0];
+    if (suggestedFilename.includes("%")) {
+      // URL-encoded %2Fpath%2Fto%2Ffile.pdf should be file.pdf
+      try {
+        const decodedMatch = reFilename.exec(
+          decodeURIComponent(suggestedFilename)
+        );
+        // Keep the encoded name when decoding leaves no NAME.pdf,
+        // e.g. for "%2F.pdf".
+        if (decodedMatch) {
+          suggestedFilename = decodedMatch[0];
+        }
+      } catch (ex) {
+        // Possible (extremely rare) error:
+        // URIError "Malformed URI", e.g. for "%AA.pdf"; the encoded name is
+        // deliberately kept in that case.
+      }
+    }
+  }
+  return suggestedFilename || defaultFilename;
+}
+
 class StatTimer {
  constructor() {
    this.started = Object.create(null);
@ -655,6 +707,8 @@ export {
  DOMCMapReaderFactory,
  DOMSVGFactory,
  getFilenameFromUrl,
+  getPdfFilenameFromUrl,
+  isDataScheme,
  isFetchSupported,
  isPdfFile,
  isValidFetchUrl,
--- a/src/pdf.js
+++ b/src/pdf.js
@ -17,6 +17,7 @@
 import {
  addLinkAttributes,
  getFilenameFromUrl,
+  getPdfFilenameFromUrl,
  isFetchSupported,
  isPdfFile,
  isValidFetchUrl,
@ -130,6 +131,7 @@ export {
  // From "./display/display_utils.js":
  addLinkAttributes,
  getFilenameFromUrl,
+  getPdfFilenameFromUrl,
  isPdfFile,
  LinkTarget,
  loadScript,