[api-minor] Replace PDFDocumentProxy.getStats with a synchronous PDFDocumentProxy.stats getter

*Please note:* These changes will primarily benefit longer documents, somewhat at the expense of e.g. one-page documents. The existing `PDFDocumentProxy.getStats` function, which in the default viewer is called for each rendered page, requires a round-trip to the worker-thread in order to obtain the current document stats. In the default viewer, we currently make one such API-call for *every rendered* page. This patch proposes replacing that method with a *synchronous* `PDFDocumentProxy.stats` getter instead, combined with re-factoring the worker-thread code by adding a `DocStats`-class to track Stream/Font-types and *only send* them to the main-thread *the first time* that a type is encountered. Note that in practice most PDF documents only use a fairly limited number of Stream/Font-types, which means that in longer documents most of the `PDFDocumentProxy.getStats`-calls will return the same data.[1] This re-factoring will obviously benefit longer documents the most[2], and could actually be seen as a regression for one-page documents, since in practice there'll usually be a couple of "DocStats" messages sent during the parsing of the first page. However, if the user zooms/rotates the document (which causes re-rendering), note that even a one-page document would start to benefit from these changes. Another benefit of having the data available/cached in the API is that unless the document stats change during parsing, repeated `PDFDocumentProxy.stats`-calls will return *the same identical* object. This is something that we can easily take advantage of in the default viewer, by now *only* reporting "documentStats" telemetry[3] when the data has actually changed rather than once per rendered page (again beneficial in longer documents).
--- [1] Furthermore, the maximum number of `StreamType`/`FontType` values is `10` and `12` respectively, which means that regardless of the complexity and page count in a PDF document there'll never be more than twenty-two "DocStats" messages sent; see 41ac3f0c07/src/shared/util.js (L206-L232) [2] One example is the `pdf.pdf` document in the test-suite, where rendering all of its 1310 pages only results in a total of seven "DocStats" messages being sent from the worker-thread. [3] Reporting telemetry, in Firefox, includes using `JSON.stringify` on the data and then sending an event to the `PdfStreamConverter.jsm`-code. In that code the event is handled and `JSON.parse` is used to retrieve the data, and in the "documentStats"-case we'll then iterate through the data to avoid double-reporting telemetry; see https://searchfox.org/mozilla-central/rev/8f4c180b87e52f3345ef8a3432d6e54bd1eb18dc/toolkit/components/pdfjs/content/PdfStreamConverter.jsm#515-549
2025-04-26 01:58:06 +02:00 · 2021-11-11 18:14:26 +01:00 · 2021-11-11 18:14:26 +01:00 · 6da0944fc7
commit 6da0944fc7
parent 41ac3f0c07
10 changed files with 158 additions and 67 deletions
--- a/src/core/core_utils.js
+++ b/src/core/core_utils.js
@ -16,7 +16,9 @@
 import {
  assert,
  BaseException,
+  FontType,
  objectSize,
+  StreamType,
  stringToPDFString,
  warn,
 } from "../shared/util.js";
@ -76,6 +78,55 @@ class XRefParseException extends BaseException {
  }
 }

+class DocStats {
+  constructor(handler) {
+    this._handler = handler;
+
+    this._streamTypes = new Set();
+    this._fontTypes = new Set();
+  }
+
+  _send() {
+    const streamTypes = Object.create(null),
+      fontTypes = Object.create(null);
+    for (const type of this._streamTypes) {
+      streamTypes[type] = true;
+    }
+    for (const type of this._fontTypes) {
+      fontTypes[type] = true;
+    }
+    this._handler.send("DocStats", { streamTypes, fontTypes });
+  }
+
+  addStreamType(type) {
+    if (
+      typeof PDFJSDev === "undefined" ||
+      PDFJSDev.test("!PRODUCTION || TESTING")
+    ) {
+      assert(StreamType[type] === type, 'addStreamType: Invalid "type" value.');
+    }
+    if (this._streamTypes.has(type)) {
+      return;
+    }
+    this._streamTypes.add(type);
+    this._send();
+  }
+
+  addFontType(type) {
+    if (
+      typeof PDFJSDev === "undefined" ||
+      PDFJSDev.test("!PRODUCTION || TESTING")
+    ) {
+      assert(FontType[type] === type, 'addFontType: Invalid "type" value.');
+    }
+    if (this._fontTypes.has(type)) {
+      return;
+    }
+    this._fontTypes.add(type);
+    this._send();
+  }
+}
+
 /**
 * Get the value of an inheritable property.
 *
@ -481,6 +532,7 @@ function recoverJsURL(str) {

 export {
  collectActions,
+  DocStats,
  encodeToXmlString,
  escapePDFName,
  getArrayLookupTableFactory,
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -1248,8 +1248,7 @@ class PartialEvaluator {
    this.translateFont(preEvaluatedFont)
      .then(translatedFont => {
        if (translatedFont.fontType !== undefined) {
-          const xrefFontStats = xref.stats.fontTypes;
-          xrefFontStats[translatedFont.fontType] = true;
+          xref.stats.addFontType(translatedFont.fontType);
        }

        fontCapability.resolve(
@ -1277,8 +1276,9 @@ class PartialEvaluator {
            preEvaluatedFont.type,
            subtype && subtype.name
          );
-          const xrefFontStats = xref.stats.fontTypes;
-          xrefFontStats[fontType] = true;
+          if (fontType !== undefined) {
+            xref.stats.addFontType(fontType);
+          }
        } catch (ex) {}

        fontCapability.resolve(
--- a/src/core/parser.js
+++ b/src/core/parser.js
@ -741,13 +741,13 @@ class Parser {
      warn(`Empty "${name}" stream.`);
      return new NullStream();
    }
+    const xrefStats = this.xref.stats;

    try {
-      const xrefStreamStats = this.xref.stats.streamTypes;
      switch (name) {
        case "Fl":
        case "FlateDecode":
-          xrefStreamStats[StreamType.FLATE] = true;
+          xrefStats.addStreamType(StreamType.FLATE);
          if (params) {
            return new PredictorStream(
              new FlateStream(stream, maybeLength),
@ -758,7 +758,7 @@ class Parser {
          return new FlateStream(stream, maybeLength);
        case "LZW":
        case "LZWDecode":
-          xrefStreamStats[StreamType.LZW] = true;
+          xrefStats.addStreamType(StreamType.LZW);
          let earlyChange = 1;
          if (params) {
            if (params.has("EarlyChange")) {
@ -773,30 +773,30 @@ class Parser {
          return new LZWStream(stream, maybeLength, earlyChange);
        case "DCT":
        case "DCTDecode":
-          xrefStreamStats[StreamType.DCT] = true;
+          xrefStats.addStreamType(StreamType.DCT);
          return new JpegStream(stream, maybeLength, params);
        case "JPX":
        case "JPXDecode":
-          xrefStreamStats[StreamType.JPX] = true;
+          xrefStats.addStreamType(StreamType.JPX);
          return new JpxStream(stream, maybeLength, params);
        case "A85":
        case "ASCII85Decode":
-          xrefStreamStats[StreamType.A85] = true;
+          xrefStats.addStreamType(StreamType.A85);
          return new Ascii85Stream(stream, maybeLength);
        case "AHx":
        case "ASCIIHexDecode":
-          xrefStreamStats[StreamType.AHX] = true;
+          xrefStats.addStreamType(StreamType.AHX);
          return new AsciiHexStream(stream, maybeLength);
        case "CCF":
        case "CCITTFaxDecode":
-          xrefStreamStats[StreamType.CCF] = true;
+          xrefStats.addStreamType(StreamType.CCF);
          return new CCITTFaxStream(stream, maybeLength, params);
        case "RL":
        case "RunLengthDecode":
-          xrefStreamStats[StreamType.RLX] = true;
+          xrefStats.addStreamType(StreamType.RLX);
          return new RunLengthStream(stream, maybeLength);
        case "JBIG2Decode":
-          xrefStreamStats[StreamType.JBIG] = true;
+          xrefStats.addStreamType(StreamType.JBIG);
          return new Jbig2Stream(stream, maybeLength, params);
      }
      warn(`Filter "${name}" is not supported.`);
--- a/src/core/pdf_manager.js
+++ b/src/core/pdf_manager.js
@ -115,12 +115,21 @@ class BasePdfManager {
 }

 class LocalPdfManager extends BasePdfManager {
-  constructor(docId, data, password, evaluatorOptions, enableXfa, docBaseUrl) {
+  constructor(
+    docId,
+    data,
+    password,
+    msgHandler,
+    evaluatorOptions,
+    enableXfa,
+    docBaseUrl
+  ) {
    super();

    this._docId = docId;
    this._password = password;
    this._docBaseUrl = parseDocBaseUrl(docBaseUrl);
+    this.msgHandler = msgHandler;
    this.evaluatorOptions = evaluatorOptions;
    this.enableXfa = enableXfa;

--- a/src/core/worker.js
+++ b/src/core/worker.js
@ -215,6 +215,7 @@ class WorkerMessageHandler {
            docId,
            source.data,
            source.password,
+            handler,
            evaluatorOptions,
            enableXfa,
            docBaseUrl
@ -287,6 +288,7 @@ class WorkerMessageHandler {
            docId,
            pdfFile,
            source.password,
+            handler,
            evaluatorOptions,
            enableXfa,
            docBaseUrl
@ -532,10 +534,6 @@ class WorkerMessageHandler {
      });
    });

-    handler.on("GetStats", function wphSetupGetStats(data) {
-      return pdfManager.ensureXRef("stats");
-    });
-
    handler.on("GetAnnotations", function ({ pageIndex, intent }) {
      return pdfManager.getPage(pageIndex).then(function (page) {
        return page.getAnnotationsData(intent);
--- a/src/core/xref.js
+++ b/src/core/xref.js
@ -30,13 +30,14 @@ import {
  isStream,
  Ref,
 } from "./primitives.js";
-import { Lexer, Parser } from "./parser.js";
 import {
+  DocStats,
  MissingDataException,
  ParserEOFException,
  XRefEntryException,
  XRefParseException,
 } from "./core_utils.js";
+import { Lexer, Parser } from "./parser.js";
 import { CipherTransformFactory } from "./crypto.js";

 class XRef {
@ -46,10 +47,7 @@ class XRef {
    this.entries = [];
    this.xrefstms = Object.create(null);
    this._cacheMap = new Map(); // Prepare the XRef cache.
-    this.stats = {
-      streamTypes: Object.create(null),
-      fontTypes: Object.create(null),
-    };
+    this.stats = new DocStats(pdfManager.msgHandler);
    this._newRefNum = null;
  }