Send fetch requests for all page dict lookups in parallel

- When adding page dict candidates to the lookup tree, also initiate fetching them from xref, so if they are not yet loaded at all, the XHR will be sent - Only at the top level - assume that if there is a /Pages tree, it is sensibly structured and the number of requests won't be too bad - We can then await on the cached Promise without making the requests pipeline - This has a significant performance improvement for load-on-demand (i.e. with auto-fetch turned off) when a PDF has a large number of pages in the top level /Pages collection, and those pages are spread through a file, so every candidate needs to be fetched separately - PDFs with many pages where each page is a big image and all the pages are at the top level are quite a common output for digitisation programmes - I would have liked to do something like "if it's the top level collection and page count = number of kids, then just fetch that page without traversing the tree" but unfortunately I agree with comments on #8088 that there is no good general solution to allow for /Pages nodes with empty /Kids arrays
2025-04-22 16:18:08 +02:00 · 2024-08-19 15:18:05 +01:00 · 2024-08-19 15:18:05 +01:00 · a67b9aec6c
commit a67b9aec6c
parent b47c7eca83
1 changed files with 18 additions and 3 deletions
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@ -143,6 +143,7 @@ class Catalog {
    this.globalImageCache = new GlobalImageCache();
    this.pageKidsCountCache = new RefSetCache();
    this.pageIndexCache = new RefSetCache();
+    this.pageDictCache = new RefSetCache();
    this.nonBlendModesSet = new RefSet();
    this.systemFontCache = new Map();
  }
@ -1161,6 +1162,7 @@ class Catalog {
    this.globalImageCache.clear(/* onlyData = */ manuallyTriggered);
    this.pageKidsCountCache.clear();
    this.pageIndexCache.clear();
+    this.pageDictCache.clear();
    this.nonBlendModesSet.clear();

    const translatedFonts = await Promise.all(this.fontCache);
@ -1184,7 +1186,8 @@ class Catalog {
    }
    const xref = this.xref,
      pageKidsCountCache = this.pageKidsCountCache,
-      pageIndexCache = this.pageIndexCache;
+      pageIndexCache = this.pageIndexCache,
+      pageDictCache = this.pageDictCache;
    let currentPageIndex = 0;

    while (nodesToVisit.length) {
@ -1203,7 +1206,8 @@ class Catalog {
        }
        visitedNodes.put(currentNode);

-        const obj = await xref.fetchAsync(currentNode);
+        const obj = await (pageDictCache.get(currentNode) ||
+          xref.fetchAsync(currentNode));
        if (obj instanceof Dict) {
          let type = obj.getRaw("Type");
          if (type instanceof Ref) {
@ -1285,7 +1289,18 @@ class Catalog {
      // node further down in the tree (see issue5644.pdf, issue8088.pdf),
      // and to ensure that we actually find the correct `Page` dict.
      for (let last = kids.length - 1; last >= 0; last--) {
-        nodesToVisit.push(kids[last]);
+        const lastKid = kids[last];
+        nodesToVisit.push(lastKid);
+
+        // Launch all requests in parallel so we don't wait for each one in turn
+        // when looking for a page near the end, if all the pages are top level.
+        if (
+          currentNode === this.toplevelPagesDict &&
+          lastKid instanceof Ref &&
+          !pageDictCache.has(lastKid)
+        ) {
+          pageDictCache.put(lastKid, xref.fetchAsync(lastKid));
+        }
      }
    }