[api-minor] Clear all caches in XRef.indexObjects, and improve /Root dictionary validation in XRef.parse (issue 14303)

*This patch improves handling of a couple of PDF documents from issue 14303.*

 - Update `XRef.indexObjects` to actually clear *all* XRef-caches. Invalid XRef tables *usually* cause issues early enough during parsing that we've not populated the XRef-cache, however to prevent any issues we obviously need to clear that one as well.
 - Improve the /Root dictionary validation in `XRef.parse` (PR 9827 follow-up). In addition to checking that a /Pages entry exists, we'll now also check that it can be successfully fetched *and* that it's of the correct type. There's really no point trying to use a /Root dictionary that e.g. `Catalog.toplevelPagesDict` will reject, and this way we'll be able to fall back to indexing the objects in corrupt documents.
 - Throw an `InvalidPDFException`, rather than a general `FormatError`, in `XRef.parse` when no usable /Root dictionary could be found. That really seems more appropriate overall, since all attempts at parsing/recovery have failed. (This part of the patch is API-observable, hence the tag.)

With these changes, two existing test-cases are improved and the unit-tests are updated/re-factored to highlight that. In particular `GHOSTSCRIPT-698804-1-fuzzed.pdf` will now both load and "render" correctly, whereas `poppler-395-0-fuzzed.pdf` will now fail immediately upon loading (rather than *appearing* to work).
2025-04-25 09:38:06 +02:00 · 2021-12-02 16:40:31 +01:00 · 2021-12-02 16:40:31 +01:00 · ad3a271fc4
commit ad3a271fc4
parent e9e4b913c0
2 changed files with 66 additions and 43 deletions
--- a/src/core/xref.js
+++ b/src/core/xref.js
@@ -107,14 +107,26 @@ class XRef {
      }
      warn(`XRef.parse - Invalid "Root" reference: "${ex}".`);
    }
-    if (root instanceof Dict && root.has("Pages")) {
-      this.root = root;
-    } else {
-      if (!recoveryMode) {
-        throw new XRefParseException();
+    if (root instanceof Dict) {
+      try {
+        const pages = root.get("Pages");
+        if (pages instanceof Dict) {
+          this.root = root;
+          return;
+        }
+      } catch (ex) {
+        if (ex instanceof MissingDataException) {
+          throw ex;
+        }
+        warn(`XRef.parse - Invalid "Pages" reference: "${ex}".`);
      }
-      throw new FormatError("Invalid root reference");
    }
+
+    if (!recoveryMode) {
+      throw new XRefParseException();
+    }
+    // Even recovery failed, there's nothing more we can do here.
+    throw new InvalidPDFException("Invalid Root reference.");
  }

  processXRefTable(parser) {
@@ -417,6 +429,7 @@ class XRef {

    // Clear out any existing entries, since they may be bogus.
    this.entries.length = 0;
+    this._cacheMap.clear();

    const stream = this.stream;
    stream.pos = 0;