Allow specifying custom match logic in PDFFindController

This patch allows embedders of PDF.js to provide custom match logic for seaching in PDFs. This is done by subclassing the PDFFindController class and overriding the `match` method. `match` is called once per PDF page, receives as parameters the search query, the page contents, and the page index, and returns an array of { index, length } objects representing the search results.
2025-04-19 22:58:07 +02:00 · 2024-08-02 18:08:56 +02:00 · 2024-08-02 18:08:56 +02:00 · f051597e23
commit f051597e23
parent b80e552760
2 changed files with 157 additions and 59 deletions
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@ -670,37 +670,6 @@ class PDFFindController {
    return true;
  }

-  #calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
-    const matches = (this._pageMatches[pageIndex] = []);
-    const matchesLength = (this._pageMatchesLength[pageIndex] = []);
-    if (!query) {
-      // The query can be empty because some chars like diacritics could have
-      // been stripped out.
-      return;
-    }
-    const diffs = this._pageDiffs[pageIndex];
-    let match;
-    while ((match = query.exec(pageContent)) !== null) {
-      if (
-        entireWord &&
-        !this.#isEntireWord(pageContent, match.index, match[0].length)
-      ) {
-        continue;
-      }
-
-      const [matchPos, matchLen] = getOriginalIndex(
-        diffs,
-        match.index,
-        match[0].length
-      );
-
-      if (matchLen) {
-        matches.push(matchPos);
-        matchesLength.push(matchLen);
-      }
-    }
-  }
-
  #convertToRegExpString(query, hasDiacritics) {
    const { matchDiacritics } = this.#state;
    let isUnicode = false;
@ -772,12 +741,64 @@ class PDFFindController {
  }

  #calculateMatch(pageIndex) {
-    let query = this.#query;
+    const query = this.#query;
    if (query.length === 0) {
      return; // Do nothing: the matches should be wiped out already.
    }
-    const { caseSensitive, entireWord } = this.#state;
    const pageContent = this._pageContents[pageIndex];
+    const matcherResult = this.match(query, pageContent, pageIndex);
+
+    const matches = (this._pageMatches[pageIndex] = []);
+    const matchesLength = (this._pageMatchesLength[pageIndex] = []);
+    const diffs = this._pageDiffs[pageIndex];
+
+    matcherResult?.forEach(({ index, length }) => {
+      const [matchPos, matchLen] = getOriginalIndex(diffs, index, length);
+      if (matchLen) {
+        matches.push(matchPos);
+        matchesLength.push(matchLen);
+      }
+    });
+
+    // When `highlightAll` is set, ensure that the matches on previously
+    // rendered (and still active) pages are correctly highlighted.
+    if (this.#state.highlightAll) {
+      this.#updatePage(pageIndex);
+    }
+    if (this._resumePageIdx === pageIndex) {
+      this._resumePageIdx = null;
+      this.#nextPageMatch();
+    }
+
+    // Update the match count.
+    const pageMatchesCount = matches.length;
+    this._matchesCountTotal += pageMatchesCount;
+    if (this.#updateMatchesCountOnProgress) {
+      if (pageMatchesCount > 0) {
+        this.#updateUIResultsCount();
+      }
+    } else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
+      // For example, in GeckoView we want to have only the final update because
+      // the Java side provides only one object to update the counts.
+      this.#updateUIResultsCount();
+    }
+  }
+
+  /**
+   * @typedef {Object} FindMatch
+   * @property {number} index - The start of the matched text in the page's
+   *   string contents.
+   * @property {number} length - The length of the matched text.
+   */
+
+  /**
+   * @param {string | string[]} query - The search query.
+   * @param {string} pageContent - The text content of the page to search in.
+   * @param {number} pageIndex - The index of the page that is being processed.
+   * @returns {FindMatch[] | undefined} An array of matches in the provided
+   *   page.
+   */
+  match(query, pageContent, pageIndex) {
    const hasDiacritics = this._hasDiacritics[pageIndex];

    let isUnicode = false;
@ -799,34 +820,28 @@ class PDFFindController {
        })
        .join("|");
    }
+    if (!query) {
+      // The query can be empty because some chars like diacritics could have
+      // been stripped out.
+      return undefined;
+    }

+    const { caseSensitive, entireWord } = this.#state;
    const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
-    query = query ? new RegExp(query, flags) : null;
+    query = new RegExp(query, flags);

-    this.#calculateRegExpMatch(query, entireWord, pageIndex, pageContent);
-
-    // When `highlightAll` is set, ensure that the matches on previously
-    // rendered (and still active) pages are correctly highlighted.
-    if (this.#state.highlightAll) {
-      this.#updatePage(pageIndex);
-    }
-    if (this._resumePageIdx === pageIndex) {
-      this._resumePageIdx = null;
-      this.#nextPageMatch();
-    }
-
-    // Update the match count.
-    const pageMatchesCount = this._pageMatches[pageIndex].length;
-    this._matchesCountTotal += pageMatchesCount;
-    if (this.#updateMatchesCountOnProgress) {
-      if (pageMatchesCount > 0) {
-        this.#updateUIResultsCount();
+    const matches = [];
+    let match;
+    while ((match = query.exec(pageContent)) !== null) {
+      if (
+        entireWord &&
+        !this.#isEntireWord(pageContent, match.index, match[0].length)
+      ) {
+        continue;
      }
-    } else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
-      // For example, in GeckoView we want to have only the final update because
-      // the Java side provides only one object to update the counts.
-      this.#updateUIResultsCount();
+      matches.push({ index: match.index, length: match[0].length });
    }
+    return matches;
  }

  #extractText() {
@ -1103,7 +1118,7 @@ class PDFFindController {
      current += matchIdx + 1;
    }
    // When searching starts, this method may be called before the `pageMatches`
-    // have been counted (in `_calculateMatch`). Ensure that the UI won't show
+    // have been counted (in `#calculateMatch`). Ensure that the UI won't show
    // temporarily broken state when the active find result doesn't make sense.
    if (current < 1 || current > total) {
      current = total = 0;