[api-minor] Highlight search results correctly for normalized text (PR 9448)

This patch is a rebased *and* refactored version of PR 9448, updated so that it applies cleanly now that `PDFFindController` has changed since that PR was opened; the original author information is kept intact. The patch ensures that e.g. fractions, and other characters that we normalize before searching, are still highlighted correctly in the textLayer. Furthermore, this patch also adds basic unit-tests for this functionality. *Note:* The `[api-minor]` tag is added, since third-party implementations of the `PDFFindController` must now always use the `pageMatchesLength` property to get accurate length information (see the `web/text_layer_builder.js` changes). Co-authored-by: Ross Johnson <ross@mazira.com> Co-authored-by: Jonas Jenwald <jonas.jenwald@gmail.com>
2025-04-22 16:18:08 +02:00 · 2021-01-12 15:21:19 +01:00 · 2021-01-12 15:21:19 +01:00 · 6dae2677d5
commit 6dae2677d5
parent 1de1ae0be6
6 changed files with 220 additions and 106 deletions
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@ -49,9 +49,40 @@ function normalize(text) {
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
    normalizationRegex = new RegExp(`[${replace}]`, "g");
  }
-  return text.replace(normalizationRegex, function (ch) {
-    return CHARACTERS_TO_NORMALIZE[ch];
+  let diffs = null;
+  const normalizedText = text.replace(normalizationRegex, function (ch, index) {
+    const normalizedCh = CHARACTERS_TO_NORMALIZE[ch],
+      diff = normalizedCh.length - ch.length;
+    if (diff !== 0) {
+      (diffs ||= []).push([index, diff]);
+    }
+    return normalizedCh;
  });
+
+  return [normalizedText, diffs];
+}
+
+// Map `matchIndex`, an index into normalized text, back to the original text so
+// that `textLayer` highlighting is correct for e.g. "½"; this "inverts" the
+// `normalize` function, whose `diffs` are `[originalIndex, lengthDiff]` pairs.
+function getOriginalIndex(matchIndex, diffs = null) {
+  if (!diffs) {
+    return matchIndex; // No length-changing replacements were recorded.
+  }
+  let totalDiff = 0; // Net offset: normalized index minus original index.
+  for (const [index, diff] of diffs) {
+    const currentIndex = index + totalDiff; // Start in normalized text.

+    if (currentIndex >= matchIndex) {
+      break; // Remaining replacements start at/after `matchIndex`.
+    }
+    if (currentIndex + diff > matchIndex) {
+      totalDiff += matchIndex - currentIndex; // Inside the replacement.
+      break;
+    }
+    totalDiff += diff;
+  }
+  return matchIndex - totalDiff;
 }

 /**
@ -215,6 +246,7 @@ class PDFFindController {
    };
    this._extractTextPromises = [];
    this._pageContents = []; // Stores the normalized text for each page.
+    this._pageDiffs = [];
    this._matchesCountTotal = 0;
    this._pagesToSearch = null;
    this._pendingFindMatches = Object.create(null);
@ -232,7 +264,7 @@ class PDFFindController {
  get _query() {
    if (this._state.query !== this._rawQuery) {
      this._rawQuery = this._state.query;
-      this._normalizedQuery = normalize(this._state.query);
+      [this._normalizedQuery] = normalize(this._state.query);
    }
    return this._normalizedQuery;
  }
@ -349,8 +381,9 @@ class PDFFindController {
    return true;
  }

-  _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
-    const matches = [];
+  _calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
+    const matches = [],
+      matchesLength = [];
    const queryLen = query.length;

    let matchIdx = -queryLen;
@ -362,12 +395,19 @@ class PDFFindController {
      if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
        continue;
      }
-      matches.push(matchIdx);
+      const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
+        matchEnd = matchIdx + queryLen - 1,
+        originalQueryLen =
+          getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
+
+      matches.push(originalMatchIdx);
+      matchesLength.push(originalQueryLen);
    }
    this._pageMatches[pageIndex] = matches;
+    this._pageMatchesLength[pageIndex] = matchesLength;
  }

-  _calculateWordMatch(query, pageIndex, pageContent, entireWord) {
+  _calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
    const matchesWithLength = [];

    // Divide the query into pieces and search for text in each piece.
@ -388,10 +428,15 @@ class PDFFindController {
        ) {
          continue;
        }
+        const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
+          matchEnd = matchIdx + subqueryLen - 1,
+          originalQueryLen =
+            getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
+
        // Other searches do not, so we store the length.
        matchesWithLength.push({
-          match: matchIdx,
-          matchLength: subqueryLen,
+          match: originalMatchIdx,
+          matchLength: originalQueryLen,
          skipped: false,
        });
      }
@ -412,6 +457,7 @@ class PDFFindController {

  _calculateMatch(pageIndex) {
    let pageContent = this._pageContents[pageIndex];
+    const pageDiffs = this._pageDiffs[pageIndex];
    let query = this._query;
    const { caseSensitive, entireWord, phraseSearch } = this._state;

@ -426,9 +472,21 @@ class PDFFindController {
    }

    if (phraseSearch) {
-      this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord);
+      this._calculatePhraseMatch(
+        query,
+        pageIndex,
+        pageContent,
+        pageDiffs,
+        entireWord
+      );
    } else {
-      this._calculateWordMatch(query, pageIndex, pageContent, entireWord);
+      this._calculateWordMatch(
+        query,
+        pageIndex,
+        pageContent,
+        pageDiffs,
+        entireWord
+      );
    }

    // When `highlightAll` is set, ensure that the matches on previously
@ -478,7 +536,9 @@ class PDFFindController {
              }

              // Store the normalized page content (text items) as one string.
-              this._pageContents[i] = normalize(strBuf.join(""));
+              [this._pageContents[i], this._pageDiffs[i]] = normalize(
+                strBuf.join("")
+              );
              extractTextCapability.resolve(i);
            },
            reason => {
@ -488,6 +548,7 @@ class PDFFindController {
              );
              // Page error -- assuming no text content.
              this._pageContents[i] = "";
+              this._pageDiffs[i] = null;
              extractTextCapability.resolve(i);
            }
          );
--- a/web/text_layer_builder.js
+++ b/web/text_layer_builder.js
@ -161,12 +161,11 @@ class TextLayerBuilder {
    if (!matches) {
      return [];
    }
-    const { findController, textContentItemsStr } = this;
+    const { textContentItemsStr } = this;

    let i = 0,
      iIndex = 0;
    const end = textContentItemsStr.length - 1;
-    const queryLen = findController.state.query.length;
    const result = [];

    for (let m = 0, mm = matches.length; m < mm; m++) {
@ -191,13 +190,7 @@ class TextLayerBuilder {
      };

      // Calculate the end position.
-      if (matchesLength) {
-        // Multiterm search.
-        matchIdx += matchesLength[m];
-      } else {
-        // Phrase search.
-        matchIdx += queryLen;
-      }
+      matchIdx += matchesLength[m];

      // Somewhat the same array as above, but use > instead of >= to get
      // the end position right.