Merge pull request #12855 from Snuffleupagus/find-normalization

[api-minor] Highlight search results correctly for normalized text (PR 9448)
2025-04-22 16:18:08 +02:00 · 2021-01-12 22:04:10 +01:00 · 2021-01-12 22:04:10 +01:00 · dcd1589b2c
commit dcd1589b2c
parent 1bcbf69c96 6dae2677d5
6 changed files with 220 additions and 106 deletions
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@@ -49,9 +49,40 @@ function normalize(text) {
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
    normalizationRegex = new RegExp(`[${replace}]`, "g");
  }
-  return text.replace(normalizationRegex, function (ch) {
-    return CHARACTERS_TO_NORMALIZE[ch];
+  let diffs = null;
+  const normalizedText = text.replace(normalizationRegex, function (ch, index) {
+    const normalizedCh = CHARACTERS_TO_NORMALIZE[ch],
+      diff = normalizedCh.length - ch.length;
+    if (diff !== 0) {
+      (diffs ||= []).push([index, diff]);
+    }
+    return normalizedCh;
  });
+
+  return [normalizedText, diffs];
+}
+
+// Determine the original, non-normalized, match index such that highlighting of
+// search results is correct in the `textLayer` for strings containing e.g. "½"
+// characters; essentially "inverting" the result of the `normalize` function.
+function getOriginalIndex(matchIndex, diffs = null) {
+  if (!diffs) {
+    return matchIndex;
+  }
+  let totalDiff = 0;
+  for (const [index, diff] of diffs) {
+    const currentIndex = index + totalDiff;
+
+    if (currentIndex >= matchIndex) {
+      break;
+    }
+    if (currentIndex + diff > matchIndex) {
+      totalDiff += matchIndex - currentIndex;
+      break;
+    }
+    totalDiff += diff;
+  }
+  return matchIndex - totalDiff;
 }

 /**
@@ -215,6 +246,7 @@ class PDFFindController {
    };
    this._extractTextPromises = [];
    this._pageContents = []; // Stores the normalized text for each page.
+    this._pageDiffs = [];
    this._matchesCountTotal = 0;
    this._pagesToSearch = null;
    this._pendingFindMatches = Object.create(null);
@@ -232,7 +264,7 @@ class PDFFindController {
  get _query() {
    if (this._state.query !== this._rawQuery) {
      this._rawQuery = this._state.query;
-      this._normalizedQuery = normalize(this._state.query);
+      [this._normalizedQuery] = normalize(this._state.query);
    }
    return this._normalizedQuery;
  }
@@ -349,8 +381,9 @@ class PDFFindController {
    return true;
  }

-  _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
-    const matches = [];
+  _calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
+    const matches = [],
+      matchesLength = [];
    const queryLen = query.length;

    let matchIdx = -queryLen;
@@ -362,12 +395,19 @@ class PDFFindController {
      if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
        continue;
      }
-      matches.push(matchIdx);
+      const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
+        matchEnd = matchIdx + queryLen - 1,
+        originalQueryLen =
+          getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
+
+      matches.push(originalMatchIdx);
+      matchesLength.push(originalQueryLen);
    }
    this._pageMatches[pageIndex] = matches;
+    this._pageMatchesLength[pageIndex] = matchesLength;
  }

-  _calculateWordMatch(query, pageIndex, pageContent, entireWord) {
+  _calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
    const matchesWithLength = [];

    // Divide the query into pieces and search for text in each piece.
@@ -388,10 +428,15 @@ class PDFFindController {
        ) {
          continue;
        }
+        const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
+          matchEnd = matchIdx + subqueryLen - 1,
+          originalQueryLen =
+            getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
+
        // Other searches do not, so we store the length.
        matchesWithLength.push({
-          match: matchIdx,
-          matchLength: subqueryLen,
+          match: originalMatchIdx,
+          matchLength: originalQueryLen,
          skipped: false,
        });
      }
@@ -412,6 +457,7 @@ class PDFFindController {

  _calculateMatch(pageIndex) {
    let pageContent = this._pageContents[pageIndex];
+    const pageDiffs = this._pageDiffs[pageIndex];
    let query = this._query;
    const { caseSensitive, entireWord, phraseSearch } = this._state;

@@ -426,9 +472,21 @@ class PDFFindController {
    }

    if (phraseSearch) {
-      this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord);
+      this._calculatePhraseMatch(
+        query,
+        pageIndex,
+        pageContent,
+        pageDiffs,
+        entireWord
+      );
    } else {
-      this._calculateWordMatch(query, pageIndex, pageContent, entireWord);
+      this._calculateWordMatch(
+        query,
+        pageIndex,
+        pageContent,
+        pageDiffs,
+        entireWord
+      );
    }

    // When `highlightAll` is set, ensure that the matches on previously
@@ -478,7 +536,9 @@ class PDFFindController {
              }

              // Store the normalized page content (text items) as one string.
-              this._pageContents[i] = normalize(strBuf.join(""));
+              [this._pageContents[i], this._pageDiffs[i]] = normalize(
+                strBuf.join("")
+              );
              extractTextCapability.resolve(i);
            },
            reason => {
@@ -488,6 +548,7 @@ class PDFFindController {
              );
              // Page error -- assuming no text content.
              this._pageContents[i] = "";
+              this._pageDiffs[i] = null;
              extractTextCapability.resolve(i);
            }
          );
--- a/web/text_layer_builder.js
+++ b/web/text_layer_builder.js
@@ -161,12 +161,11 @@ class TextLayerBuilder {
    if (!matches) {
      return [];
    }
-    const { findController, textContentItemsStr } = this;
+    const { textContentItemsStr } = this;

    let i = 0,
      iIndex = 0;
    const end = textContentItemsStr.length - 1;
-    const queryLen = findController.state.query.length;
    const result = [];

    for (let m = 0, mm = matches.length; m < mm; m++) {
@@ -191,13 +190,7 @@ class TextLayerBuilder {
      };

      // Calculate the end position.
-      if (matchesLength) {
-        // Multiterm search.
-        matchIdx += matchesLength[m];
-      } else {
-        // Phrase search.
-        matchIdx += queryLen;
-      }
+      matchIdx += matchesLength[m];

      // Somewhat the same array as above, but use > instead of >= to get
      // the end position right.