mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
Merge pull request #12855 from Snuffleupagus/find-normalization
[api-minor] Highlight search results correctly for normalized text (PR 9448)
This commit is contained in:
commit
dcd1589b2c
6 changed files with 220 additions and 106 deletions
|
@ -49,9 +49,40 @@ function normalize(text) {
|
|||
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
||||
normalizationRegex = new RegExp(`[${replace}]`, "g");
|
||||
}
|
||||
return text.replace(normalizationRegex, function (ch) {
|
||||
return CHARACTERS_TO_NORMALIZE[ch];
|
||||
let diffs = null;
|
||||
const normalizedText = text.replace(normalizationRegex, function (ch, index) {
|
||||
const normalizedCh = CHARACTERS_TO_NORMALIZE[ch],
|
||||
diff = normalizedCh.length - ch.length;
|
||||
if (diff !== 0) {
|
||||
(diffs ||= []).push([index, diff]);
|
||||
}
|
||||
return normalizedCh;
|
||||
});
|
||||
|
||||
return [normalizedText, diffs];
|
||||
}
|
||||
|
||||
/**
 * Determine the original, non-normalized, match index such that highlighting
 * of search results is correct in the `textLayer` for strings containing e.g.
 * "½" characters; essentially "inverting" the result of the `normalize`
 * function.
 *
 * @param {number} matchIndex - Index of a character in the normalized text.
 * @param {Array<Array<number>>|null} [diffs] - The `[index, diff]` pairs
 *   collected by `normalize`, in order of appearance: `index` is the offset
 *   of a replaced character in the original text and `diff` is how much
 *   longer the replacement is than that character. A falsy value means that
 *   normalization did not change any lengths.
 * @returns {number} The index in the original text that corresponds to
 *   `matchIndex`.
 */
function getOriginalIndex(matchIndex, diffs = null) {
  if (!diffs) {
    // No length-changing replacements: both texts share the same indices.
    return matchIndex;
  }
  let totalDiff = 0;
  for (const [index, diff] of diffs) {
    // Where this replacement starts in the *normalized* text, i.e. its
    // original offset shifted by all the replacements before it.
    const currentIndex = index + totalDiff;

    if (currentIndex >= matchIndex) {
      // This replacement (and every later one) starts at or after the
      // match position, so it cannot shift it.
      break;
    }
    if (currentIndex + diff > matchIndex) {
      // The match position lies inside the expanded replacement; map it
      // back onto the single original character it came from.
      totalDiff += matchIndex - currentIndex;
      break;
    }
    totalDiff += diff;
  }
  return matchIndex - totalDiff;
}
|
||||
|
||||
/**
|
||||
|
@ -215,6 +246,7 @@ class PDFFindController {
|
|||
};
|
||||
this._extractTextPromises = [];
|
||||
this._pageContents = []; // Stores the normalized text for each page.
|
||||
this._pageDiffs = [];
|
||||
this._matchesCountTotal = 0;
|
||||
this._pagesToSearch = null;
|
||||
this._pendingFindMatches = Object.create(null);
|
||||
|
@ -232,7 +264,7 @@ class PDFFindController {
|
|||
get _query() {
|
||||
if (this._state.query !== this._rawQuery) {
|
||||
this._rawQuery = this._state.query;
|
||||
this._normalizedQuery = normalize(this._state.query);
|
||||
[this._normalizedQuery] = normalize(this._state.query);
|
||||
}
|
||||
return this._normalizedQuery;
|
||||
}
|
||||
|
@ -349,8 +381,9 @@ class PDFFindController {
|
|||
return true;
|
||||
}
|
||||
|
||||
_calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
|
||||
const matches = [];
|
||||
_calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
|
||||
const matches = [],
|
||||
matchesLength = [];
|
||||
const queryLen = query.length;
|
||||
|
||||
let matchIdx = -queryLen;
|
||||
|
@ -362,12 +395,19 @@ class PDFFindController {
|
|||
if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
|
||||
continue;
|
||||
}
|
||||
matches.push(matchIdx);
|
||||
const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
|
||||
matchEnd = matchIdx + queryLen - 1,
|
||||
originalQueryLen =
|
||||
getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
|
||||
|
||||
matches.push(originalMatchIdx);
|
||||
matchesLength.push(originalQueryLen);
|
||||
}
|
||||
this._pageMatches[pageIndex] = matches;
|
||||
this._pageMatchesLength[pageIndex] = matchesLength;
|
||||
}
|
||||
|
||||
_calculateWordMatch(query, pageIndex, pageContent, entireWord) {
|
||||
_calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
|
||||
const matchesWithLength = [];
|
||||
|
||||
// Divide the query into pieces and search for text in each piece.
|
||||
|
@ -388,10 +428,15 @@ class PDFFindController {
|
|||
) {
|
||||
continue;
|
||||
}
|
||||
const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
|
||||
matchEnd = matchIdx + subqueryLen - 1,
|
||||
originalQueryLen =
|
||||
getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
|
||||
|
||||
// Other searches do not, so we store the length.
|
||||
matchesWithLength.push({
|
||||
match: matchIdx,
|
||||
matchLength: subqueryLen,
|
||||
match: originalMatchIdx,
|
||||
matchLength: originalQueryLen,
|
||||
skipped: false,
|
||||
});
|
||||
}
|
||||
|
@ -412,6 +457,7 @@ class PDFFindController {
|
|||
|
||||
_calculateMatch(pageIndex) {
|
||||
let pageContent = this._pageContents[pageIndex];
|
||||
const pageDiffs = this._pageDiffs[pageIndex];
|
||||
let query = this._query;
|
||||
const { caseSensitive, entireWord, phraseSearch } = this._state;
|
||||
|
||||
|
@ -426,9 +472,21 @@ class PDFFindController {
|
|||
}
|
||||
|
||||
if (phraseSearch) {
|
||||
this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord);
|
||||
this._calculatePhraseMatch(
|
||||
query,
|
||||
pageIndex,
|
||||
pageContent,
|
||||
pageDiffs,
|
||||
entireWord
|
||||
);
|
||||
} else {
|
||||
this._calculateWordMatch(query, pageIndex, pageContent, entireWord);
|
||||
this._calculateWordMatch(
|
||||
query,
|
||||
pageIndex,
|
||||
pageContent,
|
||||
pageDiffs,
|
||||
entireWord
|
||||
);
|
||||
}
|
||||
|
||||
// When `highlightAll` is set, ensure that the matches on previously
|
||||
|
@ -478,7 +536,9 @@ class PDFFindController {
|
|||
}
|
||||
|
||||
// Store the normalized page content (text items) as one string.
|
||||
this._pageContents[i] = normalize(strBuf.join(""));
|
||||
[this._pageContents[i], this._pageDiffs[i]] = normalize(
|
||||
strBuf.join("")
|
||||
);
|
||||
extractTextCapability.resolve(i);
|
||||
},
|
||||
reason => {
|
||||
|
@ -488,6 +548,7 @@ class PDFFindController {
|
|||
);
|
||||
// Page error -- assuming no text content.
|
||||
this._pageContents[i] = "";
|
||||
this._pageDiffs[i] = null;
|
||||
extractTextCapability.resolve(i);
|
||||
}
|
||||
);
|
||||
|
|
|
@ -161,12 +161,11 @@ class TextLayerBuilder {
|
|||
if (!matches) {
|
||||
return [];
|
||||
}
|
||||
const { findController, textContentItemsStr } = this;
|
||||
const { textContentItemsStr } = this;
|
||||
|
||||
let i = 0,
|
||||
iIndex = 0;
|
||||
const end = textContentItemsStr.length - 1;
|
||||
const queryLen = findController.state.query.length;
|
||||
const result = [];
|
||||
|
||||
for (let m = 0, mm = matches.length; m < mm; m++) {
|
||||
|
@ -191,13 +190,7 @@ class TextLayerBuilder {
|
|||
};
|
||||
|
||||
// Calculate the end position.
|
||||
if (matchesLength) {
|
||||
// Multiterm search.
|
||||
matchIdx += matchesLength[m];
|
||||
} else {
|
||||
// Phrase search.
|
||||
matchIdx += queryLen;
|
||||
}
|
||||
matchIdx += matchesLength[m];
|
||||
|
||||
// Somewhat the same array as above, but use > instead of >= to get
|
||||
// the end position right.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue