1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-19 22:58:07 +02:00

Allow specifying custom match logic in PDFFindController

This patch allows embedders of PDF.js to provide custom match
logic for searching in PDFs. This is done by subclassing the
PDFFindController class and overriding the `match` method.

`match` is called once per PDF page, receives as parameters the
search query, the page contents, and the page index, and returns
an array of { index, length } objects representing the search
results.
This commit is contained in:
Nicolò Ribaudo 2024-08-02 18:08:56 +02:00
parent b80e552760
commit f051597e23
No known key found for this signature in database
GPG key ID: AAFDA9101C58F338
2 changed files with 157 additions and 59 deletions

View file

@ -670,37 +670,6 @@ class PDFFindController {
return true;
}
#calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
const matches = (this._pageMatches[pageIndex] = []);
const matchesLength = (this._pageMatchesLength[pageIndex] = []);
if (!query) {
// The query can be empty because some chars like diacritics could have
// been stripped out.
return;
}
const diffs = this._pageDiffs[pageIndex];
let match;
while ((match = query.exec(pageContent)) !== null) {
if (
entireWord &&
!this.#isEntireWord(pageContent, match.index, match[0].length)
) {
continue;
}
const [matchPos, matchLen] = getOriginalIndex(
diffs,
match.index,
match[0].length
);
if (matchLen) {
matches.push(matchPos);
matchesLength.push(matchLen);
}
}
}
#convertToRegExpString(query, hasDiacritics) {
const { matchDiacritics } = this.#state;
let isUnicode = false;
@ -772,12 +741,64 @@ class PDFFindController {
}
/**
 * Computes and stores the matches for a single page, then refreshes the
 * highlighting and the match counter.
 *
 * The search itself is delegated to `this.match`; every `{ index, length }`
 * it returns is passed through `getOriginalIndex` together with the page's
 * diffs, and the resulting position/length pairs are written to
 * `this._pageMatches[pageIndex]` and `this._pageMatchesLength[pageIndex]`.
 *
 * @param {number} pageIndex - Index of the page whose contents are searched.
 */
#calculateMatch(pageIndex) {
// NOTE(review): `query` is declared twice below (`let`, then `const`) and the
// `caseSensitive`/`entireWord` destructuring is not used anywhere in this
// method. This looks like removed and added diff lines shown together;
// confirm which lines are live.
let query = this.#query;
const query = this.#query;
if (query.length === 0) {
return; // Do nothing: the matches should be wiped out already.
}
const { caseSensitive, entireWord } = this.#state;
const pageContent = this._pageContents[pageIndex];
// Delegate the actual searching to the `match` method.
const matcherResult = this.match(query, pageContent, pageIndex);
// Reset the per-page results before filling them in again.
const matches = (this._pageMatches[pageIndex] = []);
const matchesLength = (this._pageMatchesLength[pageIndex] = []);
const diffs = this._pageDiffs[pageIndex];
// `matcherResult` may be nullish, in which case the page has no matches.
matcherResult?.forEach(({ index, length }) => {
// Presumably maps positions in the searched string back to positions in the
// original page text; confirm against `getOriginalIndex`.
const [matchPos, matchLen] = getOriginalIndex(diffs, index, length);
// Results whose mapped length is falsy are dropped.
if (matchLen) {
matches.push(matchPos);
matchesLength.push(matchLen);
}
});
// When `highlightAll` is set, ensure that the matches on previously
// rendered (and still active) pages are correctly highlighted.
if (this.#state.highlightAll) {
this.#updatePage(pageIndex);
}
// If this is the page recorded in `_resumePageIdx`, clear the marker and
// continue with `#nextPageMatch`.
if (this._resumePageIdx === pageIndex) {
this._resumePageIdx = null;
this.#nextPageMatch();
}
// Update the match count.
const pageMatchesCount = matches.length;
this._matchesCountTotal += pageMatchesCount;
if (this.#updateMatchesCountOnProgress) {
// Progressive mode: only notify the UI when this page added matches.
if (pageMatchesCount > 0) {
this.#updateUIResultsCount();
}
// The visited-pages counter is only incremented on this branch, i.e. when
// progressive updates are disabled.
} else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
// For example, in GeckoView we want to have only the final update because
// the Java side provides only one object to update the counts.
this.#updateUIResultsCount();
}
}
/**
* @typedef {Object} FindMatch
* @property {number} index - The start of the matched text in the page's
* string contents.
* @property {number} length - The length of the matched text.
*/
/**
* @param {string | string[]} query - The search query.
* @param {string} pageContent - The text content of the page to search in.
* @param {number} pageIndex - The index of the page that is being processed.
* @returns {FindMatch[] | undefined} An array of matches in the provided
* page.
*/
match(query, pageContent, pageIndex) {
const hasDiacritics = this._hasDiacritics[pageIndex];
let isUnicode = false;
@ -799,34 +820,28 @@ class PDFFindController {
})
.join("|");
}
if (!query) {
// The query can be empty because some chars like diacritics could have
// been stripped out.
return undefined;
}
const { caseSensitive, entireWord } = this.#state;
const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
query = query ? new RegExp(query, flags) : null;
query = new RegExp(query, flags);
this.#calculateRegExpMatch(query, entireWord, pageIndex, pageContent);
// When `highlightAll` is set, ensure that the matches on previously
// rendered (and still active) pages are correctly highlighted.
if (this.#state.highlightAll) {
this.#updatePage(pageIndex);
}
if (this._resumePageIdx === pageIndex) {
this._resumePageIdx = null;
this.#nextPageMatch();
}
// Update the match count.
const pageMatchesCount = this._pageMatches[pageIndex].length;
this._matchesCountTotal += pageMatchesCount;
if (this.#updateMatchesCountOnProgress) {
if (pageMatchesCount > 0) {
this.#updateUIResultsCount();
const matches = [];
let match;
while ((match = query.exec(pageContent)) !== null) {
if (
entireWord &&
!this.#isEntireWord(pageContent, match.index, match[0].length)
) {
continue;
}
} else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
// For example, in GeckoView we want to have only the final update because
// the Java side provides only one object to update the counts.
this.#updateUIResultsCount();
matches.push({ index: match.index, length: match[0].length });
}
return matches;
}
#extractText() {
@ -1103,7 +1118,7 @@ class PDFFindController {
current += matchIdx + 1;
}
// When searching starts, this method may be called before the `pageMatches`
// have been counted (in `_calculateMatch`). Ensure that the UI won't show
// have been counted (in `#calculateMatch`). Ensure that the UI won't show
// temporarily broken state when the active find result doesn't make sense.
if (current < 1 || current > total) {
current = total = 0;