1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-19 06:38:07 +02:00

Merge pull request #19324 from nicolo-ribaudo/search-dash-date

Allow searching for number-number on two lines
This commit is contained in:
Jonas Jenwald 2025-01-15 15:04:09 +01:00 committed by GitHub
commit 711bf2bd12
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 78 additions and 30 deletions

View file

@ -1104,6 +1104,40 @@ describe("pdf_find_controller", function () {
}); });
}); });
// Test case: searching for a query made of digits joined by a dash.
// NOTE(review): per the commit title ("number-number on two lines") the
// matched text is presumably split across a line break in the test PDF —
// confirm against the fixture loaded by initPdfFindController().
it("performs a search with a dash between two digits", async () => {
const { eventBus, pdfFindController } = await initPdfFindController();
await testSearch({
eventBus,
pdfFindController,
state: {
query: "2008-02",
},
// 14 entries; only the last one (index 13) expects a match.
matchesPerPage: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
selectedMatch: {
pageIndex: 13,
matchIndex: 0,
},
// NOTE(review): 314 is presumably the offset of the match within the
// page text — confirm against how testSearch compares pageMatches.
pageMatches: [[], [], [], [], [], [], [], [], [], [], [], [], [], [314]],
// The single expected length (7) equals the character count of the query.
pageMatchesLength: [
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[],
[7],
],
});
});
describe("custom matcher", () => { describe("custom matcher", () => {
it("calls to the matcher with the right arguments", async () => { it("calls to the matcher with the right arguments", async () => {
const QUERY = "Foo bar"; const QUERY = "Foo bar";

View file

@ -117,10 +117,12 @@ function normalize(text) {
} }
} }
const hasSyllables = syllablePositions.length > 0;
let normalizationRegex; let normalizationRegex;
if (syllablePositions.length === 0 && noSyllablesRegExp) { if (!hasSyllables && noSyllablesRegExp) {
normalizationRegex = noSyllablesRegExp; normalizationRegex = noSyllablesRegExp;
} else if (syllablePositions.length > 0 && withSyllablesRegExp) { } else if (hasSyllables && withSyllablesRegExp) {
normalizationRegex = withSyllablesRegExp; normalizationRegex = withSyllablesRegExp;
} else { } else {
// Compile the regular expression for text normalization once. // Compile the regular expression for text normalization once.
@ -131,22 +133,33 @@ function normalize(text) {
// 30A0-30FF: Katakana // 30A0-30FF: Katakana
const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
const HKDiacritics = "(?:\u3099|\u309A)"; const HKDiacritics = "(?:\u3099|\u309A)";
const CompoundWord = "\\p{Ll}-\\n\\p{Lu}"; const BrokenWord = `\\p{Ll}-\\n(?=\\p{Ll})|\\p{Lu}-\\n(?=\\p{L})`;
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(${CompoundWord})|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
if (syllablePositions.length === 0) { const regexps = [
// Most of the syllables belong to Hangul so there are no need /* p1 */ `[${replace}]`,
// to search for them in a non-Hangul document. /* p2 */ `[${toNormalizeWithNFKC}]`,
// We use the \0 in order to have the same number of groups. /* p3 */ `${HKDiacritics}\\n`,
normalizationRegex = noSyllablesRegExp = new RegExp( /* p4 */ "\\p{M}+(?:-\\n)?",
regexp + "|(\\u0000)", /* p5 */ `${BrokenWord}`,
"gum" /* p6 */ "\\S-\\n",
); /* p7 */ `${CJK}\\n`,
/* p8 */ "\\n",
/* p9 */ hasSyllables
? FIRST_CHAR_SYLLABLES_REG_EXP
: // Most of the syllables belong to Hangul so there are no need
// to search for them in a non-Hangul document.
// We use the \0 in order to have the same number of groups.
"\\u0000",
];
normalizationRegex = new RegExp(
regexps.map(r => `(${r})`).join("|"),
"gum"
);
if (hasSyllables) {
withSyllablesRegExp = normalizationRegex;
} else { } else {
normalizationRegex = withSyllablesRegExp = new RegExp( noSyllablesRegExp = normalizationRegex;
regexp + `|(${FIRST_CHAR_SYLLABLES_REG_EXP})`,
"gum"
);
} }
} }
@ -281,26 +294,27 @@ function normalize(text) {
} }
if (p5) { if (p5) {
// Compound word with a line break after the hyphen. // In "X-\ny", "-\n" is removed because an hyphen at the end of a line
// Since the \n isn't in the original text, o = 3 and n = 3. // between two letters is likely here to mark a break in a word.
shiftOrigin += 1;
eol += 1;
return p5.replace("\n", "");
}
if (p6) {
// "X-\n" is removed because an hyphen at the end of a line
// with not a space before is likely here to mark a break
// in a word.
// If X is encoded with UTF-32 then it can have a length greater than 1. // If X is encoded with UTF-32 then it can have a length greater than 1.
// The \n isn't in the original text so here y = i, n = X.len - 2 and // The \n isn't in the original text so here y = i, n = X.len - 2 and
// o = X.len - 1. // o = X.len - 1.
const len = p6.length - 2; const len = p5.length - 2;
positions.push(i - shift + len, 1 + shift); positions.push(i - shift + len, 1 + shift);
shift += 1; shift += 1;
shiftOrigin += 1; shiftOrigin += 1;
eol += 1; eol += 1;
return p6.slice(0, -2); return p5.slice(0, -2);
}
if (p6) {
// A - following a non-space character that is not detected as the
// hyphen breaking a word in two lines needs to be preserved. It could
// be, for example, in a compound word or in a date.
// Only remove the newline.
shiftOrigin += 1;
eol += 1;
return p6.slice(0, -1);
} }
if (p7) { if (p7) {
@ -324,7 +338,7 @@ function normalize(text) {
return " "; return " ";
} }
// p8 // p9
if (i + eol === syllablePositions[syllableIndex]?.[1]) { if (i + eol === syllablePositions[syllableIndex]?.[1]) {
// A syllable (1 char) is replaced with several chars (n) so // A syllable (1 char) is replaced with several chars (n) so
// newCharsLen = n - 1. // newCharsLen = n - 1.