mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-19 06:38:07 +02:00
Merge pull request #19324 from nicolo-ribaudo/search-dash-date
Allow searching for number-number on two lines
This commit is contained in:
commit
711bf2bd12
2 changed files with 78 additions and 30 deletions
|
@ -1104,6 +1104,40 @@ describe("pdf_find_controller", function () {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("performs a search with a dash between two digits", async () => {
|
||||||
|
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||||
|
|
||||||
|
await testSearch({
|
||||||
|
eventBus,
|
||||||
|
pdfFindController,
|
||||||
|
state: {
|
||||||
|
query: "2008-02",
|
||||||
|
},
|
||||||
|
matchesPerPage: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
|
||||||
|
selectedMatch: {
|
||||||
|
pageIndex: 13,
|
||||||
|
matchIndex: 0,
|
||||||
|
},
|
||||||
|
pageMatches: [[], [], [], [], [], [], [], [], [], [], [], [], [], [314]],
|
||||||
|
pageMatchesLength: [
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[],
|
||||||
|
[7],
|
||||||
|
],
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
describe("custom matcher", () => {
|
describe("custom matcher", () => {
|
||||||
it("calls to the matcher with the right arguments", async () => {
|
it("calls to the matcher with the right arguments", async () => {
|
||||||
const QUERY = "Foo bar";
|
const QUERY = "Foo bar";
|
||||||
|
|
|
@ -117,10 +117,12 @@ function normalize(text) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const hasSyllables = syllablePositions.length > 0;
|
||||||
|
|
||||||
let normalizationRegex;
|
let normalizationRegex;
|
||||||
if (syllablePositions.length === 0 && noSyllablesRegExp) {
|
if (!hasSyllables && noSyllablesRegExp) {
|
||||||
normalizationRegex = noSyllablesRegExp;
|
normalizationRegex = noSyllablesRegExp;
|
||||||
} else if (syllablePositions.length > 0 && withSyllablesRegExp) {
|
} else if (hasSyllables && withSyllablesRegExp) {
|
||||||
normalizationRegex = withSyllablesRegExp;
|
normalizationRegex = withSyllablesRegExp;
|
||||||
} else {
|
} else {
|
||||||
// Compile the regular expression for text normalization once.
|
// Compile the regular expression for text normalization once.
|
||||||
|
@ -131,22 +133,33 @@ function normalize(text) {
|
||||||
// 30A0-30FF: Katakana
|
// 30A0-30FF: Katakana
|
||||||
const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
|
const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
|
||||||
const HKDiacritics = "(?:\u3099|\u309A)";
|
const HKDiacritics = "(?:\u3099|\u309A)";
|
||||||
const CompoundWord = "\\p{Ll}-\\n\\p{Lu}";
|
const BrokenWord = `\\p{Ll}-\\n(?=\\p{Ll})|\\p{Lu}-\\n(?=\\p{L})`;
|
||||||
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(${CompoundWord})|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
|
|
||||||
|
|
||||||
if (syllablePositions.length === 0) {
|
const regexps = [
|
||||||
// Most of the syllables belong to Hangul so there are no need
|
/* p1 */ `[${replace}]`,
|
||||||
// to search for them in a non-Hangul document.
|
/* p2 */ `[${toNormalizeWithNFKC}]`,
|
||||||
// We use the \0 in order to have the same number of groups.
|
/* p3 */ `${HKDiacritics}\\n`,
|
||||||
normalizationRegex = noSyllablesRegExp = new RegExp(
|
/* p4 */ "\\p{M}+(?:-\\n)?",
|
||||||
regexp + "|(\\u0000)",
|
/* p5 */ `${BrokenWord}`,
|
||||||
"gum"
|
/* p6 */ "\\S-\\n",
|
||||||
);
|
/* p7 */ `${CJK}\\n`,
|
||||||
|
/* p8 */ "\\n",
|
||||||
|
/* p9 */ hasSyllables
|
||||||
|
? FIRST_CHAR_SYLLABLES_REG_EXP
|
||||||
|
: // Most of the syllables belong to Hangul so there is no need
|
||||||
|
// to search for them in a non-Hangul document.
|
||||||
|
// We use the \0 in order to have the same number of groups.
|
||||||
|
"\\u0000",
|
||||||
|
];
|
||||||
|
normalizationRegex = new RegExp(
|
||||||
|
regexps.map(r => `(${r})`).join("|"),
|
||||||
|
"gum"
|
||||||
|
);
|
||||||
|
|
||||||
|
if (hasSyllables) {
|
||||||
|
withSyllablesRegExp = normalizationRegex;
|
||||||
} else {
|
} else {
|
||||||
normalizationRegex = withSyllablesRegExp = new RegExp(
|
noSyllablesRegExp = normalizationRegex;
|
||||||
regexp + `|(${FIRST_CHAR_SYLLABLES_REG_EXP})`,
|
|
||||||
"gum"
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -281,26 +294,27 @@ function normalize(text) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p5) {
|
if (p5) {
|
||||||
// Compound word with a line break after the hyphen.
|
// In "X-\ny", "-\n" is removed because a hyphen at the end of a line
|
||||||
// Since the \n isn't in the original text, o = 3 and n = 3.
|
// between two letters is likely here to mark a break in a word.
|
||||||
shiftOrigin += 1;
|
|
||||||
eol += 1;
|
|
||||||
return p5.replace("\n", "");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p6) {
|
|
||||||
// "X-\n" is removed because an hyphen at the end of a line
|
|
||||||
// with not a space before is likely here to mark a break
|
|
||||||
// in a word.
|
|
||||||
// If X is encoded with UTF-32 then it can have a length greater than 1.
|
// If X is encoded with UTF-32 then it can have a length greater than 1.
|
||||||
// The \n isn't in the original text so here y = i, n = X.len - 2 and
|
// The \n isn't in the original text so here y = i, n = X.len - 2 and
|
||||||
// o = X.len - 1.
|
// o = X.len - 1.
|
||||||
const len = p6.length - 2;
|
const len = p5.length - 2;
|
||||||
positions.push(i - shift + len, 1 + shift);
|
positions.push(i - shift + len, 1 + shift);
|
||||||
shift += 1;
|
shift += 1;
|
||||||
shiftOrigin += 1;
|
shiftOrigin += 1;
|
||||||
eol += 1;
|
eol += 1;
|
||||||
return p6.slice(0, -2);
|
return p5.slice(0, -2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p6) {
|
||||||
|
// A - following a non-space character that is not detected as the
|
||||||
|
// hyphen breaking a word in two lines needs to be preserved. It could
|
||||||
|
// be, for example, in a compound word or in a date.
|
||||||
|
// Only remove the newline.
|
||||||
|
shiftOrigin += 1;
|
||||||
|
eol += 1;
|
||||||
|
return p6.slice(0, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p7) {
|
if (p7) {
|
||||||
|
@ -324,7 +338,7 @@ function normalize(text) {
|
||||||
return " ";
|
return " ";
|
||||||
}
|
}
|
||||||
|
|
||||||
// p8
|
// p9
|
||||||
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
|
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
|
||||||
// A syllable (1 char) is replaced with several chars (n) so
|
// A syllable (1 char) is replaced with several chars (n) so
|
||||||
// newCharsLen = n - 1.
|
// newCharsLen = n - 1.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue