1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-20 15:18:08 +02:00

Support search with or without diacritics (bug 1508345, bug 916883, bug 1651113)

- get the original index using a dichotomic (binary) search instead of a linear one;
  - normalize the text using NFD;
  - convert the query string into a RegExp;
  - replace whitespaces in the query with \s+;
  - handle hyphens at eol used to break a word;
  - add some \s* around punctuation signs
This commit is contained in:
Calixte Denizet 2021-04-18 23:37:22 +02:00
parent 70073ed81c
commit 1f41028fcb
12 changed files with 604 additions and 172 deletions

View file

@ -357,6 +357,7 @@
!issue4650.pdf
!issue6721_reduced.pdf
!issue3025.pdf
!french_diacritics.pdf
!issue2099-1.pdf
!issue3371.pdf
!issue2956.pdf

Binary file not shown.

View file

@ -86,6 +86,7 @@ function testSearch({
entireWord: false,
phraseSearch: true,
findPrevious: false,
matchDiacritics: false,
},
state
);
@ -264,5 +265,293 @@ describe("pdf_find_controller", function () {
pageMatches: [[19, 46, 62]],
pageMatchesLength: [[8, 8, 8]],
});
await testSearch({
eventBus,
pdfFindController,
state: {
query: "1/2",
},
matchesPerPage: [2],
selectedMatch: {
pageIndex: 0,
matchIndex: 0,
},
pageMatches: [[27, 54]],
pageMatchesLength: [[1, 1]],
});
await testSearch({
eventBus,
pdfFindController,
state: {
query: "½",
},
matchesPerPage: [2],
selectedMatch: {
pageIndex: 0,
matchIndex: 0,
},
pageMatches: [[27, 54]],
pageMatchesLength: [[1, 1]],
});
});
it("performs a normal search, where the text with diacritics is normalized", async function () {
  // This test runs against the dedicated "french_diacritics.pdf" fixture.
  const { eventBus, pdfFindController } = await initPdfFindController(
    "french_diacritics.pdf"
  );

  // One entry per search: the find state to dispatch and the offsets at
  // which matches are expected. Every expected match is one character long.
  // Only the last search sets `matchDiacritics` explicitly.
  const searches = [
    { state: { query: "a" }, offsets: [0, 2, 4, 6, 8, 10] },
    { state: { query: "u" }, offsets: [44, 46, 48, 50, 52, 54] },
    { state: { query: "ë", matchDiacritics: true }, offsets: [28, 30] },
  ];

  // The searches are awaited one after the other, in table order.
  for (const { state, offsets } of searches) {
    await testSearch({
      eventBus,
      pdfFindController,
      state,
      matchesPerPage: [offsets.length],
      selectedMatch: { pageIndex: 0, matchIndex: 0 },
      pageMatches: [offsets],
      pageMatchesLength: [offsets.map(() => 1)],
    });
  }
});
it("performs a search where one of the results contains an hyphen", async function () {
  const controller = await initPdfFindController();

  // Expected number of "optimiz" matches, one counter per page.
  // This test only checks the counts; no match offsets are supplied.
  const expectedCounts = [1, 4, 2, 3, 3, 0, 2, 9, 1, 0, 0, 6, 3, 4];

  await testSearch({
    eventBus: controller.eventBus,
    pdfFindController: controller.pdfFindController,
    state: { query: "optimiz" },
    matchesPerPage: expectedCounts,
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
  });
});
it("performs a search where the result is on two lines", async function () {
  const { eventBus, pdfFindController } = await initPdfFindController();

  // 14 per-page counters; only the first one is expected to be non-zero.
  const matchesPerPage = new Array(14).fill(0);
  matchesPerPage[0] = 1;

  // Expected outcome: a single match at offset 2743 with a length of 14.
  const expectations = {
    matchesPerPage,
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
    pageMatches: [[2743]],
    pageMatchesLength: [[14]],
  };

  await testSearch({
    eventBus,
    pdfFindController,
    state: { query: "user experience" },
    ...expectations,
  });
});
it("performs a search where the result is on two lines with a punctuation at eol", async function () {
  const { eventBus, pdfFindController } = await initPdfFindController();

  // Index of the only entry expected to hold a match, among 14 per-page
  // counters.
  const hitPage = 1;
  const matchesPerPage = Array.from({ length: 14 }, (_, page) =>
    page === hitPage ? 1 : 0
  );

  await testSearch({
    eventBus,
    pdfFindController,
    state: { query: "version.the" },
    matchesPerPage,
    selectedMatch: { pageIndex: hitPage, matchIndex: 0 },
    // No match is expected at index 0, hence the leading empty arrays.
    pageMatches: [[], [1493]],
    pageMatchesLength: [[], [11]],
  });
});
it("performs a search with a minus sign in the query", async function () {
  const { eventBus, pdfFindController } = await initPdfFindController();

  // Expected match offsets for each of the 14 entries: one match in the
  // first entry, one in the last, and none in the 12 entries in between.
  const pageMatches = [
    [0],
    ...Array.from({ length: 12 }, () => []),
    [2087],
  ];

  await testSearch({
    eventBus,
    pdfFindController,
    state: { query: "trace-based just-in-time" },
    // The per-page counts follow directly from the offsets table.
    matchesPerPage: pageMatches.map(offsets => offsets.length),
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
    pageMatches,
    // Every expected match has a length of 24.
    pageMatchesLength: pageMatches.map(offsets => offsets.map(() => 24)),
  });
});
it("performs a search with square brackets in the query", async function () {
  const { eventBus, pdfFindController } = await initPdfFindController();

  // The query contains literal square brackets.
  const state = { query: "[Programming Languages]" };
  const selectedMatch = { pageIndex: 0, matchIndex: 0 };
  // One match in the first slot, none in the 13 remaining ones.
  const matchesPerPage = [1, ...new Array(13).fill(0)];

  await testSearch({
    eventBus,
    pdfFindController,
    state,
    matchesPerPage,
    selectedMatch,
    pageMatches: [[1501]],
    pageMatchesLength: [[25]],
  });
});
it("performs a search with parenthesis in the query", async function () {
  const { eventBus, pdfFindController } = await initPdfFindController();

  // The query starts with a tab and a space ahead of "(checks)".
  const state = { query: "\t (checks)" };

  // A single match is expected, in the second of the 14 per-page slots.
  const expected = {
    matchesPerPage: [0, 1, ...new Array(12).fill(0)],
    selectedMatch: { pageIndex: 1, matchIndex: 0 },
    pageMatches: [[], [201]],
    pageMatchesLength: [[], [9]],
  };

  await testSearch({ eventBus, pdfFindController, state, ...expected });
});
it("performs a search with a final dot in the query", async function () {
  const { eventBus, pdfFindController } = await initPdfFindController();

  await testSearch({
    eventBus,
    pdfFindController,
    state: { query: "complex applications." },
    matchesPerPage: [1, ...new Array(13).fill(0)],
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
    pageMatches: [[1946]],
    // The whitespace after the dot mustn't be matched: the expected length
    // (21) is exactly the length of the query itself.
    pageMatchesLength: [[21]],
  });
});
it("performs a search with a dot in the query and a missing whitespace", async function () {
  const { eventBus, pdfFindController } = await initPdfFindController();

  // The query has no whitespace between the dot and the "J".
  const state = { query: "complex applications.J" };

  // A single match is expected, in the first of the 14 per-page slots.
  const matchesPerPage = Array.from({ length: 14 }, (_, page) =>
    page === 0 ? 1 : 0
  );

  await testSearch({
    eventBus,
    pdfFindController,
    state,
    matchesPerPage,
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
    pageMatches: [[1946]],
    // The whitespace after the dot must be matched: the expected length
    // (23) is one more than the 22 characters of the query.
    pageMatchesLength: [[23]],
  });
});
it("performs a search with a dot followed by a whitespace in the query", async function () {
  const found = await initPdfFindController();

  // Here the query itself has a space after the dot and a lowercase "j".
  // The expected offset and length are 1946 and 23.
  const expectations = {
    matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    selectedMatch: { pageIndex: 0, matchIndex: 0 },
    pageMatches: [[1946]],
    pageMatchesLength: [[23]],
  };

  await testSearch({
    eventBus: found.eventBus,
    pdfFindController: found.pdfFindController,
    state: { query: "complex applications. j" },
    ...expectations,
  });
});
});

View file

@ -51,6 +51,11 @@ describe("ui_utils", function () {
expect(binarySearchFirstItem([2, 3, 4], isGreater3)).toEqual(2);
expect(binarySearchFirstItem([4, 5, 6], isGreater3)).toEqual(0);
});
it("three numeric entries and a start index", function () {
  // Each row holds the input array, the start index passed as the third
  // argument of `binarySearchFirstItem`, and the index expected back.
  const cases = [
    { items: [0, 1, 2, 3, 4], start: 2, expected: 4 },
    { items: [2, 3, 4], start: 2, expected: 2 },
    { items: [4, 5, 6], start: 1, expected: 1 },
  ];

  for (const { items, start, expected } of cases) {
    expect(binarySearchFirstItem(items, isGreater3, start)).toEqual(expected);
  }
});
});
describe("isValidRotation", function () {