mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-20 15:18:08 +02:00
Support search with or without diacritics (bug 1508345, bug 916883, bug 1651113)
- get the original index using a dichotomic search instead of a linear one; - normalize the text using NFD; - convert the query string into a RegExp; - replace whitespace in the query with \s+; - handle hyphens at EOL that are used to break a word; - add some \s* around punctuation signs
This commit is contained in:
parent
70073ed81c
commit
1f41028fcb
12 changed files with 604 additions and 172 deletions
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
|
@ -357,6 +357,7 @@
|
|||
!issue4650.pdf
|
||||
!issue6721_reduced.pdf
|
||||
!issue3025.pdf
|
||||
!french_diacritics.pdf
|
||||
!issue2099-1.pdf
|
||||
!issue3371.pdf
|
||||
!issue2956.pdf
|
||||
|
|
BIN
test/pdfs/french_diacritics.pdf
Normal file
BIN
test/pdfs/french_diacritics.pdf
Normal file
Binary file not shown.
|
@ -86,6 +86,7 @@ function testSearch({
|
|||
entireWord: false,
|
||||
phraseSearch: true,
|
||||
findPrevious: false,
|
||||
matchDiacritics: false,
|
||||
},
|
||||
state
|
||||
);
|
||||
|
@ -264,5 +265,293 @@ describe("pdf_find_controller", function () {
|
|||
pageMatches: [[19, 46, 62]],
|
||||
pageMatchesLength: [[8, 8, 8]],
|
||||
});
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "1/2",
|
||||
},
|
||||
matchesPerPage: [2],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[27, 54]],
|
||||
pageMatchesLength: [[1, 1]],
|
||||
});
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "½",
|
||||
},
|
||||
matchesPerPage: [2],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[27, 54]],
|
||||
pageMatchesLength: [[1, 1]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a normal search, where the text with diacritics is normalized", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController(
|
||||
"french_diacritics.pdf"
|
||||
);
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "a",
|
||||
},
|
||||
matchesPerPage: [6],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[0, 2, 4, 6, 8, 10]],
|
||||
pageMatchesLength: [[1, 1, 1, 1, 1, 1]],
|
||||
});
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "u",
|
||||
},
|
||||
matchesPerPage: [6],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[44, 46, 48, 50, 52, 54]],
|
||||
pageMatchesLength: [[1, 1, 1, 1, 1, 1]],
|
||||
});
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "ë",
|
||||
matchDiacritics: true,
|
||||
},
|
||||
matchesPerPage: [2],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[28, 30]],
|
||||
pageMatchesLength: [[1, 1]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search where one of the results contains an hyphen", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "optimiz",
|
||||
},
|
||||
matchesPerPage: [1, 4, 2, 3, 3, 0, 2, 9, 1, 0, 0, 6, 3, 4],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search where the result is on two lines", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "user experience",
|
||||
},
|
||||
matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[2743]],
|
||||
pageMatchesLength: [[14]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search where the result is on two lines with a punctuation at eol", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "version.the",
|
||||
},
|
||||
matchesPerPage: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
selectedMatch: {
|
||||
pageIndex: 1,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[], [1493]],
|
||||
pageMatchesLength: [[], [11]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search with a minus sign in the query", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "trace-based just-in-time",
|
||||
},
|
||||
matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [
|
||||
[0],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[2087],
|
||||
],
|
||||
pageMatchesLength: [
|
||||
[24],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[24],
|
||||
],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search with square brackets in the query", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "[Programming Languages]",
|
||||
},
|
||||
matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1501]],
|
||||
pageMatchesLength: [[25]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search with parenthesis in the query", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query: "\t (checks)",
|
||||
},
|
||||
matchesPerPage: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
selectedMatch: {
|
||||
pageIndex: 1,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[], [201]],
|
||||
pageMatchesLength: [[], [9]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search with a final dot in the query", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
// The whitespace after the dot mustn't be matched.
|
||||
const query = "complex applications.";
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query,
|
||||
},
|
||||
matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1946]],
|
||||
pageMatchesLength: [[21]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search with a dot in the query and a missing whitespace", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
|
||||
// The whitespace after the dot must be matched.
|
||||
const query = "complex applications.J";
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query,
|
||||
},
|
||||
matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1946]],
|
||||
pageMatchesLength: [[23]],
|
||||
});
|
||||
});
|
||||
|
||||
it("performs a search with a dot followed by a whitespace in the query", async function () {
|
||||
const { eventBus, pdfFindController } = await initPdfFindController();
|
||||
const query = "complex applications. j";
|
||||
|
||||
await testSearch({
|
||||
eventBus,
|
||||
pdfFindController,
|
||||
state: {
|
||||
query,
|
||||
},
|
||||
matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
selectedMatch: {
|
||||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[1946]],
|
||||
pageMatchesLength: [[23]],
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
@ -51,6 +51,11 @@ describe("ui_utils", function () {
|
|||
expect(binarySearchFirstItem([2, 3, 4], isGreater3)).toEqual(2);
|
||||
expect(binarySearchFirstItem([4, 5, 6], isGreater3)).toEqual(0);
|
||||
});
|
||||
it("three numeric entries and a start index", function () {
|
||||
expect(binarySearchFirstItem([0, 1, 2, 3, 4], isGreater3, 2)).toEqual(4);
|
||||
expect(binarySearchFirstItem([2, 3, 4], isGreater3, 2)).toEqual(2);
|
||||
expect(binarySearchFirstItem([4, 5, 6], isGreater3, 1)).toEqual(1);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isValidRotation", function () {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue