mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
Fix issues in text selection
- PR #13257 fixed a lot of issues, but not all of them; this patch aims to fix almost all of the remaining ones. - The idea in this new patch is to compare the position of a new glyph with the last position where a glyph was drawn; - spaces are never "drawn": a space just moves the cursor and is not added to the chunk; - this way, a space followed by a cursor move can be treated as a single space, which helps to merge all spaces into one. - To distinguish real spaces from tracking ones, we used a factor of the space width (from the font); - it was a pretty good idea in general, but it fails with some fonts where the space is too big; - Poppler uses a factor of the font size instead, which is an excellent idea (<= 0.1 * fontSize implies a tracking space).
This commit is contained in:
parent
f5b79be0b7
commit
61d1063276
10 changed files with 361 additions and 143 deletions
5
test/pdfs/.gitignore
vendored
5
test/pdfs/.gitignore
vendored
|
@ -13,6 +13,7 @@
|
|||
!issue1155r.pdf
|
||||
!issue2017r.pdf
|
||||
!bug1727053.pdf
|
||||
!issue11913.pdf
|
||||
!issue2391-1.pdf
|
||||
!issue2391-2.pdf
|
||||
!issue14046.pdf
|
||||
|
@ -182,6 +183,7 @@
|
|||
!issue11931.pdf
|
||||
!issue1655r.pdf
|
||||
!issue6541.pdf
|
||||
!issue10640.pdf
|
||||
!issue2948.pdf
|
||||
!issue6231_1.pdf
|
||||
!issue10402.pdf
|
||||
|
@ -285,6 +287,7 @@
|
|||
!issue2840.pdf
|
||||
!issue4061.pdf
|
||||
!issue4668.pdf
|
||||
!issue13226.pdf
|
||||
!PDFJS-7562-reduced.pdf
|
||||
!issue11768_reduced.pdf
|
||||
!issue5039.pdf
|
||||
|
@ -440,6 +443,7 @@
|
|||
!annotation-fileattachment.pdf
|
||||
!annotation-text-widget.pdf
|
||||
!annotation-choice-widget.pdf
|
||||
!issue10900.pdf
|
||||
!annotation-button-widget.pdf
|
||||
!annotation-polyline-polygon.pdf
|
||||
!annotation-polyline-polygon-without-appearance.pdf
|
||||
|
@ -462,6 +466,7 @@
|
|||
!issue9972-3.pdf
|
||||
!tiling-pattern-box.pdf
|
||||
!tiling-pattern-large-steps.pdf
|
||||
!issue13201.pdf
|
||||
!issue11555.pdf
|
||||
!issue12337.pdf
|
||||
!pr12564.pdf
|
||||
|
|
BIN
test/pdfs/issue10640.pdf
Normal file
BIN
test/pdfs/issue10640.pdf
Normal file
Binary file not shown.
BIN
test/pdfs/issue10900.pdf
Normal file
BIN
test/pdfs/issue10900.pdf
Normal file
Binary file not shown.
BIN
test/pdfs/issue11913.pdf
Normal file
BIN
test/pdfs/issue11913.pdf
Normal file
Binary file not shown.
BIN
test/pdfs/issue13201.pdf
Normal file
BIN
test/pdfs/issue13201.pdf
Normal file
Binary file not shown.
86
test/pdfs/issue13226.pdf
Normal file
86
test/pdfs/issue13226.pdf
Normal file
|
@ -0,0 +1,86 @@
|
|||
%PDF-1.7
|
||||
%âãÏÓ
|
||||
1 0 obj
|
||||
<<
|
||||
/Type /Encoding
|
||||
/BaseEncoding /WinAnsiEncoding
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/Pages 3 0 R
|
||||
/Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/MediaBox [0 0 400 50]
|
||||
/Kids [4 0 R]
|
||||
/Count 1
|
||||
/Type /Pages
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Parent 3 0 R
|
||||
/MediaBox [0 0 400 50]
|
||||
/Resources
|
||||
<<
|
||||
/Font
|
||||
<<
|
||||
/F1 5 0 R
|
||||
>>
|
||||
>>
|
||||
/Contents 6 0 R
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/BaseFont /Times-Italic
|
||||
/Subtype /Type1
|
||||
/Encoding 1 0 R
|
||||
/Type /Font
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Length 278
|
||||
>>
|
||||
stream
|
||||
BT
|
||||
/F1 10 Tf
|
||||
0.005 Tc 1 0 0 1 10 30 Tm
|
||||
[(M)5 (i)5 (t)]TJ
|
||||
/Span<</ActualText<FEFF00AD>>> BDC
|
||||
14 0 Td
|
||||
( )Tj
|
||||
EMC
|
||||
T*
|
||||
(arbei)Tj
|
||||
/Span<</ActualText<FEFF00AD>>> BDC
|
||||
( )Tj
|
||||
EMC
|
||||
21.2 0 Td
|
||||
[(terinnen und Mitarbeiter arbeiten in \374ber 100 L\344ndern engagiert im\
|
||||
Dienste)5 ( )]TJ
|
||||
ET
|
||||
endstream
|
||||
endobj xref
|
||||
0 7
|
||||
0000000000 65535 f
|
||||
0000000015 00000 n
|
||||
0000000085 00000 n
|
||||
0000000136 00000 n
|
||||
0000000218 00000 n
|
||||
0000000347 00000 n
|
||||
0000000438 00000 n
|
||||
trailer
|
||||
|
||||
<<
|
||||
/Root 2 0 R
|
||||
/Size 7
|
||||
>>
|
||||
startxref
|
||||
768
|
||||
%%EOF
|
|
@ -73,6 +73,10 @@ describe("api", function () {
|
|||
}, WAIT_TIMEOUT);
|
||||
}
|
||||
|
||||
function mergeText(items) {
|
||||
return items.map(chunk => chunk.str + (chunk.hasEOL ? "\n" : "")).join("");
|
||||
}
|
||||
|
||||
describe("getDocument", function () {
|
||||
it("creates pdf doc from URL-string", async function () {
|
||||
const urlStr = TEST_PDFS_PATH + basicApiFileName;
|
||||
|
@ -1604,11 +1608,17 @@ describe("api", function () {
|
|||
const data = await Promise.all([defaultPromise, parametersPromise]);
|
||||
|
||||
expect(!!data[0].items).toEqual(true);
|
||||
expect(data[0].items.length).toEqual(12);
|
||||
expect(data[0].items.length).toEqual(11);
|
||||
expect(!!data[0].styles).toEqual(true);
|
||||
|
||||
const page1 = mergeText(data[0].items);
|
||||
expect(page1).toEqual(`Table Of Content
|
||||
Chapter 1 .......................................................... 2
|
||||
Paragraph 1.1 ...................................................... 3
|
||||
page 1 / 3`);
|
||||
|
||||
expect(!!data[1].items).toEqual(true);
|
||||
expect(data[1].items.length).toEqual(7);
|
||||
expect(data[1].items.length).toEqual(6);
|
||||
expect(!!data[1].styles).toEqual(true);
|
||||
});
|
||||
|
||||
|
@ -1643,6 +1653,107 @@ describe("api", function () {
|
|||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets text content, with no extra spaces (issue 13226)", async function () {
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(text).toEqual(
|
||||
"Mitarbeiterinnen und Mitarbeiter arbeiten in über 100 Ländern engagiert im Dienste"
|
||||
);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets text content, with merged spaces (issue 13201)", async function () {
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
text.includes(
|
||||
"Abstract. A purely peer-to-peer version of electronic cash would allow online"
|
||||
)
|
||||
).toEqual(true);
|
||||
expect(
|
||||
text.includes(
|
||||
"avoid mediating disputes. The cost of mediation increases transaction costs, limiting the"
|
||||
)
|
||||
).toEqual(true);
|
||||
expect(
|
||||
text.includes(
|
||||
"system is secure as long as honest nodes collectively control more CPU power than any"
|
||||
)
|
||||
).toEqual(true);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets text content, with no spaces between letters of words (issue 11913)", async function () {
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
text.includes(
|
||||
"1. The first of these cases arises from the tragic handicap which has blighted the life of the Plaintiff, and from the response of the"
|
||||
)
|
||||
).toEqual(true);
|
||||
expect(
|
||||
text.includes(
|
||||
"argued in this Court the appeal raises narrower, but important, issues which may be summarised as follows:-"
|
||||
)
|
||||
).toEqual(true);
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets text content, with merged spaces (issue 10900)", async function () {
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
text.includes(`3 3 3 3
|
||||
851.5 854.9 839.3 837.5
|
||||
633.6 727.8 789.9 796.2
|
||||
1,485.1 1,582.7 1,629.2 1,633.7
|
||||
114.2 121.7 125.3 130.7
|
||||
13.0x 13.0x 13.0x 12.5x`)
|
||||
).toEqual(true);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets text content, with spaces (issue 10640)", async function () {
|
||||
const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items } = await pdfPage.getTextContent();
|
||||
const text = mergeText(items);
|
||||
|
||||
expect(
|
||||
text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson.
|
||||
Open Sans was designed with an upright stress, open forms and a neu-
|
||||
tral, yet friendly appearance. It was optimized for print, web, and mobile
|
||||
interfaces, and has excellent legibility characteristics in its letterforms (see
|
||||
figure \x81 on the following page). This font is available from the Google Font
|
||||
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
|
||||
This package provides support for this font in LATEX. It includes Type \x81
|
||||
versions of the fonts, converted for this package using FontForge from its
|
||||
sources, for full support with Dvips.`)
|
||||
).toEqual(true);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("gets empty structure tree", async function () {
|
||||
const tree = await page.getStructTree();
|
||||
|
||||
|
|
|
@ -268,7 +268,7 @@ describe("pdf_find_controller", function () {
|
|||
pageIndex: 0,
|
||||
matchIndex: 0,
|
||||
},
|
||||
pageMatches: [[19, 48, 66]],
|
||||
pageMatches: [[19, 46, 62]],
|
||||
pageMatchesLength: [[8, 8, 8]],
|
||||
});
|
||||
});
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue