1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-24 09:08:07 +02:00

[text selection] Add the whitespaces present in the pdf in the text chunk

- it aims to fix issue #14627;
- the basic idea of the recent text refactoring was to only consider the rendered visible whitespaces.
  But sometimes the heuristics aren't correct: although some whitespaces were in the text stream,
  they weren't in the text chunks because they were too small. Hence we added some exceptions; for example,
  we always add a whitespace when it is between two non-whitespace chars, but only when they are in the same Tj.
  So basically, this patch removes the constraint of having the chars in the same Tj
  (by using a circular buffer to save the last two chars) but doesn't add a space when the visible space is really
  too small (hence `NOT_A_SPACE_FACTOR`).
This commit is contained in:
Calixte Denizet 2022-03-21 22:10:46 +01:00
parent db4f3adc5e
commit 18e79e3c0b
6 changed files with 94 additions and 23 deletions

View file

@ -515,3 +515,4 @@
!issue14497.pdf
!issue14502.pdf
!issue13211.pdf
!issue14627.pdf

BIN
test/pdfs/issue14627.pdf Executable file

Binary file not shown.

View file

@ -6329,5 +6329,11 @@
"md5": "d193853e8a123dc50eeea593a4150b60",
"rounds": 1,
"type": "eq"
},
{ "id": "issue14627",
"file": "pdfs/issue14627.pdf",
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
"rounds": 1,
"type": "text"
}
]

View file

@ -1999,7 +1999,7 @@ describe("api", function () {
const data = await Promise.all([defaultPromise, parametersPromise]);
expect(!!data[0].items).toEqual(true);
expect(data[0].items.length).toEqual(11);
expect(data[0].items.length).toEqual(15);
expect(!!data[0].styles).toEqual(true);
const page1 = mergeText(data[0].items);

View file

@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
},
pageMatches: [
[
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
4071,
302, 340, 418, 481, 628, 802, 983, 989, 1015, 1063, 1084, 1149, 1157,
1278, 1346, 1394, 1402, 1424, 1500, 1524, 1530, 1686, 1776, 1788,
1859, 1881, 1911, 1948, 2066, 2076, 2163, 2180, 2215, 2229, 2274,
2324, 2360, 2402, 2413, 2424, 2463, 2532, 2538, 2553, 2562, 2576,
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2847, 2858, 2895, 2901,
2915, 2939, 2959, 3089, 3236, 3246, 3336, 3384, 3391, 3465, 3474,
3482, 3499, 3687, 3693, 3708, 3755, 3786, 3862, 3974, 4049, 4055,
4068,
],
],
pageMatchesLength: [