From 1c364422a612f02bc9f50212c7c40dd473e544ff Mon Sep 17 00:00:00 2001 From: alexcat3 Date: Fri, 5 Jul 2024 13:04:11 -0400 Subject: [PATCH] Handle toUnicode cmaps that omit leading zeros in hex encoded UTF-16 (issue 18099) Add unit test to check compatibility with such cmaps. In the PDF in issue 18099, the toUnicode cmap had a line to map the glyph char codes from 00 to 7F to the corresponding code points. The syntax to map a range of char codes to a range of Unicode code points is <first char code> <last char code> <first code point>. As the Unicode code points are supposed to be given in UTF-16 BE, the PDF's line SHOULD have probably read <00> <7F> <0000>. Instead it omitted two leading zeros from the UTF-16, like this: <00> <7F> <00>. This confused PDF.js into mapping these character codes to the UTF-16 characters with the corresponding HIGH bytes (01 became \u0100, 02 became \u0200, et cetera), which ended up turning Latin text in the PDF into Chinese when it was copied. I'm not sure if the PDF spec actually allows PDFs to do this, but since there's at least one PDF in the wild that does and other PDF readers read it correctly, PDF.js should probably support this. --- src/core/evaluator.js | 5 +++++ test/pdfs/.gitignore | 1 + test/pdfs/issue18099_reduced.pdf | Bin 0 -> 1483 bytes test/unit/api_spec.js | 15 +++++++++++++++ 4 files changed, 21 insertions(+) create mode 100644 test/pdfs/issue18099_reduced.pdf diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 11c83d04f..3896ed47e 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3852,6 +3852,11 @@ class PartialEvaluator { map[charCode] = String.fromCodePoint(token); return; } + // Add back omitted leading zeros on odd length tokens + // (fixes issue #18099) + if (token.length % 2 !== 0) { + token = "\u0000" + token; + } const str = []; for (let k = 0; k < token.length; k += 2) { const w1 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 98c3f4ace..13450e37c 100644 --- 
a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -653,3 +653,4 @@ !bug1539074.1.pdf !issue18305.pdf !issue18360.pdf +!issue18099_reduced.pdf diff --git a/test/pdfs/issue18099_reduced.pdf b/test/pdfs/issue18099_reduced.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8fa6fd6a8d7e2f782f3f0f6323590f724a2cdbd7 GIT binary patch literal 1483 zcmY!laBiE*l&DfW-9FVg)0hNRWblNM%8)zH?$p zVorX#ogFV%YF-Lh9Y`9c!8yM)uSCI6-#as=bcMkWU42BwBa#wNz5n)=Ql>HH!Mpj$Ibb2aq?it!Fc(fPrZMN8!Uu<;KAVCP5sYk1u`jID7JBxW;+?Q=WlWLO4!7D&l2iI9J9s z0UYKq?XchnISgj8Q(|!{$jSO4i5dC1iTZAtMa3n~8Hq&-#zy)+iBPtYsi}T&X;KL& zK0}I1Q^64%k{_Cv3394}IoPKVUxExn_9aLTW*Ht|8k-_}Gj@u7|6>J#qt$k-PxCf@ zN;qDo6q&clcggSNr(;WRF&!3&yp?deu&d^`@SdG1-@8)&B+M|k7ZCY<__bo+YP~w0 z$=07%9hUfeX!^@t7T1{PA6wzG^6!hVz@PW$zFpJx_uju@<5cloHlUGMbL zoiCT~?G3I8JNo_TXQN}2>wfLEuhX0#TDa%jXY2Wv>t==A&pz)c`g?uZzqIpfy7zo| z`oMDCo2`a zZ>y9qyk1`tKB=^pD|Ke`M$OZ2TnwK{u{arME-jH*qoJQ+5dL=4s?+YiMr-$Le?6Gl zy}{W*{8)X~GN&6U&hvwNL~?m}F6PJ98l|-4uD>6Wp1N2_kFkm8>8zBE3CRmSEm&!j z6~(iS<2JkcOesEIu`M%adHp}2dYO-NdWOJZtLHpP5l)?otFAJeiM1X6cF=PfUw~=W z;;jek50&1$aK)iGp~yf*!piW7VPNw?u~UwTdh+}er57Z;uwmlqOw4fDu~Oi|zV!kL z5yB!owmjL2DLjuY!oOkL7PF9pzpyJAL z^uhxHvlV*}USpA9Q*<$AY;ojxc;&$>R>dhk0&I#?k`~r`i~qWR)wkT**71A#;n?Y| zk7wOq`6pnBdQwkK{i$b}eycyd*Z(%favj#RTv3#o#>-`(U;!lHK*7w^)Yw!3Bmo76 zz+?yl3V8@IGhjYI6Eidd7ENekMh2D`V#Y=m7-A+SnCeVTk<^tGC1&QN7J-UMUM~Hh z{Cr@s2g*1=j%Qw4z5=Lp195^gt5Ov#^gUgoZ4BHj&0Wo1T`b+4Tr5lt9G%^qO`I$Y oja^Nh%q*Nuo$Ls!hy`WF;*!Lo5=c-P8k(5`OKw$He>Yw(0G5UZqyPW_ literal 0 HcmV?d00001 diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 44f9de06b..fa446fbef 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -3419,6 +3419,21 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content, correctly handling documents with toUnicode cmaps that omit leading zeros on hex-encoded UTF-16", async function () { + const loadingTask = getDocument( + 
buildGetDocumentParams("issue18099_reduced.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + expect(text).toEqual("Hello world!"); + + await loadingTask.destroy(); + }); + it("gets text content, and check that out-of-page text is not present (bug 1755201)", async function () { if (isNodeJS) { pending("Linked test-cases are not supported in Node.js.");