From 1c364422a612f02bc9f50212c7c40dd473e544ff Mon Sep 17 00:00:00 2001
From: alexcat3 <alexcunneen@gmail.com>
Date: Fri, 5 Jul 2024 13:04:11 -0400
Subject: [PATCH] Handle toUnicode cmaps that omit leading zeros in hex encoded
 UTF-16 (issue 18099); add unit test to check compatibility with such cmaps

In the PDF in issue 18099, the toUnicode cmap had a line to map the glyph char codes from 00 to 7F to the corresponding code points. The syntax to map a range of char codes to a range of Unicode code points is:
<start_char_code> <end_char_code> <start_unicode_codepoint>
As the Unicode code points are supposed to be given in UTF-16 BE, the PDF's line should probably have read:
<00> <7F> <0000>
Instead, it omitted the two leading zero hex digits (one byte) from the UTF-16 value, like this:
<00> <7F> <00>
This confused PDF.js into mapping these character codes to the UTF-16 characters with the corresponding HIGH bytes (01 became \u0100, 02 became \u0200, et cetera), which ended up turning Latin text in the PDF into Chinese characters when it was copied.
I'm not sure if the PDF spec actually allows PDFs to do this, but since there's at least one PDF in the wild that does, and other PDF readers read it correctly, PDF.js should probably support this.
---
 src/core/evaluator.js            |   5 +++++
 test/pdfs/.gitignore             |   1 +
 test/pdfs/issue18099_reduced.pdf | Bin 0 -> 1483 bytes
 test/unit/api_spec.js            |  15 +++++++++++++++
 4 files changed, 21 insertions(+)
 create mode 100644 test/pdfs/issue18099_reduced.pdf

diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 11c83d04f..3896ed47e 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -3852,6 +3852,11 @@ class PartialEvaluator {
             map[charCode] = String.fromCodePoint(token);
             return;
           }
+          // Add back omitted leading zeros on odd length tokens
+          // (fixes issue #18099)
+          if (token.length % 2 !== 0) {
+            token = "\u0000" + token;
+          }
           const str = [];
           for (let k = 0; k < token.length; k += 2) {
             const w1 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1);
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index 98c3f4ace..13450e37c 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -653,3 +653,4 @@
 !bug1539074.1.pdf
 !issue18305.pdf
 !issue18360.pdf
+!issue18099_reduced.pdf
diff --git a/test/pdfs/issue18099_reduced.pdf b/test/pdfs/issue18099_reduced.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..8fa6fd6a8d7e2f782f3f0f6323590f724a2cdbd7
GIT binary patch
literal 1483
zcmY!laB<T$)HCPhQeC+D=yhH$Lj?nc{G=>iE*l&DfW-9FVg)0hNRWblNM%8)zH?$p
zVorX#ogFV%YF-Lh9Y`9c!8yM)uSCI6-#as<I9kC3tT7g%6Koc;K4WBkMX8A;nfZAx
zi6yBTE>=bcMkWU42BwBa#wNz5n)=Ql>HH!Mpj$Ibb2aq?it<xRlT+aW$i{*!MK~v~
zBsH(3SiuzR5`Ev)l*~k@{EBEG*FeF@0E}aS-Yf#Dg84HjwK%`DC^@wl7yxehc_kpq
zOu-Cnf*lYbyn}2l$Of3TZkahHsYUv3IY8gIq$cO5r0V;m=B1ZpD3}`9*%g-neUr<}
zRWYZS>!Fc(fPrZMN8!Uu<;KAVCP5sYk1u`jID7JBxW;+?Q=WlWLO4!7D&l2iI9J9s
z0UYKq?XchnISgj8Q(|!{$jSO4i5dC1iTZAtMa3n~8Hq&-#zy)+iBPtYsi}T&X;KL&
zK0}I1Q^64%k{_Cv3394}IoPKVUxExn_9aLTW*Ht|8k-_}Gj@u7|6>J#qt$k-PxCf@
zN;qDo6q&clcggSNr(;WRF&!3&yp?deu&d^`@SdG1-@8)&B+M|k7ZCY<__bo+YP~w0
z$=07%9hUfeX!^@t7T1{PA6wzG^6!hVz@PW$zFpJx_u<!y-D_&>ju@<5cloHlUGMbL
zoiCT~?G3I8JNo_TXQN}2>wfLEuhX0#TDa%jXY2Wv>t==A&pz)c`g?uZzqIpfy7zo|
z`oMDC<ErYDtK}^B=NV^bG`HP(^3y2#cz3UQm~7zs-2zodFa9?VPdqAMw`|+O>o2`a
zZ>y9qyk1`tKB=^pD|Ke`M$OZ2TnwK{u{arME-jH*qoJQ+5dL=4s?+YiMr-$Le?6Gl
zy}{W*{8)X~GN&6U&hvwNL~?m}F6PJ98l|-4uD>6Wp1N2_kFkm8>8zBE3CRmSEm&!j
z6~(iS<2JkcOesEIu`M%adHp}2dYO-NdWOJZtLHpP5l)?otFAJeiM1X6cF=PfUw~=W
z;;jek50&1$aK)iGp~yf*!piW7VPNw?u~UwTdh+}er57Z;uwmlqOw4fDu~Oi|zV!kL
z5yB!owmjL2DLju<qEEUQ%eC<C`tHcFaXOEan3KdNfkjdaCB>Y!oOkL7PF9pzpyJAL
z^uhxHvlV*}USpA9Q*<$AY;ojxc;&$>R>dhk0&I#?k`~r`i~qWR)wkT**71A#;n?Y|
zk7wOq`6pnBdQwkK{i$b}eycyd*Z(%favj#RTv3#o#>-`(U;!lHK*7w^)Yw!3Bmo76
zz+?yl3V8@IGhjYI6Eidd7ENekMh2D`V#Y=m7-A+SnCeVTk<^tGC1&QN7J-UMUM~Hh
z{Cr@s2g*1=j%Qw4z5=Lp195^gt5Ov#^gUgoZ4BHj&0Wo1T`b+4Tr5lt9G%^qO`I$Y
oja^Nh%q*Nuo$Ls!hy`WF;*!Lo5=c-P8k(5`OKw$He>Yw(0G5UZqyPW_

literal 0
HcmV?d00001

diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 44f9de06b..fa446fbef 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -3419,6 +3419,21 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       await loadingTask.destroy();
     });
 
+    it("gets text content, correctly handling documents with toUnicode cmaps that omit leading zeros on hex-encoded UTF-16", async function () {
+      const loadingTask = getDocument(
+        buildGetDocumentParams("issue18099_reduced.pdf")
+      );
+      const pdfDoc = await loadingTask.promise;
+      const pdfPage = await pdfDoc.getPage(1);
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
+      const text = mergeText(items);
+      expect(text).toEqual("Hello world!");
+
+      await loadingTask.destroy();
+    });
+
     it("gets text content, and check that out-of-page text is not present (bug 1755201)", async function () {
       if (isNodeJS) {
         pending("Linked test-cases are not supported in Node.js.");