diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 11c83d04f..3896ed47e 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3852,6 +3852,11 @@ class PartialEvaluator { map[charCode] = String.fromCodePoint(token); return; } + // Add back omitted leading zeros on odd length tokens + // (fixes issue #18099) + if (token.length % 2 !== 0) { + token = "\u0000" + token; + } const str = []; for (let k = 0; k < token.length; k += 2) { const w1 = (token.charCodeAt(k) << 8) | token.charCodeAt(k + 1); diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 98c3f4ace..13450e37c 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -653,3 +653,4 @@ !bug1539074.1.pdf !issue18305.pdf !issue18360.pdf +!issue18099_reduced.pdf diff --git a/test/pdfs/issue18099_reduced.pdf b/test/pdfs/issue18099_reduced.pdf new file mode 100644 index 000000000..8fa6fd6a8 Binary files /dev/null and b/test/pdfs/issue18099_reduced.pdf differ diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 7a383498c..567d02a87 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -3419,6 +3419,21 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content, correctly handling documents with toUnicode cmaps that omit leading zeros on hex-encoded UTF-16", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("issue18099_reduced.pdf") + ); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent({ + disableNormalization: true, + }); + const text = mergeText(items); + expect(text).toEqual("Hello world!"); + + await loadingTask.destroy(); + }); + it("gets text content, and check that out-of-page text is not present (bug 1755201)", async function () { if (isNodeJS) { pending("Linked test-cases are not supported in Node.js.");