Correctly print documents containing chars with a Unicode code point greater than 0xFFFF (bug 1669097)

2025-04-22 16:18:08 +02:00 · 2024-01-21 23:00:43 +01:00 · 2024-01-21 23:00:43 +01:00 · 06601fd90c
commit 06601fd90c
parent d549c2ef4c
6 changed files with 44 additions and 34 deletions
--- a/src/core/annotation.js
+++ b/src/core/annotation.js
@ -3826,7 +3826,7 @@ class FreeTextAnnotation extends MarkupAnnotation {
          fontColor,
          strokeAlpha
        );
-        this._streams.push(this.appearance, FakeUnicodeFont.toUnicodeStream);
+        this._streams.push(this.appearance);
      } else {
        warn(
          "FreeTextAnnotation: OffscreenCanvas is not supported, annotation may not render correctly."
--- a/src/core/core_utils.js
+++ b/src/core/core_utils.js
@ -386,6 +386,17 @@ const XMLEntities = {
  /* ' */ 0x27: "&apos;",
 };

+/**
+ * Iterate over the Unicode code points of a string.
+ *
+ * @param {string} str - The string to walk.
+ * @yields {number} One value per `codePointAt` result, in order; a surrogate
+ *   pair yields a single code point, and an unpaired surrogate yields itself.
+ */
+function* codePointIter(str) {
+  for (let i = 0, ii = str.length; i < ii; i++) {
+    const char = str.codePointAt(i);
+    if (char > 0xffff) {
+      // Only code points above 0xffff are represented by two u16. Unpaired
+      // surrogates and U+FFFE/U+FFFF use a single u16, hence they must not
+      // cause the next code unit to be skipped.
+      i++;
+    }
+    yield char;
+  }
+}
+
 function encodeToXmlString(str) {
  const buffer = [];
  let start = 0;
@ -602,6 +613,7 @@ function getRotationMatrix(rotation, width, height) {

 export {
  arrayBuffersToBytes,
+  codePointIter,
  collectActions,
  encodeToXmlString,
  escapePDFName,
--- a/src/core/default_appearance.js
+++ b/src/core/default_appearance.js
@ -13,13 +13,14 @@
 * limitations under the License.
 */

-import { Dict, Name } from "./primitives.js";
 import {
+  codePointIter,
  escapePDFName,
  getRotationMatrix,
  numberToString,
  stringToUTF16HexString,
 } from "./core_utils.js";
+import { Dict, Name } from "./primitives.js";
 import {
  LINE_DESCENT_FACTOR,
  LINE_FACTOR,
@ -251,35 +252,6 @@ class FakeUnicodeFont {
    );
  }

-  get toUnicodeRef() {
-    if (!FakeUnicodeFont._toUnicodeRef) {
-      const toUnicode = `/CIDInit /ProcSet findresource begin
-12 dict begin
-begincmap
-/CIDSystemInfo
-<< /Registry (Adobe)
-/Ordering (UCS) /Supplement 0 >> def
-/CMapName /Adobe-Identity-UCS def
-/CMapType 2 def
-1 begincodespacerange
-<0000> <FFFF>
-endcodespacerange
-1 beginbfrange
-<0000> <FFFF> <0000>
-endbfrange
-endcmap CMapName currentdict /CMap defineresource pop end end`;
-      const toUnicodeStream = (FakeUnicodeFont.toUnicodeStream =
-        new StringStream(toUnicode));
-      const toUnicodeDict = new Dict(this.xref);
-      toUnicodeStream.dict = toUnicodeDict;
-      toUnicodeDict.set("Length", toUnicode.length);
-      FakeUnicodeFont._toUnicodeRef =
-        this.xref.getNewPersistentRef(toUnicodeStream);
-    }
-
-    return FakeUnicodeFont._toUnicodeRef;
-  }
-
  get fontDescriptorRef() {
    if (!FakeUnicodeFont._fontDescriptorRef) {
      const fontDescriptor = new Dict(this.xref);
@ -350,7 +322,7 @@ endcmap CMapName currentdict /CMap defineresource pop end end`;
    baseFont.set("Subtype", Name.get("Type0"));
    baseFont.set("Encoding", Name.get("Identity-H"));
    baseFont.set("DescendantFonts", [this.descendantFontRef]);
-    baseFont.set("ToUnicode", this.toUnicodeRef);
+    baseFont.set("ToUnicode", Name.get("Identity-H"));

    return this.xref.getNewPersistentRef(baseFont);
  }
@ -420,8 +392,8 @@ endcmap CMapName currentdict /CMap defineresource pop end end`;
      // languages, like arabic, it'd be wrong because of ligatures.
      const lineWidth = ctx.measureText(line).width;
      maxWidth = Math.max(maxWidth, lineWidth);
-      for (const char of line.split("")) {
-        const code = char.charCodeAt(0);
+      for (const code of codePointIter(line)) {
+        const char = String.fromCodePoint(code);
        let width = this.widths.get(code);
        if (width === undefined) {
          const metrics = ctx.measureText(char);