Cache the normalized unicode-value on the Glyph-instance

Currently, during text-extraction, we normalize and (when necessary) reverse the unicode-value again every time a glyph is encountered. This seems a little unnecessary, since the result won't change for a given glyph, hence this patch moves that computation into the `Glyph`-instance and makes it *lazily* initialized. Taking the `tracemonkey.pdf` document as an example: When extracting the text-content there's a total of 69236 characters but only 595 unique `Glyph`-instances, which means a 99.1 percent cache hit-rate. Generally speaking, the longer a PDF document is the more beneficial this should be. *Please note:* The old code is fast enough that it unfortunately seems difficult to measure a (clear) performance improvement with this patch, so I completely understand if it's deemed an unnecessary change.
2025-04-29 15:47:57 +02:00 · 2022-11-03 10:20:18 +01:00 · 2022-11-03 10:20:18 +01:00 · c33b8d7692
commit c33b8d7692
parent eda51d1dcc
3 changed files with 24 additions and 11 deletions
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@ -35,9 +35,11 @@ import {
 } from "./fonts_utils.js";
 import {
  getCharUnicodeCategory,
+  getNormalizedUnicodes,
  getUnicodeForGlyph,
  getUnicodeRangeFor,
  mapSpecialUnicodeValues,
+  reverseIfRtl,
 } from "./unicode.js";
 import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
 import {
@ -218,6 +220,24 @@ class Glyph {
    this.isZeroWidthDiacritic = category.isZeroWidthDiacritic;
    this.isInvisibleFormatMark = category.isInvisibleFormatMark;
  }
+
+  /**
+   * The value derived from `this.unicode` for text-extraction: the entry
+   * found for it in `Glyph._NormalizedUnicodes` (falling back to
+   * `this.unicode` itself when there is no truthy entry), passed through
+   * `reverseIfRtl`.
+   *
+   * This property, which is only used by `PartialEvaluator.getTextContent`,
+   * is purposely made non-serializable.
+   * @type {string}
+   */
+  get normalizedUnicode() {
+    // The result is routed through `shadow` so that it's lazily initialized.
+    // NOTE(review): presumably `shadow` replaces this getter with a plain
+    // property on the instance after the first access, so the lookup and
+    // `reverseIfRtl` run once per `Glyph` -- confirm against the definition
+    // of `shadow`, which is not visible here.
+    return shadow(
+      this,
+      "normalizedUnicode",
+      reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode),
+      /* nonSerializable = */ true
+    );
+  }
+
+  /**
+   * Class-level accessor for the table returned by `getNormalizedUnicodes()`,
+   * indexed by `Glyph.prototype.normalizedUnicode` with `this.unicode`.
+   *
+   * NOTE(review): presumably `shadow` stores the table on the class itself
+   * (`this` is the class inside a static getter), so that
+   * `getNormalizedUnicodes()` is only invoked on first access -- confirm.
+   */
+  static get _NormalizedUnicodes() {
+    return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes());
+  }
 }

 function int16(b0, b1) {