From 6eef69de2261aa6e34987ec6b6707985de3cf960 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald
Date: Fri, 7 May 2021 22:25:08 +0200
Subject: [PATCH] Export the "raw" `toUnicode`-data from `PartialEvaluator.preEvaluateFont`

Compared to other data-structures, such as e.g. `Dict`s, we're purposely *not* caching Streams on the `XRef`-instance.[1]
The, somewhat unfortunate, effect of Streams not being cached is that repeatedly getting the *same* Stream-data requires re-parsing/re-initializing of a bunch of data; see `XRef.fetch` and related methods.

For the font-parsing in particular we're currently fetching the `toUnicode`-data, which is very often a Stream, in `PartialEvaluator.preEvaluateFont` and then *again* in `PartialEvaluator.extractDataStructures` soon afterwards.
By instead letting `PartialEvaluator.preEvaluateFont` export the "raw" `toUnicode`-data, we can avoid *some* unnecessary re-parsing/re-initializing when handling fonts.

*Please note:* In this particular case, given that `PartialEvaluator.preEvaluateFont` only accesses the "raw" `toUnicode` data, exporting a Stream should be safe.

---
[1] The reasons for this include:
 - Streams, especially `DecodeStream`-instances, can become *very* large once read. Hence caching them really isn't a good idea simply because of the (potential) memory impact of doing so.
 - Attempting to read from the *same* Stream-instance more than once won't work, unless it's `reset` in between, since using any method such as e.g. `getBytes` always starts at the current data position.
 - Given that parsing, even in the worker-thread, is now fairly asynchronous it's generally impossible to assert that any one Stream-instance isn't being accessed "concurrently" by e.g. different `getOperatorList` calls. Hence `reset`-ing a cached Stream-instance isn't going to work in the general case.
--- src/core/evaluator.js | 44 ++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index d5476347d..f38678ccf 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2978,10 +2978,9 @@ class PartialEvaluator { const xref = this.xref; let cidToGidBytes; // 9.10.2 - const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); - const toUnicodePromise = toUnicode - ? this.readToUnicode(toUnicode) - : Promise.resolve(undefined); + const toUnicodePromise = this.readToUnicode( + properties.toUnicode || dict.get("ToUnicode") || baseDict.get("ToUnicode") + ); if (properties.composite) { // CIDSystemInfo helps to match CID to glyphs @@ -3289,8 +3288,10 @@ class PartialEvaluator { ); } - readToUnicode(toUnicode) { - const cmapObj = toUnicode; + readToUnicode(cmapObj) { + if (!cmapObj) { + return Promise.resolve(null); + } if (isName(cmapObj)) { return CMapFactory.create({ encoding: cmapObj, @@ -3541,7 +3542,7 @@ class PartialEvaluator { } let composite = false; - let uint8array; + let hash, toUnicode; if (type.name === "Type0") { // If font is a composite // - get the descendant font @@ -3566,7 +3567,6 @@ class PartialEvaluator { const firstChar = dict.get("FirstChar") || 0, lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff); const descriptor = dict.get("FontDescriptor"); - let hash; if (descriptor) { hash = new MurmurHash3_64(); @@ -3601,10 +3601,10 @@ class PartialEvaluator { hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf - const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); + toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode"); if (isStream(toUnicode)) { const stream = toUnicode.str || toUnicode; - uint8array = stream.buffer + const uint8array = stream.buffer ? 
new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength) : new Uint8Array( stream.bytes.buffer, @@ -3659,18 +3659,22 @@ class PartialEvaluator { type: type.name, firstChar, lastChar, + toUnicode, hash: hash ? hash.hexdigest() : "", }; } - async translateFont(preEvaluatedFont) { - const baseDict = preEvaluatedFont.baseDict; - const dict = preEvaluatedFont.dict; - const composite = preEvaluatedFont.composite; - let descriptor = preEvaluatedFont.descriptor; - const type = preEvaluatedFont.type; - const firstChar = preEvaluatedFont.firstChar, - lastChar = preEvaluatedFont.lastChar; + async translateFont({ + descriptor, + dict, + baseDict, + composite, + type, + firstChar, + lastChar, + toUnicode, + cssFontInfo, + }) { let properties; if (!descriptor) { @@ -3710,6 +3714,7 @@ class PartialEvaluator { flags, firstChar, lastChar, + toUnicode, }; const widths = dict.get("Widths"); return this.extractDataStructures(dict, dict, properties).then( @@ -3806,6 +3811,7 @@ class PartialEvaluator { fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX, firstChar, lastChar, + toUnicode, bbox: descriptor.getArray("FontBBox"), ascent: descriptor.get("Ascent"), descent: descriptor.get("Descent"), @@ -3814,7 +3820,7 @@ class PartialEvaluator { flags: descriptor.get("Flags"), italicAngle: descriptor.get("ItalicAngle"), isType3Font: false, - cssFontInfo: preEvaluatedFont.cssFontInfo, + cssFontInfo, }; if (composite) {