From fa47d4b9b114579c8dd8971921c6ec2f23555105 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Wed, 5 Oct 2022 12:52:22 +0200 Subject: [PATCH] Slightly re-factor `PartialEvaluator._simpleFontToUnicode` Given the sheer number of heuristics added to this method over the years, moving the *valid* unicode found case to the top should improve readability of the code. --- src/core/evaluator.js | 134 +++++++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 53c8282a3..84a58fe2b 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3526,77 +3526,77 @@ class PartialEvaluator { for (const charcode in encoding) { // a) Map the character code to a character name. let glyphName = encoding[charcode]; - // b) Look up the character name in the Adobe Glyph List (see the - // Bibliography) to obtain the corresponding Unicode value. if (glyphName === "") { continue; - } else if (glyphsUnicodeMap[glyphName] === undefined) { - // (undocumented) c) Few heuristics to recognize unknown glyphs - // NOTE: Adobe Reader does not do this step, but OSX Preview does - let code = 0; - switch (glyphName[0]) { - case "G": // Gxx glyph - if (glyphName.length === 3) { - code = parseInt(glyphName.substring(1), 16); - } - break; - case "g": // g00xx glyph - if (glyphName.length === 5) { - code = parseInt(glyphName.substring(1), 16); - } - break; - case "C": // Cdd{d} glyph - case "c": // cdd{d} glyph - if (glyphName.length >= 3 && glyphName.length <= 4) { - const codeStr = glyphName.substring(1); - - if (forceGlyphs) { - code = parseInt(codeStr, 16); - break; - } - // Normally the Cdd{d}/cdd{d} glyphName format will contain - // regular, i.e. base 10, charCodes (see issue4550.pdf)... - code = +codeStr; - - // ... however some PDF generators violate that assumption by - // containing glyph, i.e. base 16, codes instead. 
- // In that case we need to re-parse the *entire* encoding to - // prevent broken text-selection (fixes issue9655_reduced.pdf). - if ( - Number.isNaN(code) && - Number.isInteger(parseInt(codeStr, 16)) - ) { - return this._simpleFontToUnicode( - properties, - /* forceGlyphs */ true - ); - } - } - break; - default: // 'uniXXXX'/'uXXXX{XX}' glyphs - const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); - if (unicode !== -1) { - code = unicode; - } - } - if (code > 0 && code <= 0x10ffff && Number.isInteger(code)) { - // If `baseEncodingName` is one the predefined encodings, and `code` - // equals `charcode`, using the glyph defined in the baseEncoding - // seems to yield a better `toUnicode` mapping (fixes issue 5070). - if (baseEncodingName && code === +charcode) { - const baseEncoding = getEncoding(baseEncodingName); - if (baseEncoding && (glyphName = baseEncoding[charcode])) { - toUnicode[charcode] = String.fromCharCode( - glyphsUnicodeMap[glyphName] - ); - continue; - } - } - toUnicode[charcode] = String.fromCodePoint(code); - } + } + // b) Look up the character name in the Adobe Glyph List (see the + // Bibliography) to obtain the corresponding Unicode value. 
+ let unicode = glyphsUnicodeMap[glyphName]; + if (unicode !== undefined) { + toUnicode[charcode] = String.fromCharCode(unicode); continue; } - toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]); + // (undocumented) c) Few heuristics to recognize unknown glyphs + // NOTE: Adobe Reader does not do this step, but OSX Preview does + let code = 0; + switch (glyphName[0]) { + case "G": // Gxx glyph + if (glyphName.length === 3) { + code = parseInt(glyphName.substring(1), 16); + } + break; + case "g": // g00xx glyph + if (glyphName.length === 5) { + code = parseInt(glyphName.substring(1), 16); + } + break; + case "C": // Cdd{d} glyph + case "c": // cdd{d} glyph + if (glyphName.length >= 3 && glyphName.length <= 4) { + const codeStr = glyphName.substring(1); + + if (forceGlyphs) { + code = parseInt(codeStr, 16); + break; + } + // Normally the Cdd{d}/cdd{d} glyphName format will contain + // regular, i.e. base 10, charCodes (see issue4550.pdf)... + code = +codeStr; + + // ... however some PDF generators violate that assumption by + // containing glyph, i.e. base 16, codes instead. + // In that case we need to re-parse the *entire* encoding to + // prevent broken text-selection (fixes issue9655_reduced.pdf). + if (Number.isNaN(code) && Number.isInteger(parseInt(codeStr, 16))) { + return this._simpleFontToUnicode( + properties, + /* forceGlyphs */ true + ); + } + } + break; + case "u": // 'uniXXXX'/'uXXXX{XX}' glyphs + unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap); + if (unicode !== -1) { + code = unicode; + } + break; + } + if (code > 0 && code <= 0x10ffff && Number.isInteger(code)) { + // If `baseEncodingName` is one of the predefined encodings, and `code` + // equals `charcode`, using the glyph defined in the baseEncoding + // seems to yield a better `toUnicode` mapping (fixes issue 5070). 
+ if (baseEncodingName && code === +charcode) { + const baseEncoding = getEncoding(baseEncodingName); + if (baseEncoding && (glyphName = baseEncoding[charcode])) { + toUnicode[charcode] = String.fromCharCode( + glyphsUnicodeMap[glyphName] + ); + continue; + } + } + toUnicode[charcode] = String.fromCodePoint(code); + } } return toUnicode; }