From 9576047f0daec1d8a6ae27fd8337b90da73eebc5 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 6 Aug 2014 18:02:11 -0700 Subject: [PATCH 1/3] Add ToUnicodeMap class. --- src/core/evaluator.js | 13 ++++++----- src/core/fonts.js | 53 +++++++++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index df1539dcf..4df4db5ff 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -20,8 +20,8 @@ isNum, isStream, isString, JpegStream, Lexer, Metrics, MurmurHash3_64, Name, Parser, Pattern, PDFImage, PDFJS, serifFonts, stdFontMap, symbolsFonts, getTilingPatternIR, warn, Util, Promise, - RefSetCache, isRef, TextRenderingMode, CMapFactory, OPS, - UNSUPPORTED_FEATURES, UnsupportedManager, NormalizedUnicodes, + RefSetCache, isRef, TextRenderingMode, ToUnicodeMap, CMapFactory, + OPS, UNSUPPORTED_FEATURES, UnsupportedManager, NormalizedUnicodes, IDENTITY_MATRIX, reverseIfRtl, createPromiseCapability, getFontType */ @@ -1306,12 +1306,13 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { }, readToUnicode: function PartialEvaluator_readToUnicode(toUnicode) { - var cmapObj = toUnicode; + var cmap, cmapObj = toUnicode; if (isName(cmapObj)) { - return CMapFactory.create(cmapObj, + cmap = CMapFactory.create(cmapObj, { url: PDFJS.cMapUrl, packed: PDFJS.cMapPacked }, null).getMap(); + return new ToUnicodeMap(cmap); } else if (isStream(cmapObj)) { - var cmap = CMapFactory.create(cmapObj, + cmap = CMapFactory.create(cmapObj, { url: PDFJS.cMapUrl, packed: PDFJS.cMapPacked }, null).getMap(); // Convert UTF-16BE // NOTE: cmap can be a sparse array, so use forEach instead of for(;;) @@ -1330,7 +1331,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } cmap[i] = String.fromCharCode.apply(String, str); }); - return cmap; + return new ToUnicodeMap(cmap); } return null; }, diff --git a/src/core/fonts.js b/src/core/fonts.js index 7046d4ffa..e14ea3576 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -2161,6 +2161,36 @@ var Glyph = (function GlyphClosure() { return Glyph; })(); +var ToUnicodeMap = (function ToUnicodeMapClosure() { + function ToUnicodeMap(cmap) { + // The elements of this._map can be integers or strings, depending on how + // |cmap| was created. + this._map = cmap; + } + + ToUnicodeMap.prototype = { + get length() { + return this._map.length; + }, + + forEach: function(callback) { + for (var charCode in this._map) { + callback(charCode, this._map[charCode].charCodeAt(0)); + } + }, + + get: function(i) { + return this._map[i]; + }, + + charCodeOf: function(v) { + return this._map.indexOf(v); + } + }; + + return ToUnicodeMap; +})(); + /** * 'Font' is the class the outside world should use, it encapsulate all the font * decoding logics whatever type it is (assuming the font type is supported). @@ -2259,7 +2289,7 @@ var Font = (function FontClosure() { map[+code] = GlyphMapForStandardFonts[code]; } this.toFontChar = map; - this.toUnicode = map; + this.toUnicode = new ToUnicodeMap(map); } else if (/Symbol/i.test(fontName)) { var symbols = Encodings.SymbolSetEncoding; for (charCode in symbols) { @@ -2278,15 +2308,14 @@ var Font = (function FontClosure() { } } else { var unicodeCharCode, notCidFont = (type.indexOf('CIDFontType') === -1); - for (charCode in this.toUnicode) { - unicodeCharCode = this.toUnicode[charCode].charCodeAt(0); + this.toUnicode.forEach(function(charCode, unicodeCharCode) { if (notCidFont) { glyphName = (properties.differences[charCode] || properties.defaultEncoding[charCode]); unicodeCharCode = (GlyphsUnicode[glyphName] || unicodeCharCode); } this.toFontChar[charCode] = unicodeCharCode; - } + }.bind(this)); } this.loadedName = fontName.split('-')[0]; this.loading = false; @@ -2512,8 +2541,8 @@ var Font = (function FontClosure() { // First try to map the value to a unicode position if a non identity map // was created. if (!isIdentityUnicode) { - if (toUnicode[originalCharCode] !== undefined) { - var unicode = toUnicode[fontCharCode]; + if (toUnicode.get(originalCharCode) !== undefined) { + var unicode = toUnicode.get(fontCharCode); // TODO: Try to map ligatures to the correct spot. if (unicode.length === 1) { fontCharCode = unicode.charCodeAt(0); @@ -3852,7 +3881,7 @@ var Font = (function FontClosure() { var dupFirstEntry = false; if (properties.type === 'CIDFontType2' && properties.toUnicode && - properties.toUnicode[0] > '\u0000') { + properties.toUnicode.get(0) > '\u0000') { // oracle's defect (see 3427), duplicating first entry dupFirstEntry = true; numGlyphs++; @@ -4375,7 +4404,7 @@ var Font = (function FontClosure() { } toUnicode[charcode] = String.fromCharCode(GlyphsUnicode[glyphName]); } - map.toUnicode = toUnicode; + map.toUnicode = new ToUnicodeMap(toUnicode); return map; } // If the font is a composite font that uses one of the predefined CMaps @@ -4419,7 +4448,7 @@ var Font = (function FontClosure() { ucs2.charCodeAt(1)); } }); - map.toUnicode = toUnicode; + map.toUnicode = new ToUnicodeMap(toUnicode); return map; } @@ -4430,7 +4459,7 @@ var Font = (function FontClosure() { toUnicode[i] = String.fromCharCode(i); } map.isIdentity = true; - map.toUnicode = toUnicode; + map.toUnicode = new ToUnicodeMap(toUnicode); return map; }, @@ -4459,7 +4488,7 @@ var Font = (function FontClosure() { } // ... via toUnicode map if (!charcode && 'toUnicode' in this) { - charcode = this.toUnicode.indexOf(glyphUnicode); + charcode = this.toUnicode.charCodeOf(glyphUnicode); } // setting it to unicode if negative or undefined if (charcode <= 0) { @@ -4489,7 +4518,7 @@ var Font = (function FontClosure() { width = isNum(width) ? width : this.defaultWidth; var vmetric = this.vmetrics && this.vmetrics[widthCode]; - var unicode = this.toUnicode[charcode] || charcode; + var unicode = this.toUnicode.get(charcode) || charcode; if (typeof unicode === 'number') { unicode = String.fromCharCode(unicode); } From 6c8cca1284f061028bc5521b307d662ea831d0f0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 7 Aug 2014 18:29:53 -0700 Subject: [PATCH 2/3] Add IdentityToUnicodeMap class. When loading the PDF from issue #4935, this change reduces peak RSS from ~2400 to ~300 MiB, and improves overall speed by ~81%, from 6336 ms to 1222 ms. --- src/core/fonts.js | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/src/core/fonts.js b/src/core/fonts.js index e14ea3576..dc6369165 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -2191,6 +2191,38 @@ var ToUnicodeMap = (function ToUnicodeMapClosure() { return ToUnicodeMap; })(); +var IdentityToUnicodeMap = (function IdentityToUnicodeMapClosure() { + function IdentityToUnicodeMap(firstChar, lastChar) { + this.firstChar = firstChar; + this.lastChar = lastChar; + } + + IdentityToUnicodeMap.prototype = { + get length() { + error('should not access .length'); + }, + + forEach: function(callback) { + for (var i = this.firstChar, ii = this.lastChar; i <= ii; i++) { + callback(i, i); + } + }, + + get: function(i) { + if (this.firstChar <= i && i <= this.lastChar) { + return String.fromCharCode(i); + } + return undefined; + }, + + charCodeOf: function(v) { + error('should not call .charCodeOf'); + } + }; + + return IdentityToUnicodeMap; +})(); + /** * 'Font' is the class the outside world should use, it encapsulate all the font * decoding logics whatever type it is (assuming the font type is supported). @@ -4453,13 +4485,9 @@ var Font = (function FontClosure() { } // The viewer's choice, just use an identity map. - toUnicode = []; - var firstChar = properties.firstChar, lastChar = properties.lastChar; - for (var i = firstChar; i <= lastChar; i++) { - toUnicode[i] = String.fromCharCode(i); - } map.isIdentity = true; - map.toUnicode = new ToUnicodeMap(toUnicode); + map.toUnicode = + new IdentityToUnicodeMap(properties.firstChar, properties.lastChar); return map; }, From f82977caf9b09b970b520c8cb2822a102ac19b38 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 7 Aug 2014 19:49:41 -0700 Subject: [PATCH 3/3] Simplify isIdentityUnicode detection. --- src/core/fonts.js | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/src/core/fonts.js b/src/core/fonts.js index dc6369165..1a9881b46 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -2266,9 +2266,7 @@ var Font = (function FontClosure() { this.descent = properties.descent / PDF_GLYPH_SPACE_UNITS; this.fontMatrix = properties.fontMatrix; - var unicode = this.buildToUnicode(properties); - this.toUnicode = properties.toUnicode = unicode.toUnicode; - this.isIdentityUnicode = properties.isIdentityUnicode = unicode.isIdentity; + this.toUnicode = properties.toUnicode = this.buildToUnicode(properties); this.toFontChar = []; @@ -2560,7 +2558,8 @@ var Font = (function FontClosure() { function adjustMapping(charCodeToGlyphId, properties) { var toUnicode = properties.toUnicode; var isSymbolic = !!(properties.flags & FontFlags.Symbolic); - var isIdentityUnicode = properties.isIdentityUnicode; + var isIdentityUnicode = + properties.toUnicode instanceof IdentityToUnicodeMap; var isCidFontType2 = (properties.type === 'CIDFontType2'); var newMap = Object.create(null); var toFontChar = []; @@ -4359,19 +4358,12 @@ var Font = (function FontClosure() { /** * Builds a char code to unicode map based on section 9.10 of the spec. * @param {Object} properties Font properties object. - * @return {Object} Has two properties: 'toUnicode' which maps char codes to - * unicode (string) values and 'isIdentity' which is true if an identity map - * is used. + * @return {Object} A ToUnicodeMap object. */ buildToUnicode: function Font_buildToUnicode(properties) { - var map = { - isIdentity: false, - toUnicode: null - }; // Section 9.10.2 Mapping Character Codes to Unicode Values if (properties.toUnicode && properties.toUnicode.length !== 0) { - map.toUnicode = properties.toUnicode; - return map; + return properties.toUnicode; } // According to the spec if the font is a simple font we should only map // to unicode if the base encoding is MacRoman, MacExpert, or WinAnsi or @@ -4436,8 +4428,7 @@ var Font = (function FontClosure() { } toUnicode[charcode] = String.fromCharCode(GlyphsUnicode[glyphName]); } - map.toUnicode = new ToUnicodeMap(toUnicode); - return map; + return new ToUnicodeMap(toUnicode); } // If the font is a composite font that uses one of the predefined CMaps // listed in Table 118 (except Identity–H and Identity–V) or whose @@ -4480,15 +4471,12 @@ var Font = (function FontClosure() { ucs2.charCodeAt(1)); } }); - map.toUnicode = new ToUnicodeMap(toUnicode); - return map; + return new ToUnicodeMap(toUnicode); } // The viewer's choice, just use an identity map. - map.isIdentity = true; - map.toUnicode = - new IdentityToUnicodeMap(properties.firstChar, properties.lastChar); - return map; + return new IdentityToUnicodeMap(properties.firstChar, + properties.lastChar); }, get spaceWidth() {