1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-26 10:08:06 +02:00

Cache the normalized unicode-value on the Glyph-instance

Currently, during text-extraction, we're repeatedly normalizing and (when necessary) reversing the unicode-values every time. This seems a little unnecessary, since the result won't change, hence this patch moves that into the `Glyph`-instance and makes it *lazily* initialized.

Taking the `tracemonkey.pdf` document as an example: When extracting the text-content there's a total of 69236 characters but only 595 unique `Glyph`-instances, which mean a 99.1 percent cache hit-rate. Generally speaking, the longer a PDF document is the more beneficial this should be.

*Please note:* The old code is fast enough that it unfortunately seems difficult to measure a (clear) performance improvement with this patch, so I completely understand if it's deemed an unnecessary change.
This commit is contained in:
Jonas Jenwald 2022-11-03 10:20:18 +01:00
parent eda51d1dcc
commit c33b8d7692
3 changed files with 24 additions and 11 deletions

View file

@ -51,11 +51,6 @@ import {
getStdFontMap,
getSymbolsFonts,
} from "./standard_fonts.js";
import {
getNormalizedUnicodes,
getUnicodeForGlyph,
reverseIfRtl,
} from "./unicode.js";
import { getTilingPatternIR, Pattern } from "./pattern.js";
import { getXfaFontDict, getXfaFontName } from "./xfa_fonts.js";
import { IdentityToUnicodeMap, ToUnicodeMap } from "./to_unicode_map.js";
@ -75,6 +70,7 @@ import { DecodeStream } from "./decode_stream.js";
import { getGlyphsUnicode } from "./glyphlist.js";
import { getLookupTableFactory } from "./core_utils.js";
import { getMetrics } from "./metrics.js";
import { getUnicodeForGlyph } from "./unicode.js";
import { MurmurHash3_64 } from "../shared/murmurhash3.js";
import { OperatorList } from "./operator_list.js";
import { PDFImage } from "./image.js";
@ -2293,7 +2289,6 @@ class PartialEvaluator {
if (includeMarkedContent) {
markedContentData = markedContentData || { level: 0 };
}
const NormalizedUnicodes = getNormalizedUnicodes();
const textContent = {
items: [],
@ -2839,9 +2834,7 @@ class PartialEvaluator {
textChunk.prevTransform = getCurrentTextTransform();
}
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
const glyphUnicode = glyph.normalizedUnicode;
if (saveLastChar(glyphUnicode)) {
// The two last chars are a non-whitespace followed by a whitespace
// and then this non-whitespace, so we insert a whitespace here.