mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-29 15:47:57 +02:00
Cache the normalized unicode-value on the `Glyph`-instance
Currently, during text-extraction, we're repeatedly normalizing and (when necessary) reversing the unicode-values. This seems a little unnecessary, since the result won't change, hence this patch moves that into the `Glyph`-instance and makes it *lazily* initialized. Taking the `tracemonkey.pdf` document as an example: when extracting the text-content there's a total of 69236 characters but only 595 unique `Glyph`-instances, which means a 99.1 percent cache hit-rate. Generally speaking, the longer a PDF document is the more beneficial this should be. *Please note:* The old code is fast enough that it unfortunately seems difficult to measure a (clear) performance improvement with this patch, so I completely understand if it's deemed an unnecessary change.
This commit is contained in:
parent
eda51d1dcc
commit
c33b8d7692
3 changed files with 24 additions and 11 deletions
|
@ -35,9 +35,11 @@ import {
|
|||
} from "./fonts_utils.js";
|
||||
import {
|
||||
getCharUnicodeCategory,
|
||||
getNormalizedUnicodes,
|
||||
getUnicodeForGlyph,
|
||||
getUnicodeRangeFor,
|
||||
mapSpecialUnicodeValues,
|
||||
reverseIfRtl,
|
||||
} from "./unicode.js";
|
||||
import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
|
||||
import {
|
||||
|
@ -218,6 +220,24 @@ class Glyph {
|
|||
this.isZeroWidthDiacritic = category.isZeroWidthDiacritic;
|
||||
this.isInvisibleFormatMark = category.isInvisibleFormatMark;
|
||||
}
|
||||
|
||||
/**
|
||||
* This property, which is only used by `PartialEvaluator.getTextContent`,
|
||||
* is purposely made non-serializable.
|
||||
* @type {string}
|
||||
*/
|
||||
get normalizedUnicode() {
|
||||
return shadow(
|
||||
this,
|
||||
"normalizedUnicode",
|
||||
reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode),
|
||||
/* nonSerializable = */ true
|
||||
);
|
||||
}
|
||||
|
||||
static get _NormalizedUnicodes() {
|
||||
return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes());
|
||||
}
|
||||
}
|
||||
|
||||
function int16(b0, b1) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue