mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
Build a fallback `ToUnicode` map for simple fonts (issue 8229)
In some fonts, the included `ToUnicode` data is incomplete, causing text-selection to not work properly. For simple fonts that contain encoding data, we can manually build a `ToUnicode` map to attempt to improve things. Please note that since we're currently using the `ToUnicode` data during glyph mapping, I purposely didn't want to amend the original `ToUnicode` data for this text-selection edge-case, in an attempt to avoid rendering regressions. Instead, I opted for the current solution, which will (hopefully) give slightly better text-extraction results in PDF files with incomplete `ToUnicode` data. According to the PDF specification, see [section 9.10.2](http://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf#G8.1873172): > A conforming reader can use these methods, in the priority given, to map a character code to a Unicode value. > ... Reading that paragraph literally, it doesn't seem too unreasonable to use *different* methods for different charcodes. Fixes 8229.
This commit is contained in:
parent
ffbfc3c2a7
commit
61e19bee43
5 changed files with 21 additions and 3 deletions
|
@ -2021,6 +2021,14 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
|
||||
// Section 9.10.2 Mapping Character Codes to Unicode Values
|
||||
if (properties.hasIncludedToUnicodeMap) {
|
||||
// Some fonts contain incomplete ToUnicode data, causing issues with
|
||||
// text-extraction. For simple fonts, containing encoding information,
|
||||
// use a fallback ToUnicode map to improve this (fixes issue8229.pdf).
|
||||
if (!properties.composite && properties.hasEncoding) {
|
||||
properties.fallbackToUnicode =
|
||||
this._buildSimpleFontToUnicode(properties);
|
||||
}
|
||||
|
||||
return Promise.resolve(properties.toUnicode);
|
||||
}
|
||||
|
||||
|
|
|
@ -211,9 +211,9 @@ var Glyph = (function GlyphClosure() {
|
|||
})();
|
||||
|
||||
var ToUnicodeMap = (function ToUnicodeMapClosure() {
|
||||
function ToUnicodeMap(cmap) {
|
||||
function ToUnicodeMap(cmap = []) {
|
||||
// The elements of this._map can be integers or strings, depending on how
|
||||
// |cmap| was created.
|
||||
// `cmap` was created.
|
||||
this._map = cmap;
|
||||
}
|
||||
|
||||
|
@ -516,6 +516,7 @@ var Font = (function FontClosure() {
|
|||
this.defaultEncoding = properties.defaultEncoding;
|
||||
|
||||
this.toUnicode = properties.toUnicode;
|
||||
this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap();
|
||||
|
||||
this.toFontChar = [];
|
||||
|
||||
|
@ -2766,7 +2767,8 @@ var Font = (function FontClosure() {
|
|||
width = isNum(width) ? width : this.defaultWidth;
|
||||
var vmetric = this.vmetrics && this.vmetrics[widthCode];
|
||||
|
||||
var unicode = this.toUnicode.get(charcode) || charcode;
|
||||
let unicode = this.toUnicode.get(charcode) ||
|
||||
this.fallbackToUnicode.get(charcode) || charcode;
|
||||
if (typeof unicode === 'number') {
|
||||
unicode = String.fromCharCode(unicode);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue