mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
Merge pull request #2127 from jviereck/text-algo-3
Use the text extracted in the getTextContent function for the divs of the textLayer.
This commit is contained in:
commit
e98eba1b11
7 changed files with 233 additions and 96 deletions
22
src/bidi.js
22
src/bidi.js
|
@ -138,11 +138,16 @@ var bidi = PDFJS.bidi = (function bidiClosure() {
|
|||
}
|
||||
}
|
||||
|
||||
function bidi(text, startLevel) {
|
||||
var str = text.str;
|
||||
function BidiResult(str, isLTR) {
|
||||
this.str = str;
|
||||
this.ltr = isLTR;
|
||||
}
|
||||
|
||||
function bidi(str, startLevel) {
|
||||
var isLTR = true;
|
||||
var strLength = str.length;
|
||||
if (strLength == 0)
|
||||
return str;
|
||||
return new BidiResult(str, ltr);
|
||||
|
||||
// get types, fill arrays
|
||||
|
||||
|
@ -176,16 +181,16 @@ var bidi = PDFJS.bidi = (function bidiClosure() {
|
|||
// if less than 30% chars are rtl then string is primarily ltr
|
||||
// if more than 30% chars are rtl then string is primarily rtl
|
||||
if (numBidi == 0) {
|
||||
text.direction = 'ltr';
|
||||
return str;
|
||||
isLTR = true;
|
||||
return new BidiResult(str, isLTR);
|
||||
}
|
||||
|
||||
if (startLevel == -1) {
|
||||
if ((strLength / numBidi) < 0.3) {
|
||||
text.direction = 'ltr';
|
||||
isLTR = true;
|
||||
startLevel = 0;
|
||||
} else {
|
||||
text.direction = 'rtl';
|
||||
isLTR = false;
|
||||
startLevel = 1;
|
||||
}
|
||||
}
|
||||
|
@ -438,7 +443,8 @@ var bidi = PDFJS.bidi = (function bidiClosure() {
|
|||
if (ch != '<' && ch != '>')
|
||||
result += ch;
|
||||
}
|
||||
return result;
|
||||
|
||||
return new BidiResult(result, isLTR);
|
||||
}
|
||||
|
||||
return bidi;
|
||||
|
|
|
@ -677,9 +677,10 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
var textHScale2 = textHScale * fontMatrix[0];
|
||||
var glyphsLength = glyphs.length;
|
||||
var textLayer = this.textLayer;
|
||||
var text = {str: '', length: 0, canvasWidth: 0, geom: {}};
|
||||
var geom;
|
||||
var textSelection = textLayer && !skipTextSelection ? true : false;
|
||||
var textRenderingMode = current.textRenderingMode;
|
||||
var canvasWidth = 0.0;
|
||||
|
||||
// Type3 fonts - each glyph is a "mini-PDF"
|
||||
if (font.coded) {
|
||||
|
@ -692,7 +693,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
if (textSelection) {
|
||||
this.save();
|
||||
ctx.scale(1, -1);
|
||||
text.geom = this.getTextGeometry();
|
||||
geom = this.getTextGeometry();
|
||||
this.restore();
|
||||
}
|
||||
for (var i = 0; i < glyphsLength; ++i) {
|
||||
|
@ -718,9 +719,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
ctx.translate(width, 0);
|
||||
current.x += width * textHScale;
|
||||
|
||||
text.str += glyph.unicode;
|
||||
text.length++;
|
||||
text.canvasWidth += width;
|
||||
canvasWidth += width;
|
||||
}
|
||||
ctx.restore();
|
||||
} else {
|
||||
|
@ -735,7 +734,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
lineWidth /= scale;
|
||||
|
||||
if (textSelection)
|
||||
text.geom = this.getTextGeometry();
|
||||
geom = this.getTextGeometry();
|
||||
|
||||
if (fontSizeScale != 1.0) {
|
||||
ctx.scale(fontSizeScale, fontSizeScale);
|
||||
|
@ -784,17 +783,19 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
var glyphUnicode = glyph.unicode === ' ' ? '\u00A0' : glyph.unicode;
|
||||
if (glyphUnicode in NormalizedUnicodes)
|
||||
glyphUnicode = NormalizedUnicodes[glyphUnicode];
|
||||
text.str += reverseIfRtl(glyphUnicode);
|
||||
text.canvasWidth += charWidth;
|
||||
|
||||
canvasWidth += charWidth;
|
||||
}
|
||||
current.x += x * textHScale2;
|
||||
ctx.restore();
|
||||
}
|
||||
|
||||
if (textSelection)
|
||||
this.textLayer.appendText(text, font.fallbackName, fontSize);
|
||||
if (textSelection) {
|
||||
geom.canvasWidth = canvasWidth;
|
||||
this.textLayer.appendText(font.fallbackName, fontSize, geom);
|
||||
}
|
||||
|
||||
return text;
|
||||
return canvasWidth;
|
||||
},
|
||||
showSpacedText: function CanvasGraphics_showSpacedText(arr) {
|
||||
var ctx = this.ctx;
|
||||
|
@ -806,7 +807,8 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
textHScale *= (current.fontMatrix || IDENTITY_MATRIX)[0];
|
||||
var arrLength = arr.length;
|
||||
var textLayer = this.textLayer;
|
||||
var text = {str: '', length: 0, canvasWidth: 0, geom: {}};
|
||||
var geom;
|
||||
var canvasWidth = 0.0;
|
||||
var textSelection = textLayer ? true : false;
|
||||
|
||||
if (textSelection) {
|
||||
|
@ -819,7 +821,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
ctx.scale(textHScale, 1);
|
||||
} else
|
||||
this.applyTextTransforms();
|
||||
text.geom = this.getTextGeometry();
|
||||
geom = this.getTextGeometry();
|
||||
ctx.restore();
|
||||
}
|
||||
|
||||
|
@ -829,34 +831,22 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
|
|||
var spacingLength = -e * 0.001 * fontSize * textHScale;
|
||||
current.x += spacingLength;
|
||||
|
||||
if (textSelection) {
|
||||
// Emulate precise spacing via HTML spaces
|
||||
text.canvasWidth += spacingLength;
|
||||
if (e < 0 && text.geom.spaceWidth > 0) { // avoid div by zero
|
||||
var numFakeSpaces = Math.round(-e / text.geom.spaceWidth);
|
||||
if (numFakeSpaces > 0) {
|
||||
text.str += '\u00A0';
|
||||
}
|
||||
}
|
||||
}
|
||||
if (textSelection)
|
||||
canvasWidth += spacingLength;
|
||||
} else if (isString(e)) {
|
||||
var shownText = this.showText(e, true);
|
||||
var shownCanvasWidth = this.showText(e, true);
|
||||
|
||||
if (textSelection) {
|
||||
if (shownText.str === ' ') {
|
||||
text.str += '\u00A0';
|
||||
} else {
|
||||
text.str += shownText.str;
|
||||
}
|
||||
text.canvasWidth += shownText.canvasWidth;
|
||||
}
|
||||
if (textSelection)
|
||||
canvasWidth += shownCanvasWidth;
|
||||
} else {
|
||||
error('TJ array element ' + e + ' is not string or num');
|
||||
}
|
||||
}
|
||||
|
||||
if (textSelection)
|
||||
this.textLayer.appendText(text, font.fallbackName, fontSize);
|
||||
if (textSelection) {
|
||||
geom.canvasWidth = canvasWidth;
|
||||
this.textLayer.appendText(font.fallbackName, fontSize, geom);
|
||||
}
|
||||
},
|
||||
nextLineShowText: function CanvasGraphics_nextLineShowText(text) {
|
||||
this.nextLine();
|
||||
|
|
119
src/evaluator.js
119
src/evaluator.js
|
@ -164,6 +164,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
translated = { error: e };
|
||||
}
|
||||
font.translated = translated;
|
||||
|
||||
var data = translated;
|
||||
if (data.loadCharProcs) {
|
||||
delete data.loadCharProcs;
|
||||
|
||||
var charProcs = font.get('CharProcs').getAll();
|
||||
var fontResources = font.get('Resources') || resources;
|
||||
var charProcOperatorList = {};
|
||||
for (var key in charProcs) {
|
||||
var glyphStream = charProcs[key];
|
||||
charProcOperatorList[key] =
|
||||
this.getOperatorList(glyphStream, fontResources, dependency);
|
||||
}
|
||||
data.charProcOperatorList = charProcOperatorList;
|
||||
}
|
||||
}
|
||||
return font;
|
||||
},
|
||||
|
@ -195,19 +210,6 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
var loadedName = font.loadedName;
|
||||
if (!font.sent) {
|
||||
var data = font.translated;
|
||||
if (data.loadCharProcs) {
|
||||
delete data.loadCharProcs;
|
||||
|
||||
var charProcs = font.get('CharProcs').getAll();
|
||||
var fontResources = font.get('Resources') || resources;
|
||||
var charProcOperatorList = {};
|
||||
for (var key in charProcs) {
|
||||
var glyphStream = charProcs[key];
|
||||
charProcOperatorList[key] =
|
||||
self.getOperatorList(glyphStream, fontResources, dependency);
|
||||
}
|
||||
data.charProcOperatorList = charProcOperatorList;
|
||||
}
|
||||
|
||||
if (data instanceof Font)
|
||||
data = data.exportData();
|
||||
|
@ -505,7 +507,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
return queue;
|
||||
},
|
||||
|
||||
getTextContent: function partialEvaluatorGetIRQueue(stream, resources) {
|
||||
getTextContent: function partialEvaluatorGetIRQueue(
|
||||
stream, resources, state) {
|
||||
var bidiTexts;
|
||||
|
||||
if (!state) {
|
||||
bidiTexts = [];
|
||||
state = {
|
||||
bidiTexts: bidiTexts
|
||||
};
|
||||
} else {
|
||||
bidiTexts = state.bidiTexts;
|
||||
}
|
||||
|
||||
var self = this;
|
||||
var xref = this.xref;
|
||||
|
@ -515,18 +528,20 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
}
|
||||
|
||||
resources = xref.fetchIfRef(resources) || new Dict();
|
||||
// The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd.
|
||||
var xobjs = null;
|
||||
|
||||
var parser = new Parser(new Lexer(stream), false);
|
||||
var res = resources;
|
||||
var args = [], obj;
|
||||
|
||||
var text = '';
|
||||
var chunk = '';
|
||||
var font = null;
|
||||
while (!isEOF(obj = parser.getObj())) {
|
||||
if (isCmd(obj)) {
|
||||
var cmd = obj.cmd;
|
||||
switch (cmd) {
|
||||
// TODO: Add support for SAVE/RESTORE and XFORM here.
|
||||
case 'Tf':
|
||||
font = handleSetFont(args[0].name).translated;
|
||||
break;
|
||||
|
@ -535,10 +550,11 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
for (var j = 0, jj = items.length; j < jj; j++) {
|
||||
if (typeof items[j] === 'string') {
|
||||
chunk += fontCharsToUnicode(items[j], font);
|
||||
} else if (items[j] < 0) {
|
||||
// making all negative offsets a space - better to have
|
||||
// a space in incorrect place than not have them at all
|
||||
chunk += ' ';
|
||||
} else if (items[j] < 0 && font.spaceWidth > 0) {
|
||||
var numFakeSpaces = Math.round(-items[j] / font.spaceWidth);
|
||||
if (numFakeSpaces > 0) {
|
||||
chunk += ' ';
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -546,14 +562,69 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
chunk += fontCharsToUnicode(args[0], font);
|
||||
break;
|
||||
case "'":
|
||||
chunk += fontCharsToUnicode(args[0], font) + ' ';
|
||||
// For search, adding an extra white space for line breaks would be
|
||||
// better here, but that causes too many spaces in the
|
||||
// text-selection divs.
|
||||
chunk += fontCharsToUnicode(args[0], font);
|
||||
break;
|
||||
case '"':
|
||||
chunk += fontCharsToUnicode(args[2], font) + ' ';
|
||||
// Note comment in "'"
|
||||
chunk += fontCharsToUnicode(args[2], font);
|
||||
break;
|
||||
case 'Do':
|
||||
// Set the chunk such that the following if won't add something
|
||||
// to the state.
|
||||
chunk = '';
|
||||
|
||||
if (args[0].code) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (!xobjs) {
|
||||
xobjs = resources.get('XObject') || new Dict();
|
||||
}
|
||||
|
||||
var name = args[0].name;
|
||||
var xobj = xobjs.get(name);
|
||||
if (!xobj)
|
||||
break;
|
||||
assertWellFormed(isStream(xobj), 'XObject should be a stream');
|
||||
|
||||
var type = xobj.dict.get('Subtype');
|
||||
assertWellFormed(
|
||||
isName(type),
|
||||
'XObject should have a Name subtype'
|
||||
);
|
||||
|
||||
if ('Form' !== type.name)
|
||||
break;
|
||||
|
||||
state = this.getTextContent(
|
||||
xobj,
|
||||
xobj.dict.get('Resources') || resources,
|
||||
state
|
||||
);
|
||||
break;
|
||||
case 'gs':
|
||||
var dictName = args[0];
|
||||
var extGState = resources.get('ExtGState');
|
||||
|
||||
if (!isDict(extGState) || !extGState.has(dictName.name))
|
||||
break;
|
||||
|
||||
var gsState = extGState.get(dictName.name);
|
||||
|
||||
for (var i = 0; i < gsState.length; i++) {
|
||||
if (gsState[i] === 'Font') {
|
||||
font = handleSetFont(args[0].name).translated;
|
||||
}
|
||||
}
|
||||
break;
|
||||
} // switch
|
||||
|
||||
if (chunk !== '') {
|
||||
text += chunk;
|
||||
bidiTexts.push(PDFJS.bidi(chunk, -1));
|
||||
|
||||
chunk = '';
|
||||
}
|
||||
|
||||
|
@ -562,9 +633,9 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
assertWellFormed(args.length <= 33, 'Too many arguments');
|
||||
args.push(obj);
|
||||
}
|
||||
}
|
||||
} // while
|
||||
|
||||
return text;
|
||||
return state;
|
||||
},
|
||||
|
||||
extractDataStructures: function
|
||||
|
|
|
@ -3886,6 +3886,10 @@ var Font = (function FontClosure() {
|
|||
},
|
||||
|
||||
get spaceWidth() {
|
||||
if ('_shadowWidth' in this) {
|
||||
return this._shadowWidth;
|
||||
}
|
||||
|
||||
// trying to estimate space character width
|
||||
var possibleSpaceReplacements = ['space', 'minus', 'one', 'i'];
|
||||
var width;
|
||||
|
@ -3913,7 +3917,10 @@ var Font = (function FontClosure() {
|
|||
break; // the non-zero width found
|
||||
}
|
||||
width = (width || this.defaultWidth) * this.widthMultiplier;
|
||||
return shadow(this, 'spaceWidth', width);
|
||||
// Do not shadow the property here. See discussion:
|
||||
// https://github.com/mozilla/pdf.js/pull/2127#discussion_r1662280
|
||||
this._shadowWidth = width;
|
||||
return width;
|
||||
},
|
||||
|
||||
charToGlyph: function Font_charToGlyph(charcode) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue