1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-22 16:18:08 +02:00

Merge pull request #2127 from jviereck/text-algo-3

Use the text extracted in the getTextContent function for the divs of the textLayer.
This commit is contained in:
Julian Viereck 2012-09-25 05:52:46 -07:00
commit e98eba1b11
7 changed files with 233 additions and 96 deletions

View file

@ -138,11 +138,16 @@ var bidi = PDFJS.bidi = (function bidiClosure() {
}
}
function bidi(text, startLevel) {
var str = text.str;
// Container for the value returned by bidi(): the processed string together
// with a flag telling whether it was classified as left-to-right.
//   str   - stored unchanged on the instance as `str`.
//   isLTR - stored on the instance as `ltr`.
function BidiResult(str, isLTR) {
  var result = this;
  result.str = str;
  result.ltr = isLTR;
}
function bidi(str, startLevel) {
var isLTR = true;
var strLength = str.length;
if (strLength == 0)
return str;
return new BidiResult(str, ltr);
// get types, fill arrays
@ -176,16 +181,16 @@ var bidi = PDFJS.bidi = (function bidiClosure() {
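// if less than 30% chars are rtl then string is primarily ltr
// if more than 30% chars are rtl then string is primarily rtl
// NOTE(review): the check below compares strLength / numBidi against 0.3.
// If numBidi counts the rtl chars in the string, that ratio is always >= 1,
// so the ltr branch would be unreachable; presumably numBidi / strLength
// was intended - TODO confirm.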
if (numBidi == 0) {
text.direction = 'ltr';
return str;
isLTR = true;
return new BidiResult(str, isLTR);
}
if (startLevel == -1) {
if ((strLength / numBidi) < 0.3) {
text.direction = 'ltr';
isLTR = true;
startLevel = 0;
} else {
text.direction = 'rtl';
isLTR = false;
startLevel = 1;
}
}
@ -438,7 +443,8 @@ var bidi = PDFJS.bidi = (function bidiClosure() {
if (ch != '<' && ch != '>')
result += ch;
}
return result;
return new BidiResult(result, isLTR);
}
return bidi;

View file

@ -677,9 +677,10 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
var textHScale2 = textHScale * fontMatrix[0];
var glyphsLength = glyphs.length;
var textLayer = this.textLayer;
var text = {str: '', length: 0, canvasWidth: 0, geom: {}};
var geom;
var textSelection = textLayer && !skipTextSelection ? true : false;
var textRenderingMode = current.textRenderingMode;
var canvasWidth = 0.0;
// Type3 fonts - each glyph is a "mini-PDF"
if (font.coded) {
@ -692,7 +693,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
if (textSelection) {
this.save();
ctx.scale(1, -1);
text.geom = this.getTextGeometry();
geom = this.getTextGeometry();
this.restore();
}
for (var i = 0; i < glyphsLength; ++i) {
@ -718,9 +719,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
ctx.translate(width, 0);
current.x += width * textHScale;
text.str += glyph.unicode;
text.length++;
text.canvasWidth += width;
canvasWidth += width;
}
ctx.restore();
} else {
@ -735,7 +734,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
lineWidth /= scale;
if (textSelection)
text.geom = this.getTextGeometry();
geom = this.getTextGeometry();
if (fontSizeScale != 1.0) {
ctx.scale(fontSizeScale, fontSizeScale);
@ -784,17 +783,19 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
var glyphUnicode = glyph.unicode === ' ' ? '\u00A0' : glyph.unicode;
if (glyphUnicode in NormalizedUnicodes)
glyphUnicode = NormalizedUnicodes[glyphUnicode];
text.str += reverseIfRtl(glyphUnicode);
text.canvasWidth += charWidth;
canvasWidth += charWidth;
}
current.x += x * textHScale2;
ctx.restore();
}
if (textSelection)
this.textLayer.appendText(text, font.fallbackName, fontSize);
if (textSelection) {
geom.canvasWidth = canvasWidth;
this.textLayer.appendText(font.fallbackName, fontSize, geom);
}
return text;
return canvasWidth;
},
showSpacedText: function CanvasGraphics_showSpacedText(arr) {
var ctx = this.ctx;
@ -806,7 +807,8 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
textHScale *= (current.fontMatrix || IDENTITY_MATRIX)[0];
var arrLength = arr.length;
var textLayer = this.textLayer;
var text = {str: '', length: 0, canvasWidth: 0, geom: {}};
var geom;
var canvasWidth = 0.0;
var textSelection = textLayer ? true : false;
if (textSelection) {
@ -819,7 +821,7 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
ctx.scale(textHScale, 1);
} else
this.applyTextTransforms();
text.geom = this.getTextGeometry();
geom = this.getTextGeometry();
ctx.restore();
}
@ -829,34 +831,22 @@ var CanvasGraphics = (function CanvasGraphicsClosure() {
var spacingLength = -e * 0.001 * fontSize * textHScale;
current.x += spacingLength;
if (textSelection) {
// Emulate precise spacing via HTML spaces
text.canvasWidth += spacingLength;
if (e < 0 && text.geom.spaceWidth > 0) { // avoid div by zero
var numFakeSpaces = Math.round(-e / text.geom.spaceWidth);
if (numFakeSpaces > 0) {
text.str += '\u00A0';
}
}
}
if (textSelection)
canvasWidth += spacingLength;
} else if (isString(e)) {
var shownText = this.showText(e, true);
var shownCanvasWidth = this.showText(e, true);
if (textSelection) {
if (shownText.str === ' ') {
text.str += '\u00A0';
} else {
text.str += shownText.str;
}
text.canvasWidth += shownText.canvasWidth;
}
if (textSelection)
canvasWidth += shownCanvasWidth;
} else {
error('TJ array element ' + e + ' is not string or num');
}
}
if (textSelection)
this.textLayer.appendText(text, font.fallbackName, fontSize);
if (textSelection) {
geom.canvasWidth = canvasWidth;
this.textLayer.appendText(font.fallbackName, fontSize, geom);
}
},
nextLineShowText: function CanvasGraphics_nextLineShowText(text) {
this.nextLine();

View file

@ -164,6 +164,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
translated = { error: e };
}
font.translated = translated;
var data = translated;
if (data.loadCharProcs) {
delete data.loadCharProcs;
var charProcs = font.get('CharProcs').getAll();
var fontResources = font.get('Resources') || resources;
var charProcOperatorList = {};
for (var key in charProcs) {
var glyphStream = charProcs[key];
charProcOperatorList[key] =
this.getOperatorList(glyphStream, fontResources, dependency);
}
data.charProcOperatorList = charProcOperatorList;
}
}
return font;
},
@ -195,19 +210,6 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
var loadedName = font.loadedName;
if (!font.sent) {
var data = font.translated;
if (data.loadCharProcs) {
delete data.loadCharProcs;
var charProcs = font.get('CharProcs').getAll();
var fontResources = font.get('Resources') || resources;
var charProcOperatorList = {};
for (var key in charProcs) {
var glyphStream = charProcs[key];
charProcOperatorList[key] =
self.getOperatorList(glyphStream, fontResources, dependency);
}
data.charProcOperatorList = charProcOperatorList;
}
if (data instanceof Font)
data = data.exportData();
@ -505,7 +507,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
return queue;
},
getTextContent: function partialEvaluatorGetIRQueue(stream, resources) {
getTextContent: function partialEvaluatorGetIRQueue(
stream, resources, state) {
var bidiTexts;
if (!state) {
bidiTexts = [];
state = {
bidiTexts: bidiTexts
};
} else {
bidiTexts = state.bidiTexts;
}
var self = this;
var xref = this.xref;
@ -515,18 +528,20 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
resources = xref.fetchIfRef(resources) || new Dict();
// The XObject dictionary is fetched only if it's needed, i.e. when a
// `Do` cmd is encountered.
var xobjs = null;
var parser = new Parser(new Lexer(stream), false);
var res = resources;
var args = [], obj;
var text = '';
var chunk = '';
var font = null;
while (!isEOF(obj = parser.getObj())) {
if (isCmd(obj)) {
var cmd = obj.cmd;
switch (cmd) {
// TODO: Add support for SAVE/RESTORE and XFORM here.
case 'Tf':
font = handleSetFont(args[0].name).translated;
break;
@ -535,10 +550,11 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
for (var j = 0, jj = items.length; j < jj; j++) {
if (typeof items[j] === 'string') {
chunk += fontCharsToUnicode(items[j], font);
} else if (items[j] < 0) {
// making all negative offsets a space - better to have
// a space in incorrect place than not have them at all
chunk += ' ';
} else if (items[j] < 0 && font.spaceWidth > 0) {
var numFakeSpaces = Math.round(-items[j] / font.spaceWidth);
if (numFakeSpaces > 0) {
chunk += ' ';
}
}
}
break;
@ -546,14 +562,69 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
chunk += fontCharsToUnicode(args[0], font);
break;
case "'":
chunk += fontCharsToUnicode(args[0], font) + ' ';
// For search, adding an extra whitespace for line breaks would be
// better here, but that causes too many spaces in the
// text-selection divs.
chunk += fontCharsToUnicode(args[0], font);
break;
case '"':
chunk += fontCharsToUnicode(args[2], font) + ' ';
// See the comment in the "'" case above.
chunk += fontCharsToUnicode(args[2], font);
break;
case 'Do':
// Clear the chunk so that the `chunk !== ''` check after the switch
// won't add anything to the state.
chunk = '';
if (args[0].code) {
break;
}
if (!xobjs) {
xobjs = resources.get('XObject') || new Dict();
}
var name = args[0].name;
var xobj = xobjs.get(name);
if (!xobj)
break;
assertWellFormed(isStream(xobj), 'XObject should be a stream');
var type = xobj.dict.get('Subtype');
assertWellFormed(
isName(type),
'XObject should have a Name subtype'
);
if ('Form' !== type.name)
break;
state = this.getTextContent(
xobj,
xobj.dict.get('Resources') || resources,
state
);
break;
case 'gs':
var dictName = args[0];
var extGState = resources.get('ExtGState');
if (!isDict(extGState) || !extGState.has(dictName.name))
break;
var gsState = extGState.get(dictName.name);
for (var i = 0; i < gsState.length; i++) {
if (gsState[i] === 'Font') {
font = handleSetFont(args[0].name).translated;
}
}
break;
} // switch
if (chunk !== '') {
text += chunk;
bidiTexts.push(PDFJS.bidi(chunk, -1));
chunk = '';
}
@ -562,9 +633,9 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
assertWellFormed(args.length <= 33, 'Too many arguments');
args.push(obj);
}
}
} // while
return text;
return state;
},
extractDataStructures: function

View file

@ -3886,6 +3886,10 @@ var Font = (function FontClosure() {
},
get spaceWidth() {
if ('_shadowWidth' in this) {
return this._shadowWidth;
}
// trying to estimate space character width
var possibleSpaceReplacements = ['space', 'minus', 'one', 'i'];
var width;
@ -3913,7 +3917,10 @@ var Font = (function FontClosure() {
break; // the non-zero width found
}
width = (width || this.defaultWidth) * this.widthMultiplier;
return shadow(this, 'spaceWidth', width);
// Do not shadow the property here. See discussion:
// https://github.com/mozilla/pdf.js/pull/2127#discussion_r1662280
this._shadowWidth = width;
return width;
},
charToGlyph: function Font_charToGlyph(charcode) {