1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-26 01:58:06 +02:00

Merge pull request #2127 from jviereck/text-algo-3

Use the text extracted in the getTextContent function for the divs of the textLayer.
This commit is contained in:
Julian Viereck 2012-09-25 05:52:46 -07:00
commit e98eba1b11
7 changed files with 233 additions and 96 deletions

View file

@ -164,6 +164,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
translated = { error: e };
}
font.translated = translated;
var data = translated;
if (data.loadCharProcs) {
delete data.loadCharProcs;
var charProcs = font.get('CharProcs').getAll();
var fontResources = font.get('Resources') || resources;
var charProcOperatorList = {};
for (var key in charProcs) {
var glyphStream = charProcs[key];
charProcOperatorList[key] =
this.getOperatorList(glyphStream, fontResources, dependency);
}
data.charProcOperatorList = charProcOperatorList;
}
}
return font;
},
@ -195,19 +210,6 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
var loadedName = font.loadedName;
if (!font.sent) {
var data = font.translated;
if (data.loadCharProcs) {
delete data.loadCharProcs;
var charProcs = font.get('CharProcs').getAll();
var fontResources = font.get('Resources') || resources;
var charProcOperatorList = {};
for (var key in charProcs) {
var glyphStream = charProcs[key];
charProcOperatorList[key] =
self.getOperatorList(glyphStream, fontResources, dependency);
}
data.charProcOperatorList = charProcOperatorList;
}
if (data instanceof Font)
data = data.exportData();
@ -505,7 +507,18 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
return queue;
},
getTextContent: function partialEvaluatorGetIRQueue(stream, resources) {
getTextContent: function partialEvaluatorGetIRQueue(
stream, resources, state) {
var bidiTexts;
if (!state) {
bidiTexts = [];
state = {
bidiTexts: bidiTexts
};
} else {
bidiTexts = state.bidiTexts;
}
var self = this;
var xref = this.xref;
@ -515,18 +528,20 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
}
resources = xref.fetchIfRef(resources) || new Dict();
// The xobjs are fetched only if needed, e.g. if there is a `Do` cmd.
var xobjs = null;
var parser = new Parser(new Lexer(stream), false);
var res = resources;
var args = [], obj;
var text = '';
var chunk = '';
var font = null;
while (!isEOF(obj = parser.getObj())) {
if (isCmd(obj)) {
var cmd = obj.cmd;
switch (cmd) {
// TODO: Add support for SAVE/RESTORE and XFORM here.
case 'Tf':
font = handleSetFont(args[0].name).translated;
break;
@ -535,10 +550,11 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
for (var j = 0, jj = items.length; j < jj; j++) {
if (typeof items[j] === 'string') {
chunk += fontCharsToUnicode(items[j], font);
} else if (items[j] < 0) {
// making all negative offsets a space - better to have
// a space in incorrect place than not have them at all
chunk += ' ';
} else if (items[j] < 0 && font.spaceWidth > 0) {
var numFakeSpaces = Math.round(-items[j] / font.spaceWidth);
if (numFakeSpaces > 0) {
chunk += ' ';
}
}
}
break;
@ -546,14 +562,69 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
chunk += fontCharsToUnicode(args[0], font);
break;
case "'":
chunk += fontCharsToUnicode(args[0], font) + ' ';
// For search, adding an extra white space for line breaks would be
// better here, but that causes too many spaces in the
// text-selection divs.
chunk += fontCharsToUnicode(args[0], font);
break;
case '"':
chunk += fontCharsToUnicode(args[2], font) + ' ';
// See the comment in the "'" case above.
chunk += fontCharsToUnicode(args[2], font);
break;
case 'Do':
// Reset the chunk so that the `if` following the switch won't add
// anything to the state.
chunk = '';
if (args[0].code) {
break;
}
if (!xobjs) {
xobjs = resources.get('XObject') || new Dict();
}
var name = args[0].name;
var xobj = xobjs.get(name);
if (!xobj)
break;
assertWellFormed(isStream(xobj), 'XObject should be a stream');
var type = xobj.dict.get('Subtype');
assertWellFormed(
isName(type),
'XObject should have a Name subtype'
);
if ('Form' !== type.name)
break;
state = this.getTextContent(
xobj,
xobj.dict.get('Resources') || resources,
state
);
break;
case 'gs':
var dictName = args[0];
var extGState = resources.get('ExtGState');
if (!isDict(extGState) || !extGState.has(dictName.name))
break;
var gsState = extGState.get(dictName.name);
for (var i = 0; i < gsState.length; i++) {
if (gsState[i] === 'Font') {
font = handleSetFont(args[0].name).translated;
}
}
break;
} // switch
if (chunk !== '') {
text += chunk;
bidiTexts.push(PDFJS.bidi(chunk, -1));
chunk = '';
}
@ -562,9 +633,9 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
assertWellFormed(args.length <= 33, 'Too many arguments');
args.push(obj);
}
}
} // while
return text;
return state;
},
extractDataStructures: function