From a38c4bc72903073727119c29331bb3ed7abf6ce5 Mon Sep 17 00:00:00 2001 From: Julian Viereck Date: Tue, 11 Sep 2012 15:10:34 -0700 Subject: [PATCH] Make getTextContent return offset array and improve the algorithm. Make parts in viewer.js work again. --- src/evaluator.js | 67 ++++++++++++++++++++++++++++++++++++++++++++---- web/viewer.html | 2 +- web/viewer.js | 2 +- 3 files changed, 64 insertions(+), 7 deletions(-) diff --git a/src/evaluator.js b/src/evaluator.js index f0e775cdb..1ac32f781 100644 --- a/src/evaluator.js +++ b/src/evaluator.js @@ -505,7 +505,13 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { return queue; }, - getTextContent: function partialEvaluatorGetIRQueue(stream, resources) { + getTextContent: function partialEvaluatorGetIRQueue(stream, resources, state) { + if (!state) { + state = { + text: '', + mapping: [] + }; + } var self = this; var xref = this.xref; @@ -515,18 +521,22 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } resources = xref.fetchIfRef(resources) || new Dict(); + // The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd. + var xobjs = null; var parser = new Parser(new Lexer(stream), false); var res = resources; var args = [], obj; - var text = ''; + var text = state.text; var chunk = ''; + var commandOffset = state.mapping; var font = null; while (!isEOF(obj = parser.getObj())) { if (isCmd(obj)) { var cmd = obj.cmd; switch (cmd) { + // TODO: Add support for SAVE/RESTORE and XFORM here. case 'Tf': font = handleSetFont(args[0].name).translated; break; @@ -536,9 +546,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { if (typeof items[j] === 'string') { chunk += fontCharsToUnicode(items[j], font); } else if (items[j] < 0) { - // making all negative offsets a space - better to have - // a space in incorrect place than not have them at all chunk += ' '; + } else if (items[j] < 0 && font.spacedWidth > 0) { + var numFakeSpaces = Math.round(-e / font.spacedWidth); + if (numFakeSpaces > 0) { + chunk += ' '; + } } } break; @@ -551,8 +564,49 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { case '"': chunk += fontCharsToUnicode(args[2], font) + ' '; break; + case 'Do': + // Set the chunk such that the following if won't add something + // to the state. + chunk = ''; + + if (args[0].code) { + break; + } + + if (!xobjs) { + xobjs = resources.get('XObject') || new Dict(); + } + + var name = args[0].name; + var xobj = xobjs.get(name); + if (!xobj) + break; + assertWellFormed(isStream(xobj), 'XObject should be a stream'); + + var type = xobj.dict.get('Subtype'); + assertWellFormed( + isName(type), + 'XObject should have a Name subtype' + ); + + if ('Form' !== type.name) + break; + + // Add some spacing between the text here and the text of the + // xForm. + text = text + ' '; + + state.text = text; + state = this.getTextContent( + xobj, + xobj.dict.get('Resources') || resources, + state + ); + text = state.text; + break; } // switch if (chunk !== '') { + commandOffset.push(text.length); text += chunk; chunk = ''; } @@ -564,7 +618,10 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { } } - return text; + return { + text: text, + mapping: commandOffset + }; }, extractDataStructures: function diff --git a/web/viewer.html b/web/viewer.html index 5a2f4f28c..813484c6c 100644 --- a/web/viewer.html +++ b/web/viewer.html @@ -88,7 +88,7 @@ limitations under the License. - diff --git a/web/viewer.js b/web/viewer.js index 5f8ded0ec..be512736f 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -1043,7 +1043,7 @@ var PDFView = { function extractPageText(pageIndex) { self.pages[pageIndex].pdfPage.getTextContent().then( function textContentResolved(textContent) { - self.pageText[pageIndex] = textContent; + self.pageText[pageIndex] = textContent.text; self.search(); if ((pageIndex + 1) < self.pages.length) extractPageText(pageIndex + 1);