From f297e4d17c5c0ac3dc26b585afc1918ff87ee8de Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 3 Jul 2016 18:29:47 +0200 Subject: [PATCH] [api-minor] Add a parameter to `PDFPageProxy_getTextContent` that controls whether `PartialEvaluator_getTextContent` will attempt to combine same line text items From the discussion in issue 7445, it seems that there may be cases where an API consumer would want to get the text content as is, without combined text items. --- src/core/document.js | 6 ++++-- src/core/evaluator.js | 12 ++++++++---- src/core/worker.js | 4 +++- src/display/api.js | 9 ++++++--- test/driver.js | 14 +++++++------- test/unit/api_spec.js | 8 +++++--- web/pdf_page_view.js | 12 ++++++------ web/pdf_viewer.js | 4 +++- 8 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index 6b9a7bced..a23c9fd89 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -265,7 +265,8 @@ var Page = (function PageClosure() { }, extractTextContent: function Page_extractTextContent(task, - normalizeWhitespace) { + normalizeWhitespace, + combineTextItems) { var handler = { on: function nullHandlerOn() {}, send: function nullHandlerSend() {} @@ -298,7 +299,8 @@ var Page = (function PageClosure() { task, self.resources, /* stateManager = */ null, - normalizeWhitespace); + normalizeWhitespace, + combineTextItems); }); }, diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 47f5816b1..bce4ce6a5 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -1110,7 +1110,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { getTextContent: function PartialEvaluator_getTextContent(stream, task, resources, stateManager, - normalizeWhitespace) { + normalizeWhitespace, + combineTextItems) { stateManager = (stateManager || new StateManager(new TextState())); @@ -1421,7 +1422,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { var isSameTextLine = !textState.font ? false : ((textState.font.vertical ? args[0] : args[1]) === 0); advance = args[0] - args[1]; - if (isSameTextLine && textContentItem.initialized && + if (combineTextItems && + isSameTextLine && textContentItem.initialized && advance > 0 && advance <= textContentItem.fakeMultiSpaceMax) { textState.translateTextLineMatrix(args[0], args[1]); @@ -1453,7 +1455,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { // Optimization to treat same line movement as advance. advance = textState.calcTextLineMatrixAdvance( args[0], args[1], args[2], args[3], args[4], args[5]); - if (advance !== null && textContentItem.initialized && + if (combineTextItems && + advance !== null && textContentItem.initialized && advance.value > 0 && advance.value <= textContentItem.fakeMultiSpaceMax) { textState.translateTextLineMatrix(advance.width, @@ -1594,7 +1597,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { next(self.getTextContent(xobj, task, xobj.dict.get('Resources') || resources, stateManager, - normalizeWhitespace).then(function (formTextContent) { + normalizeWhitespace, combineTextItems).then( + function (formTextContent) { Util.appendToArray(textContent.items, formTextContent.items); Util.extendObj(textContent.styles, formTextContent.styles); stateManager.restore(); diff --git a/src/core/worker.js b/src/core/worker.js index 76556518c..40e5a5a0f 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -891,12 +891,14 @@ var WorkerMessageHandler = { handler.on('GetTextContent', function wphExtractText(data) { var pageIndex = data.pageIndex; var normalizeWhitespace = data.normalizeWhitespace; + var combineTextItems = data.combineTextItems; return pdfManager.getPage(pageIndex).then(function(page) { var task = new WorkerTask('GetTextContent: page ' + pageIndex); startWorkerTask(task); var pageNum = pageIndex + 1; var start = Date.now(); - return page.extractTextContent(task, normalizeWhitespace).then( + return page.extractTextContent(task, normalizeWhitespace, + combineTextItems).then( function(textContent) { finishWorkerTask(task); info('text indexing: page=' + pageNum + ' - time=' + diff --git a/src/display/api.js b/src/display/api.js index cfdfb1a9f..06fcc18be 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -600,6 +600,8 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() { * @typedef {Object} getTextContentParameters * @param {boolean} normalizeWhitespace - replaces all occurrences of * whitespace with standard spaces (0x20). The default value is `false`. + * @param {boolean} disableCombineTextItems - do not attempt to combine + * same line {@link TextItem}'s. The default value is `false`. */ /** @@ -891,11 +893,12 @@ var PDFPageProxy = (function PDFPageProxyClosure() { * object that represent the page text content. */ getTextContent: function PDFPageProxy_getTextContent(params) { - var normalizeWhitespace = (params && params.normalizeWhitespace) || false; - return this.transport.messageHandler.sendWithPromise('GetTextContent', { pageIndex: this.pageNumber - 1, - normalizeWhitespace: normalizeWhitespace, + normalizeWhitespace: (params && params.normalizeWhitespace === true ? + true : /* Default */ false), + combineTextItems: (params && params.disableCombineTextItems === true ? + false : /* Default */ true), }); }, diff --git a/test/driver.js b/test/driver.js index f5508241d..3fc21e513 100644 --- a/test/driver.js +++ b/test/driver.js @@ -332,7 +332,7 @@ var Driver = (function DriverClosure() { this._log('Loading file "' + task.file + '"\n'); - var absoluteUrl = new URL(task.file, window.location).href; + var absoluteUrl = new URL(task.file, window.location).href; PDFJS.disableRange = task.disableRange; PDFJS.disableAutoFetch = !task.enableAutoFetch; try { @@ -469,12 +469,12 @@ var Driver = (function DriverClosure() { textLayerContext.clearRect(0, 0, textLayerCanvas.width, textLayerCanvas.height); // The text builder will draw its content on the test canvas - initPromise = - page.getTextContent({ normalizeWhitespace: true }).then( - function(textContent) { - return rasterizeTextLayer(textLayerContext, viewport, - textContent); - }); + initPromise = page.getTextContent({ + normalizeWhitespace: true, + }).then(function(textContent) { + return rasterizeTextLayer(textLayerContext, viewport, + textContent); + }); } else { textLayerCanvas = null; diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 6205d3c14..531af34de 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -771,12 +771,14 @@ describe('api', function() { }); it('gets text content', function (done) { var defaultPromise = page.getTextContent(); - var normalizeWhitespacePromise = page.getTextContent({ - normalizeWhitespace: true }); + var parametersPromise = page.getTextContent({ + normalizeWhitespace: true, + disableCombineTextItems: true, + }); var promises = [ defaultPromise, - normalizeWhitespacePromise + parametersPromise, ]; Promise.all(promises).then(function (data) { expect(!!data[0].items).toEqual(true); diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index cef2a4193..467b8c029 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -503,12 +503,12 @@ var PDFPageView = (function PDFPageViewClosure() { function pdfPageRenderCallback() { pageViewDrawCallback(null); if (textLayer) { - self.pdfPage.getTextContent({ normalizeWhitespace: true }).then( - function textContentResolved(textContent) { - textLayer.setTextContent(textContent); - textLayer.render(TEXT_LAYER_RENDER_DELAY); - } - ); + self.pdfPage.getTextContent({ + normalizeWhitespace: true, + }).then(function textContentResolved(textContent) { + textLayer.setTextContent(textContent); + textLayer.render(TEXT_LAYER_RENDER_DELAY); + }); } }, function pdfPageRenderError(error) { diff --git a/web/pdf_viewer.js b/web/pdf_viewer.js index 0207c6f78..29197e7bd 100644 --- a/web/pdf_viewer.js +++ b/web/pdf_viewer.js @@ -784,7 +784,9 @@ var PDFViewer = (function pdfViewer() { getPageTextContent: function (pageIndex) { return this.pdfDocument.getPage(pageIndex + 1).then(function (page) { - return page.getTextContent({ normalizeWhitespace: true }); + return page.getTextContent({ + normalizeWhitespace: true, + }); }); },