1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-22 16:18:08 +02:00

[api-minor] Add a parameter to PDFPageProxy_getTextContent that controls whether PartialEvaluator_getTextContent will attempt to combine same line text items

Based on the discussion in issue 7445, there may be cases where an API consumer wants to get the text content as-is, i.e. without same-line text items being combined.
This commit is contained in:
Jonas Jenwald 2016-07-03 18:29:47 +02:00
parent 9228a04061
commit f297e4d17c
8 changed files with 42 additions and 27 deletions

View file

@ -265,7 +265,8 @@ var Page = (function PageClosure() {
},
extractTextContent: function Page_extractTextContent(task,
normalizeWhitespace) {
normalizeWhitespace,
combineTextItems) {
var handler = {
on: function nullHandlerOn() {},
send: function nullHandlerSend() {}
@ -298,7 +299,8 @@ var Page = (function PageClosure() {
task,
self.resources,
/* stateManager = */ null,
normalizeWhitespace);
normalizeWhitespace,
combineTextItems);
});
},

View file

@ -1110,7 +1110,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
getTextContent:
function PartialEvaluator_getTextContent(stream, task, resources,
stateManager,
normalizeWhitespace) {
normalizeWhitespace,
combineTextItems) {
stateManager = (stateManager || new StateManager(new TextState()));
@ -1421,7 +1422,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
var isSameTextLine = !textState.font ? false :
((textState.font.vertical ? args[0] : args[1]) === 0);
advance = args[0] - args[1];
if (isSameTextLine && textContentItem.initialized &&
if (combineTextItems &&
isSameTextLine && textContentItem.initialized &&
advance > 0 &&
advance <= textContentItem.fakeMultiSpaceMax) {
textState.translateTextLineMatrix(args[0], args[1]);
@ -1453,7 +1455,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
// Optimization to treat same line movement as advance.
advance = textState.calcTextLineMatrixAdvance(
args[0], args[1], args[2], args[3], args[4], args[5]);
if (advance !== null && textContentItem.initialized &&
if (combineTextItems &&
advance !== null && textContentItem.initialized &&
advance.value > 0 &&
advance.value <= textContentItem.fakeMultiSpaceMax) {
textState.translateTextLineMatrix(advance.width,
@ -1594,7 +1597,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
next(self.getTextContent(xobj, task,
xobj.dict.get('Resources') || resources, stateManager,
normalizeWhitespace).then(function (formTextContent) {
normalizeWhitespace, combineTextItems).then(
function (formTextContent) {
Util.appendToArray(textContent.items, formTextContent.items);
Util.extendObj(textContent.styles, formTextContent.styles);
stateManager.restore();

View file

@ -891,12 +891,14 @@ var WorkerMessageHandler = {
handler.on('GetTextContent', function wphExtractText(data) {
var pageIndex = data.pageIndex;
var normalizeWhitespace = data.normalizeWhitespace;
var combineTextItems = data.combineTextItems;
return pdfManager.getPage(pageIndex).then(function(page) {
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
startWorkerTask(task);
var pageNum = pageIndex + 1;
var start = Date.now();
return page.extractTextContent(task, normalizeWhitespace).then(
return page.extractTextContent(task, normalizeWhitespace,
combineTextItems).then(
function(textContent) {
finishWorkerTask(task);
info('text indexing: page=' + pageNum + ' - time=' +

View file

@ -600,6 +600,8 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
* @typedef {Object} getTextContentParameters
* @param {boolean} normalizeWhitespace - replaces all occurrences of
* whitespace with standard spaces (0x20). The default value is `false`.
* @param {boolean} disableCombineTextItems - do not attempt to combine
* same-line {@link TextItem} objects. The default value is `false`.
*/
/**
@ -891,11 +893,12 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
* object that represent the page text content.
*/
getTextContent: function PDFPageProxy_getTextContent(params) {
var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
pageIndex: this.pageNumber - 1,
normalizeWhitespace: normalizeWhitespace,
normalizeWhitespace: (params && params.normalizeWhitespace === true ?
true : /* Default */ false),
combineTextItems: (params && params.disableCombineTextItems === true ?
false : /* Default */ true),
});
},