mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
[api-minor] Add a parameter to PDFPageProxy_getTextContent
that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)
This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`). Fixes 6612.
This commit is contained in:
parent
c2dfe9e9a9
commit
6dfe53b976
12 changed files with 75 additions and 24 deletions
|
@ -218,7 +218,8 @@ var Page = (function PageClosure() {
|
|||
});
|
||||
},
|
||||
|
||||
extractTextContent: function Page_extractTextContent(task) {
|
||||
extractTextContent: function Page_extractTextContent(task,
|
||||
normalizeWhitespace) {
|
||||
var handler = {
|
||||
on: function nullHandlerOn() {},
|
||||
send: function nullHandlerSend() {}
|
||||
|
@ -248,7 +249,9 @@ var Page = (function PageClosure() {
|
|||
|
||||
return partialEvaluator.getTextContent(contentStream,
|
||||
task,
|
||||
self.resources);
|
||||
self.resources,
|
||||
/* stateManager = */ null,
|
||||
normalizeWhitespace);
|
||||
});
|
||||
},
|
||||
|
||||
|
|
|
@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
});
|
||||
},
|
||||
|
||||
getTextContent: function PartialEvaluator_getTextContent(stream, task,
|
||||
resources,
|
||||
stateManager) {
|
||||
getTextContent:
|
||||
function PartialEvaluator_getTextContent(stream, task, resources,
|
||||
stateManager,
|
||||
normalizeWhitespace) {
|
||||
|
||||
stateManager = (stateManager || new StateManager(new TextState()));
|
||||
|
||||
var WhitespaceRegexp = /\s/g;
|
||||
|
||||
var textContent = {
|
||||
items: [],
|
||||
styles: Object.create(null)
|
||||
|
@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
return textContentItem;
|
||||
}
|
||||
|
||||
function replaceWhitespace(str) {
|
||||
// Returns `str` with every whitespace character replaced by a standard space (0x20), to avoid
|
||||
// alignment issues between the textLayer and the canvas if the text
|
||||
// contains e.g. tabs (fixes issue6612.pdf). A string made up only of chars in 0x20..0x7F is returned as-is: the only whitespace in that range is 0x20 itself.
|
||||
var i = 0, ii = str.length, code;
|
||||
while (i < ii && (code = str.charCodeAt(i)) >= 0x20 && code <= 0x7F) { // fast path: skip the leading run of chars in 0x20..0x7F
|
||||
i++;
|
||||
}
|
||||
return (i < ii ? str.replace(WhitespaceRegexp, ' ') : str); // i < ii means a char outside 0x20..0x7F was found; WhitespaceRegexp is declared outside this function
|
||||
}
|
||||
|
||||
function runBidiTransform(textChunk) {
|
||||
var str = textChunk.str.join('');
|
||||
var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
|
||||
return {
|
||||
str: bidiResult.str,
|
||||
str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
|
||||
bidiResult.str),
|
||||
dir: bidiResult.dir,
|
||||
width: textChunk.width,
|
||||
height: textChunk.height,
|
||||
|
@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
}
|
||||
|
||||
return self.getTextContent(xobj, task,
|
||||
xobj.dict.get('Resources') || resources, stateManager).
|
||||
then(function (formTextContent) {
|
||||
xobj.dict.get('Resources') || resources, stateManager,
|
||||
normalizeWhitespace).then(function (formTextContent) {
|
||||
Util.appendToArray(textContent.items, formTextContent.items);
|
||||
Util.extendObj(textContent.styles, formTextContent.styles);
|
||||
stateManager.restore();
|
||||
|
|
|
@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {
|
|||
|
||||
handler.on('GetTextContent', function wphExtractText(data) {
|
||||
var pageIndex = data.pageIndex;
|
||||
var normalizeWhitespace = data.normalizeWhitespace;
|
||||
return pdfManager.getPage(pageIndex).then(function(page) {
|
||||
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
||||
startWorkerTask(task);
|
||||
var pageNum = pageIndex + 1;
|
||||
var start = Date.now();
|
||||
return page.extractTextContent(task).then(function(textContent) {
|
||||
return page.extractTextContent(task, normalizeWhitespace).then(
|
||||
function(textContent) {
|
||||
finishWorkerTask(task);
|
||||
info('text indexing: page=' + pageNum + ' - time=' +
|
||||
(Date.now() - start) + 'ms');
|
||||
|
|
|
@ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
|
|||
return PDFDocumentProxy;
|
||||
})();
|
||||
|
||||
/**
|
||||
* Page getTextContent parameters.
|
||||
*
|
||||
* @typedef {Object} getTextContentParameters
|
||||
* @property {boolean} normalizeWhitespace - replaces all occurrences of
|
||||
* whitespace with standard spaces (0x20). The default value is `false`.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Page text content.
|
||||
*
|
||||
|
@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
|
|||
},
|
||||
|
||||
/**
|
||||
* @param {getTextContentParameters} params - getTextContent parameters.
|
||||
* @return {Promise} That is resolved with a {@link TextContent}
|
||||
* object that represents the page text content.
|
||||
*/
|
||||
getTextContent: function PDFPageProxy_getTextContent() {
|
||||
getTextContent: function PDFPageProxy_getTextContent(params) {
|
||||
var normalizeWhitespace = (params && params.normalizeWhitespace) || false;
|
||||
|
||||
return this.transport.messageHandler.sendWithPromise('GetTextContent', {
|
||||
pageIndex: this.pageNumber - 1
|
||||
pageIndex: this.pageNumber - 1,
|
||||
normalizeWhitespace: normalizeWhitespace,
|
||||
});
|
||||
},
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue