mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-26 10:08:06 +02:00
Merge pull request #8488 from mukulmishra18/streams-getTextContent
Streams get text content
This commit is contained in:
commit
e2ca894fec
8 changed files with 275 additions and 114 deletions
|
@ -270,7 +270,7 @@ var Page = (function PageClosure() {
|
|||
},
|
||||
|
||||
extractTextContent({ handler, task, normalizeWhitespace,
|
||||
combineTextItems, }) {
|
||||
sink, combineTextItems, }) {
|
||||
var contentStreamPromise = this.pdfManager.ensure(this,
|
||||
'getContentStream');
|
||||
var resourcesPromise = this.loadResources([
|
||||
|
@ -298,6 +298,7 @@ var Page = (function PageClosure() {
|
|||
resources: this.resources,
|
||||
normalizeWhitespace,
|
||||
combineTextItems,
|
||||
sink,
|
||||
});
|
||||
});
|
||||
},
|
||||
|
|
|
@ -1176,7 +1176,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
},
|
||||
|
||||
getTextContent({ stream, task, resources, stateManager = null,
|
||||
normalizeWhitespace = false, combineTextItems = false, }) {
|
||||
normalizeWhitespace = false, combineTextItems = false,
|
||||
sink, seenStyles = Object.create(null), }) {
|
||||
// Ensure that `resources`/`stateManager` is correctly initialized,
|
||||
// even if the provided parameter is e.g. `null`.
|
||||
resources = resources || Dict.empty;
|
||||
|
@ -1214,7 +1215,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
|
||||
// The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd.
|
||||
var xobjs = null;
|
||||
var xobjsCache = Object.create(null);
|
||||
var skipEmptyXObjs = Object.create(null);
|
||||
|
||||
var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager);
|
||||
|
||||
|
@ -1225,7 +1226,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
return textContentItem;
|
||||
}
|
||||
var font = textState.font;
|
||||
if (!(font.loadedName in textContent.styles)) {
|
||||
if (!(font.loadedName in seenStyles)) {
|
||||
seenStyles[font.loadedName] = true;
|
||||
textContent.styles[font.loadedName] = {
|
||||
fontFamily: font.fallbackName,
|
||||
ascent: font.ascent,
|
||||
|
@ -1416,11 +1418,21 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
textContentItem.str.length = 0;
|
||||
}
|
||||
|
||||
function enqueueChunk() {
|
||||
let length = textContent.items.length;
|
||||
if (length > 0) {
|
||||
sink.enqueue(textContent, length);
|
||||
textContent.items = [];
|
||||
textContent.styles = Object.create(null);
|
||||
}
|
||||
}
|
||||
|
||||
var timeSlotManager = new TimeSlotManager();
|
||||
|
||||
return new Promise(function promiseBody(resolve, reject) {
|
||||
var next = function (promise) {
|
||||
promise.then(function () {
|
||||
let next = function (promise) {
|
||||
enqueueChunk();
|
||||
Promise.all([promise, sink.ready]).then(function () {
|
||||
try {
|
||||
promiseBody(resolve, reject);
|
||||
} catch (ex) {
|
||||
|
@ -1615,11 +1627,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
}
|
||||
|
||||
var name = args[0].name;
|
||||
if (xobjsCache.key === name) {
|
||||
if (xobjsCache.texts) {
|
||||
Util.appendToArray(textContent.items, xobjsCache.texts.items);
|
||||
Util.extendObj(textContent.styles, xobjsCache.texts.styles);
|
||||
}
|
||||
if (name in skipEmptyXObjs) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1633,8 +1641,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
assert(isName(type), 'XObject should have a Name subtype');
|
||||
|
||||
if (type.name !== 'Form') {
|
||||
xobjsCache.key = name;
|
||||
xobjsCache.texts = null;
|
||||
skipEmptyXObjs[name] = true;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1650,6 +1657,26 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
xObjStateManager.transform(matrix);
|
||||
}
|
||||
|
||||
// Enqueue the `textContent` chunk before parsing the /Form
|
||||
// XObject.
|
||||
enqueueChunk();
|
||||
let sinkWrapper = {
|
||||
enqueueInvoked: false,
|
||||
|
||||
enqueue(chunk, size) {
|
||||
this.enqueueInvoked = true;
|
||||
sink.enqueue(chunk, size);
|
||||
},
|
||||
|
||||
get desiredSize() {
|
||||
return sink.desiredSize;
|
||||
},
|
||||
|
||||
get ready() {
|
||||
return sink.ready;
|
||||
},
|
||||
};
|
||||
|
||||
next(self.getTextContent({
|
||||
stream: xobj,
|
||||
task,
|
||||
|
@ -1657,12 +1684,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
stateManager: xObjStateManager,
|
||||
normalizeWhitespace,
|
||||
combineTextItems,
|
||||
}).then(function (formTextContent) {
|
||||
Util.appendToArray(textContent.items, formTextContent.items);
|
||||
Util.extendObj(textContent.styles, formTextContent.styles);
|
||||
|
||||
xobjsCache.key = name;
|
||||
xobjsCache.texts = formTextContent;
|
||||
sink: sinkWrapper,
|
||||
seenStyles,
|
||||
}).then(function() {
|
||||
if (!sinkWrapper.enqueueInvoked) {
|
||||
skipEmptyXObjs[name] = true;
|
||||
}
|
||||
}));
|
||||
return;
|
||||
case OPS.setGState:
|
||||
|
@ -1686,20 +1713,27 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
}
|
||||
break;
|
||||
} // switch
|
||||
if (textContent.items.length >= sink.desiredSize) {
|
||||
// Wait for ready, if we reach highWaterMark.
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
} // while
|
||||
if (stop) {
|
||||
next(deferred);
|
||||
return;
|
||||
}
|
||||
flushTextContentItem();
|
||||
resolve(textContent);
|
||||
enqueueChunk();
|
||||
resolve();
|
||||
}).catch((reason) => {
|
||||
if (this.options.ignoreErrors) {
|
||||
// Error(s) in the TextContent -- allow text-extraction to continue.
|
||||
warn('getTextContent - ignoring errors during task: ' + task.name);
|
||||
|
||||
flushTextContentItem();
|
||||
return textContent;
|
||||
enqueueChunk();
|
||||
return;
|
||||
}
|
||||
throw reason;
|
||||
});
|
||||
|
|
|
@ -874,30 +874,35 @@ var WorkerMessageHandler = {
|
|||
});
|
||||
}, this);
|
||||
|
||||
handler.on('GetTextContent', function wphExtractText(data) {
|
||||
handler.on('GetTextContent', function wphExtractText(data, sink) {
|
||||
var pageIndex = data.pageIndex;
|
||||
return pdfManager.getPage(pageIndex).then(function(page) {
|
||||
sink.onPull = function (desiredSize) { };
|
||||
sink.onCancel = function (reason) { };
|
||||
|
||||
pdfManager.getPage(pageIndex).then(function(page) {
|
||||
var task = new WorkerTask('GetTextContent: page ' + pageIndex);
|
||||
startWorkerTask(task);
|
||||
|
||||
var pageNum = pageIndex + 1;
|
||||
var start = Date.now();
|
||||
return page.extractTextContent({
|
||||
page.extractTextContent({
|
||||
handler,
|
||||
task,
|
||||
sink,
|
||||
normalizeWhitespace: data.normalizeWhitespace,
|
||||
combineTextItems: data.combineTextItems,
|
||||
}).then(function(textContent) {
|
||||
}).then(function() {
|
||||
finishWorkerTask(task);
|
||||
|
||||
info('text indexing: page=' + pageNum + ' - time=' +
|
||||
(Date.now() - start) + 'ms');
|
||||
return textContent;
|
||||
sink.close();
|
||||
}, function (reason) {
|
||||
finishWorkerTask(task);
|
||||
if (task.terminated) {
|
||||
return; // ignoring errors from the terminated thread
|
||||
}
|
||||
sink.error(reason);
|
||||
throw reason;
|
||||
});
|
||||
});
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue