1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-26 01:58:06 +02:00

Attempt to combine separate beginText/endText sequences in getTextContent (issue 9984)

Please note that while this *improves* issue 9984 slightly (and likely others too), it's not a complete solution.
The remaining issues are related to the, more general, problems with the existing heuristics related to attempting to combine separate text items.
This commit is contained in:
Jonas Jenwald 2018-08-18 13:28:40 +02:00
parent 160ca55163
commit 497b765ede
4 changed files with 47 additions and 7 deletions

View file

@ -1512,6 +1512,17 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textContentItem.str.length = 0;
}
function isIdenticalSetFont(name, size) {
return (textState.font &&
name === textState.fontName && size === textState.fontSize);
}
function handleBeginText() {
flushTextContentItem();
textState.textMatrix = IDENTITY_MATRIX.slice();
textState.textLineMatrix = IDENTITY_MATRIX.slice();
}
function enqueueChunk() {
let length = textContent.items.length;
if (length > 0) {
@ -1537,6 +1548,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
task.ensureNotTerminated();
timeSlotManager.reset();
var stop, operation = {}, args = [];
let pendingBeginText = false;
while (!(stop = timeSlotManager.check())) {
// The arguments parsed by read() are not used beyond this loop, so
// we can reuse the same array on every iteration, thus avoiding
@ -1547,16 +1559,30 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
break;
}
textState = stateManager.state;
var fn = operation.fn;
var fn = operation.fn | 0;
args = operation.args;
var advance, diff;
switch (fn | 0) {
if (pendingBeginText) {
if (fn === OPS.setFont) {
const fontNameArg = args[0].name, fontSizeArg = args[1];
// For multiple identical Tf (setFont) commands, first check if
// the following command is Tm (setTextMatrix) before continuing.
if (isIdenticalSetFont(fontNameArg, fontSizeArg)) {
continue;
}
}
if (fn !== OPS.setTextMatrix) {
handleBeginText();
}
pendingBeginText = false;
}
switch (fn) {
case OPS.setFont:
// Optimization to ignore multiple identical Tf commands.
var fontNameArg = args[0].name, fontSizeArg = args[1];
if (textState.font && fontNameArg === textState.fontName &&
fontSizeArg === textState.fontSize) {
if (isIdenticalSetFont(fontNameArg, fontSizeArg)) {
break;
}
@ -1644,9 +1670,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
textState.wordSpacing = args[0];
break;
case OPS.beginText:
flushTextContentItem();
textState.textMatrix = IDENTITY_MATRIX.slice();
textState.textLineMatrix = IDENTITY_MATRIX.slice();
// Optimization to attempt to combine separate BT/ET sequences,
// by checking the next operator(s) before flushing text content
// and resetting the text/textLine matrices (see above).
if (combineTextItems) {
pendingBeginText = true;
break;
}
handleBeginText();
break;
case OPS.showSpacedText:
var items = args[0];