mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-26 01:58:06 +02:00
Attempt to combine separate beginText/endText sequences in getTextContent
(issue 9984)
Please note that while this *improves* issue 9984 slightly (and likely others too), it's not a complete solution. The remaining issues stem from more general problems with the existing heuristics for combining separate text items.
This commit is contained in:
parent
160ca55163
commit
497b765ede
4 changed files with 47 additions and 7 deletions
|
@ -1512,6 +1512,17 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
textContentItem.str.length = 0;
|
||||
}
|
||||
|
||||
// Checks whether a Tf (setFont) operator would be a no-op, i.e. whether the
// given font name and size equal the `fontName`/`fontSize` already stored on
// the current `textState`.
// The result is truthy only when `textState.font` is set and both values
// match; because of the leading `&&` operand, a falsy `textState.font` is
// returned as-is rather than a strict `false`.
function isIdenticalSetFont(name, size) {
|
||||
return (textState.font &&
|
||||
name === textState.fontName && size === textState.fontSize);
|
||||
}
|
||||
|
||||
// Shared handling for the BT (beginText) operator: calls
// `flushTextContentItem()` and then resets both the text matrix and the
// text-line matrix of the current `textState` to the identity matrix.
function handleBeginText() {
|
||||
flushTextContentItem();
|
||||
// `slice()` gives each matrix its own copy, so later changes to one do not
// affect the other or `IDENTITY_MATRIX` itself (presumably an array --
// confirm at its definition).
textState.textMatrix = IDENTITY_MATRIX.slice();
|
||||
textState.textLineMatrix = IDENTITY_MATRIX.slice();
|
||||
}
|
||||
|
||||
function enqueueChunk() {
|
||||
let length = textContent.items.length;
|
||||
if (length > 0) {
|
||||
|
@ -1537,6 +1548,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
task.ensureNotTerminated();
|
||||
timeSlotManager.reset();
|
||||
var stop, operation = {}, args = [];
|
||||
let pendingBeginText = false;
|
||||
while (!(stop = timeSlotManager.check())) {
|
||||
// The arguments parsed by read() are not used beyond this loop, so
|
||||
// we can reuse the same array on every iteration, thus avoiding
|
||||
|
@ -1547,16 +1559,30 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
break;
|
||||
}
|
||||
textState = stateManager.state;
|
||||
var fn = operation.fn;
|
||||
var fn = operation.fn | 0;
|
||||
args = operation.args;
|
||||
var advance, diff;
|
||||
|
||||
switch (fn | 0) {
|
||||
if (pendingBeginText) {
|
||||
if (fn === OPS.setFont) {
|
||||
const fontNameArg = args[0].name, fontSizeArg = args[1];
|
||||
// For multiple identical Tf (setFont) commands, first check if
|
||||
// the following command is Tm (setTextMatrix) before continuing.
|
||||
if (isIdenticalSetFont(fontNameArg, fontSizeArg)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (fn !== OPS.setTextMatrix) {
|
||||
handleBeginText();
|
||||
}
|
||||
pendingBeginText = false;
|
||||
}
|
||||
|
||||
switch (fn) {
|
||||
case OPS.setFont:
|
||||
// Optimization to ignore multiple identical Tf commands.
|
||||
var fontNameArg = args[0].name, fontSizeArg = args[1];
|
||||
if (textState.font && fontNameArg === textState.fontName &&
|
||||
fontSizeArg === textState.fontSize) {
|
||||
if (isIdenticalSetFont(fontNameArg, fontSizeArg)) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1644,9 +1670,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
|
|||
textState.wordSpacing = args[0];
|
||||
break;
|
||||
case OPS.beginText:
|
||||
flushTextContentItem();
|
||||
textState.textMatrix = IDENTITY_MATRIX.slice();
|
||||
textState.textLineMatrix = IDENTITY_MATRIX.slice();
|
||||
// Optimization to attempt to combine separate BT/ET sequences,
|
||||
// by checking the next operator(s) before flushing text content
|
||||
// and resetting the text/textLine matrices (see above).
|
||||
if (combineTextItems) {
|
||||
pendingBeginText = true;
|
||||
break;
|
||||
}
|
||||
|
||||
handleBeginText();
|
||||
break;
|
||||
case OPS.showSpacedText:
|
||||
var items = args[0];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue