1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-26 10:08:06 +02:00

Add local caching of non-font Graphics State (ExtGState) data in PartialEvaluator.getTextContent

It turns out that `getTextContent` suffers from *similar* problems with repeated GStates as `getOperatorList`; please see the previous patch.

`PartialEvaluator.getTextContent` only actually *parses* `/ExtGState` resources that contain Fonts. However, we're still forced to fetch and validate every repeated `/ExtGState` resource, even though *most* of them won't affect the textContent (since they mostly contain purely graphical state).

With these changes we also no longer need to immediately reset the current text-state when encountering a `setGState` operator (the pending text content item is now flushed only when the GState actually contains a `Font` entry), which may improve text-selection in some cases.
This commit is contained in:
Jonas Jenwald 2020-07-11 14:05:53 +02:00
parent 90eb579713
commit 981ff41b5f

View file

@ -1848,6 +1848,7 @@ class PartialEvaluator {
// The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd. // The xobj is parsed iff it's needed, e.g. if there is a `DO` cmd.
var xobjs = null; var xobjs = null;
const emptyXObjectCache = new LocalImageCache(); const emptyXObjectCache = new LocalImageCache();
const emptyGStateCache = new LocalGStateCache();
var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager); var preprocessor = new EvaluatorPreprocessor(stream, xref, stateManager);
@ -2420,25 +2421,59 @@ class PartialEvaluator {
); );
return; return;
case OPS.setGState: case OPS.setGState:
flushTextContentItem(); name = args[0].name;
var dictName = args[0]; if (name && emptyGStateCache.getByName(name)) {
var extGState = resources.get("ExtGState"); break;
}
if (!isDict(extGState) || !isName(dictName)) { next(
break; new Promise(function (resolveGState, rejectGState) {
if (!name) {
throw new FormatError("GState must be referred to by name.");
} }
var gState = extGState.get(dictName.name);
if (!isDict(gState)) { const extGState = resources.get("ExtGState");
break; if (!(extGState instanceof Dict)) {
throw new FormatError("ExtGState should be a dictionary.");
} }
var gStateFont = gState.get("Font");
if (gStateFont) { const gState = extGState.get(name);
textState.fontName = null; // TODO: Attempt to lookup cached GStates by reference as well,
textState.fontSize = gStateFont[1]; // if and only if there are PDF documents where doing so
next(handleSetFont(null, gStateFont[0])); // would significantly improve performance.
if (!(gState instanceof Dict)) {
throw new FormatError("GState should be a dictionary.");
}
const gStateFont = gState.get("Font");
if (!gStateFont) {
emptyGStateCache.set(name, gState.objId, true);
resolveGState();
return; return;
} }
break; flushTextContentItem();
textState.fontName = null;
textState.fontSize = gStateFont[1];
handleSetFont(null, gStateFont[0]).then(
resolveGState,
rejectGState
);
}).catch(function (reason) {
if (reason instanceof AbortException) {
return;
}
if (self.options.ignoreErrors) {
// Error(s) in the ExtGState -- allow text-extraction to
// continue.
warn(`getTextContent - ignoring ExtGState: "${reason}".`);
return;
}
throw reason;
})
);
return;
} // switch } // switch
if (textContent.items.length >= sink.desiredSize) { if (textContent.items.length >= sink.desiredSize) {
// Wait for ready, if we reach highWaterMark. // Wait for ready, if we reach highWaterMark.