diff --git a/src/core/annotation.js b/src/core/annotation.js index 4e881c6c9..d6b612b14 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -49,6 +49,8 @@ import { lookupNormalRect, lookupRect, numberToString, + RESOURCES_KEYS_OPERATOR_LIST, + RESOURCES_KEYS_TEXT_CONTENT, stringToAsciiOrUTF16BE, stringToUTF16String, } from "./core_utils.js"; @@ -1196,7 +1198,7 @@ class Annotation { const appearanceDict = appearance.dict; const resources = await this.loadResources( - ["ExtGState", "ColorSpace", "Pattern", "Shading", "XObject", "Font"], + RESOURCES_KEYS_OPERATOR_LIST, appearance ); const bbox = lookupRect(appearanceDict.getArray("BBox"), [0, 0, 1, 1]); @@ -1257,7 +1259,7 @@ class Annotation { } const resources = await this.loadResources( - ["ExtGState", "Font", "Properties", "XObject"], + RESOURCES_KEYS_TEXT_CONTENT, this.appearance ); diff --git a/src/core/core_utils.js b/src/core/core_utils.js index 28c4004c0..b23072d59 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -32,6 +32,23 @@ const MIN_INT_32 = -(2 ** 31); const IDENTITY_MATRIX = [1, 0, 0, 1, 0, 0]; +const RESOURCES_KEYS_OPERATOR_LIST = [ + "ColorSpace", + "ExtGState", + "Font", + "Pattern", + "Properties", + "Shading", + "XObject", +]; + +const RESOURCES_KEYS_TEXT_CONTENT = [ + "ExtGState", + "Font", + "Properties", + "XObject", +]; + function getLookupTableFactory(initializer) { let lookup; return function () { @@ -745,6 +762,8 @@ export { readUint16, readUint32, recoverJsURL, + RESOURCES_KEYS_OPERATOR_LIST, + RESOURCES_KEYS_TEXT_CONTENT, stringToAsciiOrUTF16BE, stringToUTF16HexString, stringToUTF16String, diff --git a/src/core/document.js b/src/core/document.js index daa40bfcc..c543f718b 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -45,6 +45,8 @@ import { lookupNormalRect, MissingDataException, PDF_VERSION_REGEXP, + RESOURCES_KEYS_OPERATOR_LIST, + RESOURCES_KEYS_TEXT_CONTENT, validateCSSFont, XRefEntryException, XRefParseException, @@ -419,6 +421,25 @@ class Page { await objectLoader.load(); } + async #getMergedResources(streamDict, keys) { + // In rare cases /Resources are also found in the /Contents stream-dict, + // in addition to in the /Page dict, hence we need to prefer those when + // available (see issue18894.pdf). + const localResources = streamDict?.get("Resources"); + + if (!(localResources instanceof Dict)) { + return this.resources; + } + const objectLoader = new ObjectLoader(localResources, keys, this.xref); + await objectLoader.load(); + + return Dict.merge({ + xref: this.xref, + dictArray: [localResources, this.resources], + mergeSubDicts: true, + }); + } + async getOperatorList({ handler, sink, @@ -429,15 +450,7 @@ class Page { modifiedIds = null, }) { const contentStreamPromise = this.getContentStream(); - const resourcesPromise = this.loadResources([ - "ColorSpace", - "ExtGState", - "Font", - "Pattern", - "Properties", - "Shading", - "XObject", - ]); + const resourcesPromise = this.loadResources(RESOURCES_KEYS_OPERATOR_LIST); const partialEvaluator = new PartialEvaluator({ xref: this.xref, @@ -525,11 +538,15 @@ class Page { contentStreamPromise, resourcesPromise, ]).then(async ([contentStream]) => { + const resources = await this.#getMergedResources( + contentStream.dict, + RESOURCES_KEYS_OPERATOR_LIST + ); const opList = new OperatorList(intent, sink); handler.send("StartRenderPage", { transparency: partialEvaluator.hasBlendModes( - this.resources, + resources, this.nonBlendModesSet ), pageIndex: this.pageIndex, @@ -539,7 +556,7 @@ class Page { await partialEvaluator.getOperatorList({ stream: contentStream, task, - resources: this.resources, + resources, operatorList: opList, }); return opList; @@ -642,12 +659,7 @@ class Page { sink, }) { const contentStreamPromise = this.getContentStream(); - const resourcesPromise = this.loadResources([ - "ExtGState", - "Font", - "Properties", - "XObject", - ]); + const resourcesPromise = this.loadResources(RESOURCES_KEYS_TEXT_CONTENT); const langPromise = this.pdfManager.ensureCatalog("lang"); const [contentStream, , lang] = await Promise.all([ @@ -655,6 +667,11 @@ class Page { resourcesPromise, langPromise, ]); + const resources = await this.#getMergedResources( + contentStream.dict, + RESOURCES_KEYS_TEXT_CONTENT + ); + const partialEvaluator = new PartialEvaluator({ xref: this.xref, handler, @@ -672,7 +689,7 @@ class Page { return partialEvaluator.getTextContent({ stream: contentStream, task, - resources: this.resources, + resources, includeMarkedContent, disableNormalization, sink, diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 3b1cb549f..50380aea7 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -462,7 +462,8 @@ class PartialEvaluator { operatorList, task, initialState, - localColorSpaceCache + localColorSpaceCache, + seenRefs ) { const dict = xobj.dict; const matrix = lookupMatrix(dict.getArray("Matrix"), null); @@ -526,6 +527,7 @@ class PartialEvaluator { resources: dict.get("Resources") || resources, operatorList, initialState, + prevRefs: seenRefs, }); operatorList.addOp(OPS.paintFormXObjectEnd, []); @@ -850,7 +852,8 @@ class PartialEvaluator { operatorList, task, stateManager, - localColorSpaceCache + localColorSpaceCache, + seenRefs ) { const smaskContent = smask.get("G"); const smaskOptions = { @@ -880,7 +883,8 @@ class PartialEvaluator { operatorList, task, stateManager.state.clone({ newPath: true }), - localColorSpaceCache + localColorSpaceCache, + seenRefs ); } @@ -1065,6 +1069,7 @@ class PartialEvaluator { stateManager, localGStateCache, localColorSpaceCache, + seenRefs, }) { const gStateRef = gState.objId; let isSimpleGState = true; @@ -1127,7 +1132,8 @@ class PartialEvaluator { operatorList, task, stateManager, - localColorSpaceCache + localColorSpaceCache, + seenRefs ) ); gStateObj.push([key, true]); @@ -1696,7 +1702,19 @@ class PartialEvaluator { operatorList, initialState = null, fallbackFontDict = null, + prevRefs = null, }) { + const objId = stream.dict?.objId; + const seenRefs = new RefSet(prevRefs); + + if (objId) { + if (prevRefs?.has(objId)) { + throw new Error( + `getOperatorList - ignoring circular reference: ${objId}` + ); + } + seenRefs.put(objId); + } // Ensure that `resources`/`initialState` is correctly initialized, // even if the provided parameter is e.g. `null`. resources ||= Dict.empty; @@ -1808,7 +1826,8 @@ class PartialEvaluator { operatorList, task, stateManager.state.clone({ newPath: true }), - localColorSpaceCache + localColorSpaceCache, + seenRefs ) .then(function () { stateManager.restore(); @@ -2158,6 +2177,7 @@ class PartialEvaluator { stateManager, localGStateCache, localColorSpaceCache, + seenRefs, }) .then(resolveGState, rejectGState); }).catch(function (reason) { @@ -2339,7 +2359,19 @@ class PartialEvaluator { markedContentData = null, disableNormalization = false, keepWhiteSpace = false, + prevRefs = null, }) { + const objId = stream.dict?.objId; + const seenRefs = new RefSet(prevRefs); + + if (objId) { + if (prevRefs?.has(objId)) { + throw new Error( + `getTextContent - ignoring circular reference: ${objId}` + ); + } + seenRefs.put(objId); + } // Ensure that `resources`/`stateManager` is correctly initialized, // even if the provided parameter is e.g. `null`. resources ||= Dict.empty; @@ -3326,6 +3358,7 @@ class PartialEvaluator { markedContentData, disableNormalization, keepWhiteSpace, + prevRefs: seenRefs, }) .then(function () { if (!sinkWrapper.enqueueInvoked) { diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 7c631177a..9c4b08a8e 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -206,6 +206,7 @@ !issue3928.pdf !issue8565.pdf !clippath.pdf +!issue19800.pdf !issue8795_reduced.pdf !bug1755507.pdf !close-path-bug.pdf diff --git a/test/pdfs/issue19800.pdf b/test/pdfs/issue19800.pdf new file mode 100644 index 000000000..a8f75bb0b --- /dev/null +++ b/test/pdfs/issue19800.pdf @@ -0,0 +1,90 @@ +%PDF-1.4 +1 0 obj +<< + /Type /Catalog + /Outlines 2 0 R + /Pages 3 0 R +>> +endobj +2 0 obj +<< + /Type /Outlines + /Count 0 +>> +endobj +3 0 obj +<< + /Type /Pages + /Kids [ 4 0 R ] + /Count 1 +>> +endobj +4 0 obj +<< + /Type /Page + /Parent 3 0 R + /MediaBox [ 0 0 500 300 ] + /Contents 5 0 R + /Resources << + /ProcSet [ /PDF /Text ] + /XObject << /X1 6 0 R >> + >> +>> +endobj +5 0 obj +<< /Length 24 >> +stream +1 0 0 1 25 25 cm /X1 Do +endstream +endobj +6 0 obj +<< /Subtype /Form + /BBox [0 0 1000 1000] + /Length 61 + /Resources << + /ProcSet [ /PDF /Text ] + /Font << /F1 7 0 R >> + /XObject << /X0 8 0 R >> + >> +>> +stream +BT +/F1 24 Tf +(Hello world) Tj +ET +0.5 0 0 0.5 25 25 cm /X0 Do +endstream +endobj +7 0 obj +<< + /Type /Font + /Subtype /Type1 + /Name /F1 + /BaseFont /Helvetica + /Encoding /MacRomanEncoding +>> +endobj +8 0 obj +<< /Subtype /Form + /BBox [0 0 1000 1000] + /Length 61 + /Resources << + /ProcSet [ /PDF /Text ] + /Font << /F1 7 0 R >> + /XObject << /X1 6 0 R >> + >> +>> +stream +BT +/F1 24 Tf +(Hello world) Tj +ET +0.5 0 0 0.5 25 25 cm /X1 Do +endstream +endobj +trailer +<< + /Size 8 + /Root 1 0 R +>> +%%EOF diff --git a/test/test_manifest.json b/test/test_manifest.json index 86c1d819e..621818daa 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -6897,6 +6897,20 @@ "lastPage": 2, "type": "eq" }, + { + "id": "issue19800-eq", + "file": "pdfs/issue19800.pdf", + "md5": "92825d3178196bdd01096c4081609efd", + "rounds": 1, + "type": "eq" + }, + { + "id": "issue19800-text", + "file": "pdfs/issue19800.pdf", + "md5": "92825d3178196bdd01096c4081609efd", + "rounds": 1, + "type": "text" + }, { "id": "issue3438", "file": "pdfs/issue3438.pdf",