diff --git a/src/core/primitives.js b/src/core/primitives.js index 0ada1849e..e1bc4798b 100644 --- a/src/core/primitives.js +++ b/src/core/primitives.js @@ -378,6 +378,12 @@ class RefSetCache { clear() { this._map.clear(); } + + *items() { + for (const [ref, value] of this._map) { + yield [Ref.fromString(ref), value]; + } + } } function isName(v, name) { diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js index 68fe82694..9f4f552b0 100644 --- a/src/core/struct_tree.js +++ b/src/core/struct_tree.js @@ -119,19 +119,19 @@ class StructTreeRoot { newRefs, }) { const root = pdfManager.catalog.cloneDict(); + const cache = new RefSetCache(); + cache.put(catalogRef, root); + const structTreeRootRef = xref.getNewTemporaryRef(); root.set("StructTreeRoot", structTreeRootRef); - const buffer = []; - await writeObject(catalogRef, root, buffer, xref); - newRefs.push({ ref: catalogRef, data: buffer.join("") }); - const structTreeRoot = new Dict(xref); structTreeRoot.set("Type", Name.get("StructTreeRoot")); const parentTreeRef = xref.getNewTemporaryRef(); structTreeRoot.set("ParentTree", parentTreeRef); const kids = []; structTreeRoot.set("K", kids); + cache.put(structTreeRootRef, structTreeRoot); const parentTree = new Dict(xref); const nums = []; @@ -144,18 +144,18 @@ class StructTreeRoot { nums, xref, pdfManager, - newRefs, - buffer, + cache, }); structTreeRoot.set("ParentTreeNextKey", nextKey); - buffer.length = 0; - await writeObject(parentTreeRef, parentTree, buffer, xref); - newRefs.push({ ref: parentTreeRef, data: buffer.join("") }); + cache.put(parentTreeRef, parentTree); - buffer.length = 0; - await writeObject(structTreeRootRef, structTreeRoot, buffer, xref); - newRefs.push({ ref: structTreeRootRef, data: buffer.join("") }); + const buffer = []; + for (const [ref, obj] of cache.items()) { + buffer.length = 0; + await writeObject(ref, obj, buffer, xref); + newRefs.push({ ref, data: buffer.join("") }); + } } async canUpdateStructTree({ pdfManager, xref, newAnnotationsByPage }) { @@ -232,6 +232,8 @@ class StructTreeRoot { const xref = this.dict.xref; const structTreeRoot = this.dict.clone(); const structTreeRootRef = this.ref; + const cache = new RefSetCache(); + cache.put(structTreeRootRef, structTreeRoot); let parentTreeRef = structTreeRoot.getRaw("ParentTree"); let parentTree; @@ -243,6 +245,7 @@ class StructTreeRoot { structTreeRoot.set("ParentTree", parentTreeRef); } parentTree = parentTree.clone(); + cache.put(parentTreeRef, parentTree); let nums = parentTree.getRaw("Nums"); let numsRef = null; @@ -255,47 +258,27 @@ class StructTreeRoot { parentTree.set("Nums", nums); } - let kids = structTreeRoot.getRaw("K"); - let kidsRef = null; - if (kids instanceof Ref) { - kidsRef = kids; - kids = xref.fetch(kidsRef); - } else { - kidsRef = xref.getNewTemporaryRef(); - structTreeRoot.set("K", kidsRef); - } - kids = Array.isArray(kids) ? kids.slice() : [kids]; - - const buffer = []; const newNextkey = await StructTreeRoot.#writeKids({ newAnnotationsByPage, structTreeRootRef, - kids, + kids: null, nums, xref, pdfManager, - newRefs, - buffer, + cache, }); structTreeRoot.set("ParentTreeNextKey", newNextkey); - buffer.length = 0; - await writeObject(kidsRef, kids, buffer, xref); - newRefs.push({ ref: kidsRef, data: buffer.join("") }); - if (numsRef) { - buffer.length = 0; - await writeObject(numsRef, nums, buffer, xref); - newRefs.push({ ref: numsRef, data: buffer.join("") }); + cache.put(numsRef, nums); } - buffer.length = 0; - await writeObject(parentTreeRef, parentTree, buffer, xref); - newRefs.push({ ref: parentTreeRef, data: buffer.join("") }); - - buffer.length = 0; - await writeObject(structTreeRootRef, structTreeRoot, buffer, xref); - newRefs.push({ ref: structTreeRootRef, data: buffer.join("") }); + const buffer = []; + for (const [ref, obj] of cache.items()) { + buffer.length = 0; + await writeObject(ref, obj, buffer, xref); + newRefs.push({ ref, data: buffer.join("") }); + } } static async #writeKids({ @@ -305,8 +288,7 @@ class StructTreeRoot { nums, xref, pdfManager, - newRefs, - buffer, + cache, }) { const objr = Name.get("OBJR"); let nextKey = -Infinity; @@ -349,19 +331,15 @@ class StructTreeRoot { tagDict.set("ActualText", actualText); } - if (structTreeParent) { - await this.#updateParentTag({ - structTreeParent, - tagDict, - newTagRef: tagRef, - fallbackRef: structTreeRootRef, - xref, - newRefs, - buffer, - }); - } else { - tagDict.set("P", structTreeRootRef); - } + await this.#updateParentTag({ + structTreeParent, + tagDict, + newTagRef: tagRef, + structTreeRootRef, + fallbackKids: kids, + xref, + cache, + }); const objDict = new Dict(xref); tagDict.set("K", objDict); @@ -372,23 +350,24 @@ class StructTreeRoot { } objDict.set("Obj", ref); - buffer.length = 0; - await writeObject(tagRef, tagDict, buffer, xref); - newRefs.push({ ref: tagRef, data: buffer.join("") }); - + cache.put(tagRef, tagDict); nums.push(parentTreeId, tagRef); - kids.push(tagRef); } } return nextKey + 1; } static #collectParents({ elements, xref, pageDict, numberTree }) { - const idToElement = new Map(); + const idToElements = new Map(); for (const element of elements) { if (element.structTreeParentId) { const id = parseInt(element.structTreeParentId.split("_mc")[1], 10); - idToElement.set(id, element); + let elems = idToElements.get(id); + if (!elems) { + elems = []; + idToElements.set(id, elems); + } + elems.push(element); } } @@ -400,13 +379,16 @@ class StructTreeRoot { const parentArray = numberTree.get(id); const updateElement = (kid, pageKid, kidRef) => { - const element = idToElement.get(kid); - if (element) { + const elems = idToElements.get(kid); + if (elems) { const parentRef = pageKid.getRaw("P"); const parentDict = xref.fetchIfRef(parentRef); if (parentRef instanceof Ref && parentDict instanceof Dict) { // It should always the case, but we check just in case. - element.structTreeParent = { ref: kidRef, dict: pageKid }; + const params = { ref: kidRef, dict: pageKid }; + for (const element of elems) { + element.structTreeParent = params; + } } return true; } @@ -431,67 +413,73 @@ class StructTreeRoot { if (Number.isInteger(kid) && updateElement(kid, pageKid, kidRef)) { break; } + if (!(kid instanceof Dict)) { + continue; + } + if (!isName(kid.get("Type"), "MCR")) { + break; + } + const mcid = kid.get("MCID"); + if (Number.isInteger(mcid) && updateElement(mcid, pageKid, kidRef)) { + break; + } } } } static async #updateParentTag({ - structTreeParent: { ref, dict }, + structTreeParent, tagDict, newTagRef, - fallbackRef, + structTreeRootRef, + fallbackKids, xref, - newRefs, - buffer, + cache, }) { - // We get the parent of the tag. - const parentRef = dict.getRaw("P"); - let parentDict = xref.fetchIfRef(parentRef); + let ref = null; + let parentRef; + if (structTreeParent) { + ({ ref } = structTreeParent); + + // We get the parent of the tag. + parentRef = structTreeParent.dict.getRaw("P") || structTreeRootRef; + } else { + parentRef = structTreeRootRef; + } tagDict.set("P", parentRef); // We get the kids in order to insert a new tag at the right position. - let saveParentDict = false; - let parentKids; - let parentKidsRef = parentDict.getRaw("K"); - if (!(parentKidsRef instanceof Ref)) { - parentKids = parentKidsRef; - parentKidsRef = xref.getNewTemporaryRef(); - parentDict = parentDict.clone(); - parentDict.set("K", parentKidsRef); - saveParentDict = true; - } else { - parentKids = xref.fetch(parentKidsRef); - } - - if (Array.isArray(parentKids)) { - const index = parentKids.indexOf(ref); - if (index >= 0) { - parentKids = parentKids.slice(); - parentKids.splice(index + 1, 0, newTagRef); - } else { - warn("Cannot update the struct tree: parent kid not found."); - tagDict.set("P", fallbackRef); - return; - } - } else if (parentKids instanceof Dict) { - parentKids = [parentKidsRef, newTagRef]; - parentKidsRef = xref.getNewTemporaryRef(); - parentDict.set("K", parentKidsRef); - saveParentDict = true; - } - - buffer.length = 0; - await writeObject(parentKidsRef, parentKids, buffer, xref); - newRefs.push({ ref: parentKidsRef, data: buffer.join("") }); - - if (!saveParentDict) { + const parentDict = xref.fetchIfRef(parentRef); + if (!parentDict) { + fallbackKids.push(newTagRef); return; } - buffer.length = 0; - await writeObject(parentRef, parentDict, buffer, xref); - newRefs.push({ ref: parentRef, data: buffer.join("") }); + let cachedParentDict = cache.get(parentRef); + if (!cachedParentDict) { + cachedParentDict = parentDict.clone(); + cache.put(parentRef, cachedParentDict); + } + const parentKidsRaw = cachedParentDict.getRaw("K"); + let cachedParentKids = + parentKidsRaw instanceof Ref ? cache.get(parentKidsRaw) : null; + if (!cachedParentKids) { + cachedParentKids = xref.fetchIfRef(parentKidsRaw); + cachedParentKids = Array.isArray(cachedParentKids) + ? cachedParentKids.slice() + : [parentKidsRaw]; + const parentKidsRef = xref.getNewTemporaryRef(); + cachedParentDict.set("K", parentKidsRef); + cache.put(parentKidsRef, cachedParentKids); + } + + const index = cachedParentKids.indexOf(ref); + cachedParentKids.splice( + index >= 0 ? index + 1 : cachedParentKids.length, + 0, + newTagRef + ); } } diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index c14796e4f..0f5161625 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -645,3 +645,4 @@ !issue12213.pdf !tracemonkey_freetext.pdf !issue17998.pdf +!pdfjs_wikipedia.pdf diff --git a/test/pdfs/pdfjs_wikipedia.pdf b/test/pdfs/pdfjs_wikipedia.pdf new file mode 100755 index 000000000..677110483 Binary files /dev/null and b/test/pdfs/pdfjs_wikipedia.pdf differ diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 5be6c949b..08db9bfb8 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -1030,6 +1030,20 @@ describe("api", function () { await pdfLoadingTask.destroy(); }); + function findNode(parent, node, index, check) { + if (check(node)) { + return [parent.children[index - 1], node]; + } + for (let i = 0; i < node.children?.length ?? 0; i++) { + const child = node.children[i]; + const elements = findNode(node, child, i, check); + if (elements) { + return elements; + } + } + return null; + } + it("gets number of pages", function () { expect(pdfDocument.numPages).toEqual(3); }); @@ -2396,7 +2410,22 @@ describe("api", function () { pdfDoc = await loadingTask.promise; const page = await pdfDoc.getPage(1); const tree = await page.getStructTree(); - const leaf = tree.children[0].children[6].children[1]; + const [predecessor, leaf] = findNode( + null, + tree, + 0, + node => node.role === "Figure" + ); + + expect(predecessor).toEqual({ + role: "Span", + children: [ + { + type: "content", + id: "p3R_mc12", + }, + ], + }); expect(leaf).toEqual({ role: "Figure", @@ -2412,6 +2441,104 @@ describe("api", function () { await loadingTask.destroy(); }); + it("write a new stamp annotation in a tagged pdf (with some MCIDs), save and check the structure tree", async function () { + if (isNodeJS) { + pending("Cannot create a bitmap from Node.js."); + } + + const TEST_IMAGES_PATH = "../images/"; + const filename = "firefox_logo.png"; + const path = new URL(TEST_IMAGES_PATH + filename, window.location).href; + + const response = await fetch(path); + const blob = await response.blob(); + const bitmap = await createImageBitmap(blob); + + let loadingTask = getDocument( + buildGetDocumentParams("pdfjs_wikipedia.pdf") + ); + let pdfDoc = await loadingTask.promise; + for (let i = 0; i < 2; i++) { + pdfDoc.annotationStorage.setValue(`pdfjs_internal_editor_${i}`, { + annotationType: AnnotationEditorType.STAMP, + bitmapId: `im${i}`, + pageIndex: 0, + rect: [257 + i, 572 + i, 286 + i, 603 + i], + rotation: 0, + isSvg: false, + structTreeParentId: "p2R_mc155", + accessibilityData: { + type: "Figure", + alt: `Firefox logo ${i}`, + }, + bitmap: structuredClone(bitmap), + }); + } + + const data = await pdfDoc.saveDocument(); + await loadingTask.destroy(); + + loadingTask = getDocument(data); + pdfDoc = await loadingTask.promise; + const page = await pdfDoc.getPage(1); + const tree = await page.getStructTree(); + + let [predecessor, figure] = findNode( + null, + tree, + 0, + node => node.role === "Figure" && node.alt === "Firefox logo 1" + ); + expect(predecessor).toEqual({ + role: "NonStruct", + children: [ + { + type: "content", + id: "p2R_mc155", + }, + ], + }); + expect(figure).toEqual({ + role: "Figure", + children: [ + { + type: "annotation", + id: "pdfjs_internal_id_420R", + }, + ], + alt: "Firefox logo 1", + }); + + [predecessor, figure] = findNode( + null, + tree, + 0, + node => node.role === "Figure" && node.alt === "Firefox logo 0" + ); + expect(predecessor).toEqual({ + role: "Figure", + children: [ + { + type: "annotation", + id: "pdfjs_internal_id_420R", + }, + ], + alt: "Firefox logo 1", + }); + expect(figure).toEqual({ + role: "Figure", + children: [ + { + type: "annotation", + id: "pdfjs_internal_id_416R", + }, + ], + alt: "Firefox logo 0", + }); + + await loadingTask.destroy(); + }); + it("write a new stamp annotation in a tagged pdf, save, repeat and check the structure tree", async function () { if (isNodeJS) { pending("Cannot create a bitmap from Node.js."); diff --git a/test/unit/primitives_spec.js b/test/unit/primitives_spec.js index b5ec0287e..caefdf025 100644 --- a/test/unit/primitives_spec.js +++ b/test/unit/primitives_spec.js @@ -498,6 +498,15 @@ describe("primitives", function () { cache.put(ref2, obj2); expect([...cache]).toEqual([obj1, obj2]); }); + + it("should support iteration over key-value pairs", function () { + cache.put(ref1, obj1); + cache.put(ref2, obj2); + expect([...cache.items()]).toEqual([ + [ref1, obj1], + [ref2, obj2], + ]); + }); }); describe("isName", function () {