1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-19 22:58:07 +02:00

Merge pull request #17986 from calixteman/fix_struct_tree

Allow to insert several annotations under the same parent in the structure tree
This commit is contained in:
calixteman 2024-04-24 18:32:00 +02:00 committed by GitHub
commit d1f494d68c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 243 additions and 112 deletions

View file

@ -378,6 +378,12 @@ class RefSetCache {
// Remove every entry from the cache by clearing `this._map`.
clear() {
this._map.clear();
}
*items() {
for (const [ref, value] of this._map) {
yield [Ref.fromString(ref), value];
}
}
}
function isName(v, name) {

View file

@ -119,19 +119,19 @@ class StructTreeRoot {
newRefs,
}) {
const root = pdfManager.catalog.cloneDict();
const cache = new RefSetCache();
cache.put(catalogRef, root);
const structTreeRootRef = xref.getNewTemporaryRef();
root.set("StructTreeRoot", structTreeRootRef);
const buffer = [];
await writeObject(catalogRef, root, buffer, xref);
newRefs.push({ ref: catalogRef, data: buffer.join("") });
const structTreeRoot = new Dict(xref);
structTreeRoot.set("Type", Name.get("StructTreeRoot"));
const parentTreeRef = xref.getNewTemporaryRef();
structTreeRoot.set("ParentTree", parentTreeRef);
const kids = [];
structTreeRoot.set("K", kids);
cache.put(structTreeRootRef, structTreeRoot);
const parentTree = new Dict(xref);
const nums = [];
@ -144,18 +144,18 @@ class StructTreeRoot {
nums,
xref,
pdfManager,
newRefs,
buffer,
cache,
});
structTreeRoot.set("ParentTreeNextKey", nextKey);
buffer.length = 0;
await writeObject(parentTreeRef, parentTree, buffer, xref);
newRefs.push({ ref: parentTreeRef, data: buffer.join("") });
cache.put(parentTreeRef, parentTree);
buffer.length = 0;
await writeObject(structTreeRootRef, structTreeRoot, buffer, xref);
newRefs.push({ ref: structTreeRootRef, data: buffer.join("") });
const buffer = [];
for (const [ref, obj] of cache.items()) {
buffer.length = 0;
await writeObject(ref, obj, buffer, xref);
newRefs.push({ ref, data: buffer.join("") });
}
}
async canUpdateStructTree({ pdfManager, xref, newAnnotationsByPage }) {
@ -232,6 +232,8 @@ class StructTreeRoot {
const xref = this.dict.xref;
const structTreeRoot = this.dict.clone();
const structTreeRootRef = this.ref;
const cache = new RefSetCache();
cache.put(structTreeRootRef, structTreeRoot);
let parentTreeRef = structTreeRoot.getRaw("ParentTree");
let parentTree;
@ -243,6 +245,7 @@ class StructTreeRoot {
structTreeRoot.set("ParentTree", parentTreeRef);
}
parentTree = parentTree.clone();
cache.put(parentTreeRef, parentTree);
let nums = parentTree.getRaw("Nums");
let numsRef = null;
@ -255,47 +258,27 @@ class StructTreeRoot {
parentTree.set("Nums", nums);
}
let kids = structTreeRoot.getRaw("K");
let kidsRef = null;
if (kids instanceof Ref) {
kidsRef = kids;
kids = xref.fetch(kidsRef);
} else {
kidsRef = xref.getNewTemporaryRef();
structTreeRoot.set("K", kidsRef);
}
kids = Array.isArray(kids) ? kids.slice() : [kids];
const buffer = [];
const newNextkey = await StructTreeRoot.#writeKids({
newAnnotationsByPage,
structTreeRootRef,
kids,
kids: null,
nums,
xref,
pdfManager,
newRefs,
buffer,
cache,
});
structTreeRoot.set("ParentTreeNextKey", newNextkey);
buffer.length = 0;
await writeObject(kidsRef, kids, buffer, xref);
newRefs.push({ ref: kidsRef, data: buffer.join("") });
if (numsRef) {
buffer.length = 0;
await writeObject(numsRef, nums, buffer, xref);
newRefs.push({ ref: numsRef, data: buffer.join("") });
cache.put(numsRef, nums);
}
buffer.length = 0;
await writeObject(parentTreeRef, parentTree, buffer, xref);
newRefs.push({ ref: parentTreeRef, data: buffer.join("") });
buffer.length = 0;
await writeObject(structTreeRootRef, structTreeRoot, buffer, xref);
newRefs.push({ ref: structTreeRootRef, data: buffer.join("") });
const buffer = [];
for (const [ref, obj] of cache.items()) {
buffer.length = 0;
await writeObject(ref, obj, buffer, xref);
newRefs.push({ ref, data: buffer.join("") });
}
}
static async #writeKids({
@ -305,8 +288,7 @@ class StructTreeRoot {
nums,
xref,
pdfManager,
newRefs,
buffer,
cache,
}) {
const objr = Name.get("OBJR");
let nextKey = -Infinity;
@ -349,19 +331,15 @@ class StructTreeRoot {
tagDict.set("ActualText", actualText);
}
if (structTreeParent) {
await this.#updateParentTag({
structTreeParent,
tagDict,
newTagRef: tagRef,
fallbackRef: structTreeRootRef,
xref,
newRefs,
buffer,
});
} else {
tagDict.set("P", structTreeRootRef);
}
await this.#updateParentTag({
structTreeParent,
tagDict,
newTagRef: tagRef,
structTreeRootRef,
fallbackKids: kids,
xref,
cache,
});
const objDict = new Dict(xref);
tagDict.set("K", objDict);
@ -372,23 +350,24 @@ class StructTreeRoot {
}
objDict.set("Obj", ref);
buffer.length = 0;
await writeObject(tagRef, tagDict, buffer, xref);
newRefs.push({ ref: tagRef, data: buffer.join("") });
cache.put(tagRef, tagDict);
nums.push(parentTreeId, tagRef);
kids.push(tagRef);
}
}
return nextKey + 1;
}
static #collectParents({ elements, xref, pageDict, numberTree }) {
const idToElement = new Map();
const idToElements = new Map();
for (const element of elements) {
if (element.structTreeParentId) {
const id = parseInt(element.structTreeParentId.split("_mc")[1], 10);
idToElement.set(id, element);
let elems = idToElements.get(id);
if (!elems) {
elems = [];
idToElements.set(id, elems);
}
elems.push(element);
}
}
@ -400,13 +379,16 @@ class StructTreeRoot {
const parentArray = numberTree.get(id);
const updateElement = (kid, pageKid, kidRef) => {
const element = idToElement.get(kid);
if (element) {
const elems = idToElements.get(kid);
if (elems) {
const parentRef = pageKid.getRaw("P");
const parentDict = xref.fetchIfRef(parentRef);
if (parentRef instanceof Ref && parentDict instanceof Dict) {
// It should always be the case, but we check just in case.
element.structTreeParent = { ref: kidRef, dict: pageKid };
const params = { ref: kidRef, dict: pageKid };
for (const element of elems) {
element.structTreeParent = params;
}
}
return true;
}
@ -431,67 +413,73 @@ class StructTreeRoot {
if (Number.isInteger(kid) && updateElement(kid, pageKid, kidRef)) {
break;
}
if (!(kid instanceof Dict)) {
continue;
}
if (!isName(kid.get("Type"), "MCR")) {
break;
}
const mcid = kid.get("MCID");
if (Number.isInteger(mcid) && updateElement(mcid, pageKid, kidRef)) {
break;
}
}
}
}
// NOTE(review): this span is a rendered diff hunk; removed and added lines
// are interleaved without +/- markers, so it is not runnable as written
// (`ref`, `parentRef` and `parentDict` are each declared twice).
// The lines that use `cache` link `newTagRef` into the "K" entry of the
// parent of `structTreeParent` (or of the structure tree root when there is
// no `structTreeParent`) and record the modified objects in `cache`; the
// lines that use `buffer`/`newRefs` write the objects out immediately
// instead. Which lines belong to which version should be confirmed against
// the commit itself.
static async #updateParentTag({
structTreeParent: { ref, dict },
structTreeParent,
tagDict,
newTagRef,
fallbackRef,
structTreeRootRef,
fallbackKids,
xref,
newRefs,
buffer,
cache,
}) {
// We get the parent of the tag.
const parentRef = dict.getRaw("P");
let parentDict = xref.fetchIfRef(parentRef);
let ref = null;
let parentRef;
if (structTreeParent) {
({ ref } = structTreeParent);
// We get the parent of the tag.
parentRef = structTreeParent.dict.getRaw("P") || structTreeRootRef;
} else {
// Without a known parent tag, the new tag hangs directly under the root.
parentRef = structTreeRootRef;
}
tagDict.set("P", parentRef);
// We get the kids in order to insert a new tag at the right position.
let saveParentDict = false;
let parentKids;
let parentKidsRef = parentDict.getRaw("K");
if (!(parentKidsRef instanceof Ref)) {
parentKids = parentKidsRef;
parentKidsRef = xref.getNewTemporaryRef();
parentDict = parentDict.clone();
parentDict.set("K", parentKidsRef);
saveParentDict = true;
} else {
parentKids = xref.fetch(parentKidsRef);
}
if (Array.isArray(parentKids)) {
const index = parentKids.indexOf(ref);
if (index >= 0) {
parentKids = parentKids.slice();
parentKids.splice(index + 1, 0, newTagRef);
} else {
warn("Cannot update the struct tree: parent kid not found.");
tagDict.set("P", fallbackRef);
return;
}
} else if (parentKids instanceof Dict) {
parentKids = [parentKidsRef, newTagRef];
parentKidsRef = xref.getNewTemporaryRef();
parentDict.set("K", parentKidsRef);
saveParentDict = true;
}
buffer.length = 0;
await writeObject(parentKidsRef, parentKids, buffer, xref);
newRefs.push({ ref: parentKidsRef, data: buffer.join("") });
if (!saveParentDict) {
const parentDict = xref.fetchIfRef(parentRef);
// Nothing could be fetched for the parent: append the new tag to
// `fallbackKids` instead.
if (!parentDict) {
fallbackKids.push(newTagRef);
return;
}
buffer.length = 0;
await writeObject(parentRef, parentDict, buffer, xref);
newRefs.push({ ref: parentRef, data: buffer.join("") });
// Reuse the clone already stored in `cache` for this parent, if any, so
// that successive insertions under the same parent modify one object.
let cachedParentDict = cache.get(parentRef);
if (!cachedParentDict) {
cachedParentDict = parentDict.clone();
cache.put(parentRef, cachedParentDict);
}
const parentKidsRaw = cachedParentDict.getRaw("K");
let cachedParentKids =
parentKidsRaw instanceof Ref ? cache.get(parentKidsRaw) : null;
// No kids array cached yet: copy the existing array (or wrap a single
// non-array "K" value) and store the copy under a new temporary ref.
if (!cachedParentKids) {
cachedParentKids = xref.fetchIfRef(parentKidsRaw);
cachedParentKids = Array.isArray(cachedParentKids)
? cachedParentKids.slice()
: [parentKidsRaw];
const parentKidsRef = xref.getNewTemporaryRef();
cachedParentDict.set("K", parentKidsRef);
cache.put(parentKidsRef, cachedParentKids);
}
// Insert the new tag right after `ref` when `ref` is among the kids;
// otherwise append it at the end.
const index = cachedParentKids.indexOf(ref);
cachedParentKids.splice(
index >= 0 ? index + 1 : cachedParentKids.length,
0,
newTagRef
);
}
}

View file

@ -645,3 +645,4 @@
!issue12213.pdf
!tracemonkey_freetext.pdf
!issue17998.pdf
!pdfjs_wikipedia.pdf

BIN
test/pdfs/pdfjs_wikipedia.pdf Executable file

Binary file not shown.

View file

@ -1030,6 +1030,20 @@ describe("api", function () {
await pdfLoadingTask.destroy();
});
/**
 * Depth-first search of a structure tree for the first node accepted by
 * `check`.
 *
 * @param {Object|null} parent - The parent of `node`, or `null` for the root.
 * @param {Object} node - The node to test, and then to descend into.
 * @param {number} index - The position of `node` in `parent.children`.
 * @param {function(Object): boolean} check - Predicate selecting the node.
 * @returns {Array|null} `[previousSibling, matchingNode]`, where
 *   `previousSibling` is `undefined` when the match is the root or a first
 *   child; `null` when no node matches.
 */
function findNode(parent, node, index, check) {
  if (check(node)) {
    // The root is searched with a `null` parent, hence it has no predecessor.
    return [parent?.children[index - 1], node];
  }
  // Leaf nodes (e.g. content items) have no `children` property. The
  // fallback has to be applied before comparing: `i < a?.length ?? 0`
  // parses as `(i < a?.length) ?? 0`, which leaves the `?? 0` unused.
  const children = node.children ?? [];
  for (let i = 0; i < children.length; i++) {
    const elements = findNode(node, children[i], i, check);
    if (elements) {
      return elements;
    }
  }
  return null;
}
it("gets number of pages", function () {
  // `pdfDocument` comes from the enclosing suite.
  const { numPages } = pdfDocument;
  expect(numPages).toEqual(3);
});
@ -2396,7 +2410,22 @@ describe("api", function () {
pdfDoc = await loadingTask.promise;
const page = await pdfDoc.getPage(1);
const tree = await page.getStructTree();
const leaf = tree.children[0].children[6].children[1];
const [predecessor, leaf] = findNode(
null,
tree,
0,
node => node.role === "Figure"
);
expect(predecessor).toEqual({
role: "Span",
children: [
{
type: "content",
id: "p3R_mc12",
},
],
});
expect(leaf).toEqual({
role: "Figure",
@ -2412,6 +2441,104 @@ describe("api", function () {
await loadingTask.destroy();
});
// Adds two stamp annotations that share one `structTreeParentId`, saves the
// document, reloads it and checks where both "Figure" nodes ended up in the
// structure tree of the first page.
it("write a new stamp annotation in a tagged pdf (with some MCIDs), save and check the structure tree", async function () {
if (isNodeJS) {
pending("Cannot create a bitmap from Node.js.");
}
// Load the image used as the stamp bitmap.
const TEST_IMAGES_PATH = "../images/";
const filename = "firefox_logo.png";
const path = new URL(TEST_IMAGES_PATH + filename, window.location).href;
const response = await fetch(path);
const blob = await response.blob();
const bitmap = await createImageBitmap(blob);
let loadingTask = getDocument(
buildGetDocumentParams("pdfjs_wikipedia.pdf")
);
let pdfDoc = await loadingTask.promise;
// Both annotations use the same `structTreeParentId`; only the rect, the
// bitmap id and the alt text differ.
for (let i = 0; i < 2; i++) {
pdfDoc.annotationStorage.setValue(`pdfjs_internal_editor_${i}`, {
annotationType: AnnotationEditorType.STAMP,
bitmapId: `im${i}`,
pageIndex: 0,
rect: [257 + i, 572 + i, 286 + i, 603 + i],
rotation: 0,
isSvg: false,
structTreeParentId: "p2R_mc155",
accessibilityData: {
type: "Figure",
alt: `Firefox logo ${i}`,
},
bitmap: structuredClone(bitmap),
});
}
// Save, then reopen the saved bytes to inspect the resulting tree.
const data = await pdfDoc.saveDocument();
await loadingTask.destroy();
loadingTask = getDocument(data);
pdfDoc = await loadingTask.promise;
const page = await pdfDoc.getPage(1);
const tree = await page.getStructTree();
// "Firefox logo 1" is expected directly after the node holding the shared
// marked content (`p2R_mc155`)...
let [predecessor, figure] = findNode(
null,
tree,
0,
node => node.role === "Figure" && node.alt === "Firefox logo 1"
);
expect(predecessor).toEqual({
role: "NonStruct",
children: [
{
type: "content",
id: "p2R_mc155",
},
],
});
expect(figure).toEqual({
role: "Figure",
children: [
{
type: "annotation",
id: "pdfjs_internal_id_420R",
},
],
alt: "Firefox logo 1",
});
// ...and "Firefox logo 0" directly after "Firefox logo 1".
[predecessor, figure] = findNode(
null,
tree,
0,
node => node.role === "Figure" && node.alt === "Firefox logo 0"
);
expect(predecessor).toEqual({
role: "Figure",
children: [
{
type: "annotation",
id: "pdfjs_internal_id_420R",
},
],
alt: "Firefox logo 1",
});
expect(figure).toEqual({
role: "Figure",
children: [
{
type: "annotation",
id: "pdfjs_internal_id_416R",
},
],
alt: "Firefox logo 0",
});
await loadingTask.destroy();
});
it("write a new stamp annotation in a tagged pdf, save, repeat and check the structure tree", async function () {
if (isNodeJS) {
pending("Cannot create a bitmap from Node.js.");

View file

@ -498,6 +498,15 @@ describe("primitives", function () {
cache.put(ref2, obj2);
expect([...cache]).toEqual([obj1, obj2]);
});
it("should support iteration over key-value pairs", function () {
  // Fill the cache from a list of pairs, then check that `items()` yields
  // the same pairs in the same order.
  const expectedPairs = [
    [ref1, obj1],
    [ref2, obj2],
  ];
  for (const [ref, obj] of expectedPairs) {
    cache.put(ref, obj);
  }
  expect([...cache.items()]).toEqual(expectedPairs);
});
});
describe("isName", function () {