1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-22 16:18:08 +02:00

Merge pull request #13171 from brendandahl/struct-tree

[api-minor] Add support for basic structure tree for accessibility.
This commit is contained in:
Tim van der Meij 2021-04-09 21:32:44 +02:00 committed by GitHub
commit 03c8c89002
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 911 additions and 14 deletions

View file

@ -58,6 +58,7 @@ import { calculateMD5 } from "./crypto.js";
import { Linearization } from "./parser.js";
import { OperatorList } from "./operator_list.js";
import { PartialEvaluator } from "./evaluator.js";
import { StructTreePage } from "./struct_tree.js";
import { XFAFactory } from "./xfa/factory.js";
const DEFAULT_USER_UNIT = 1.0;
@ -104,6 +105,10 @@ class Page {
static createObjId() {
return `p${pageIndex}_${++idCounters.obj}`;
}
static getPageObjId() {
return `page${ref.toString()}`;
}
};
}
@ -406,6 +411,7 @@ class Page {
handler,
task,
normalizeWhitespace,
includeMarkedContent,
sink,
combineTextItems,
}) {
@ -437,12 +443,22 @@ class Page {
task,
resources: this.resources,
normalizeWhitespace,
includeMarkedContent,
combineTextItems,
sink,
});
});
}
async getStructTree() {
const structTreeRoot = await this.pdfManager.ensureCatalog(
"structTreeRoot"
);
const tree = new StructTreePage(structTreeRoot, this.pageDict);
tree.parse();
return tree;
}
getAnnotationsData(intent) {
return this._parsedAnnotations.then(function (annotations) {
const annotationsData = [];
@ -604,6 +620,10 @@ class PDFDocument {
// Placeholder for the document-level id factory: it only reports, through
// `unreachable`, that the abstract `createObjId` was called.
// NOTE(review): `unreachable` presumably throws - confirm in shared/util.js.
static createObjId() {
unreachable("Abstract method `createObjId` called.");
}
// Placeholder for the document-level id factory: it only reports, through
// `unreachable`, that the abstract `getPageObjId` was called.
// NOTE(review): `unreachable` presumably throws - confirm in shared/util.js.
static getPageObjId() {
unreachable("Abstract method `getPageObjId` called.");
}
};
}

View file

@ -1913,7 +1913,10 @@ class PartialEvaluator {
return;
}
// Other marked content types aren't supported yet.
args = [args[0].name];
args = [
args[0].name,
args[1] instanceof Dict ? args[1].get("MCID") : null,
];
break;
case OPS.beginMarkedContent:
@ -1973,6 +1976,7 @@ class PartialEvaluator {
stateManager = null,
normalizeWhitespace = false,
combineTextItems = false,
includeMarkedContent = false,
sink,
seenStyles = new Set(),
}) {
@ -2573,6 +2577,7 @@ class PartialEvaluator {
stateManager: xObjStateManager,
normalizeWhitespace,
combineTextItems,
includeMarkedContent,
sink: sinkWrapper,
seenStyles,
})
@ -2650,6 +2655,38 @@ class PartialEvaluator {
})
);
return;
case OPS.beginMarkedContent:
if (includeMarkedContent) {
textContent.items.push({
type: "beginMarkedContent",
tag: isName(args[0]) ? args[0].name : null,
});
}
break;
case OPS.beginMarkedContentProps:
if (includeMarkedContent) {
flushTextContentItem();
let mcid = null;
if (isDict(args[1])) {
mcid = args[1].get("MCID");
}
textContent.items.push({
type: "beginMarkedContentProps",
id: Number.isInteger(mcid)
? `${self.idFactory.getPageObjId()}_mcid${mcid}`
: null,
tag: isName(args[0]) ? args[0].name : null,
});
}
break;
case OPS.endMarkedContent:
if (includeMarkedContent) {
flushTextContentItem();
textContent.items.push({
type: "endMarkedContent",
});
}
break;
} // switch
if (textContent.items.length >= sink.desiredSize) {
// Wait for ready, if we reach highWaterMark.

View file

@ -60,6 +60,7 @@ import { CipherTransformFactory } from "./crypto.js";
import { ColorSpace } from "./colorspace.js";
import { GlobalImageCache } from "./image_utils.js";
import { MetadataParser } from "./metadata_parser.js";
import { StructTreeRoot } from "./struct_tree.js";
function fetchDestination(dest) {
return isDict(dest) ? dest.get("D") : dest;
@ -200,6 +201,32 @@ class Catalog {
return markInfo;
}
get structTreeRoot() {
let structTree = null;
try {
structTree = this._readStructTreeRoot();
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
warn("Unable read to structTreeRoot info.");
}
return shadow(this, "structTreeRoot", structTree);
}
/**
* @private
*/
_readStructTreeRoot() {
const obj = this._catDict.get("StructTreeRoot");
if (!isDict(obj)) {
return null;
}
const root = new StructTreeRoot(obj);
root.init();
return root;
}
get toplevelPagesDict() {
const pagesObj = this._catDict.get("Pages");
if (!isDict(pagesObj)) {
@ -2626,4 +2653,4 @@ const ObjectLoader = (function () {
return ObjectLoader;
})();
export { Catalog, FileSpec, ObjectLoader, XRef };
export { Catalog, FileSpec, NumberTree, ObjectLoader, XRef };

335
src/core/struct_tree.js Normal file
View file

@ -0,0 +1,335 @@
/* Copyright 2021 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { isDict, isName, isRef } from "./primitives.js";
import { isString, stringToPDFString, warn } from "../shared/util.js";
import { NumberTree } from "./obj.js";
// Upper bound on how many levels are followed, both when walking up the
// parent chain of a structure element and when serializing the tree.
const MAX_DEPTH = 40;

// The kinds of kid a structure element can have. Frozen so that the shared
// lookup table cannot be modified by accident.
const StructElementType = Object.freeze({
  PAGE_CONTENT: "PAGE_CONTENT",
  STREAM_CONTENT: "STREAM_CONTENT",
  OBJECT: "OBJECT",
  ELEMENT: "ELEMENT",
});
/**
 * Wrapper around the document's structure tree root dictionary. It keeps the
 * dictionary itself plus the role map read from its `RoleMap` entry.
 */
class StructTreeRoot {
  /**
   * @param {Dict} rootDict - The structure tree root dictionary.
   */
  constructor(rootDict) {
    this.dict = rootDict;
    this.roleMap = new Map();
  }

  /**
   * Populates the wrapper from the root dictionary.
   */
  init() {
    this.readRoleMap();
  }

  /**
   * Copies every `RoleMap` entry whose value is a name into `this.roleMap`
   * (key -> name string). Other entries are ignored, and nothing happens
   * when `RoleMap` is not a dictionary.
   */
  readRoleMap() {
    const roleMapDict = this.dict.get("RoleMap");
    if (isDict(roleMapDict)) {
      roleMapDict.forEach((customRole, mappedRole) => {
        if (isName(mappedRole)) {
          this.roleMap.set(customRole, mappedRole.name);
        }
      });
    }
  }
}
/**
 * Wrapper for one structure element dictionary. Only the elements relevant to
 * a page are loaded rather than the whole tree, so this wrapper (and not the
 * raw dictionaries) is what represents the tree.
 */
class StructElementNode {
  /**
   * @param {StructTreePage} tree - Owning page tree; its `root.roleMap` and
   *   `pageDict.objId` are consulted.
   * @param {Dict} dict - The structure element dictionary.
   */
  constructor(tree, dict) {
    this.tree = tree;
    this.dict = dict;
    this.kids = [];
    this.parseKids();
  }

  /**
   * The element's role: the name in its `S` entry (or "" when that is not a
   * name), replaced by the root's role-map value when one exists.
   */
  get role() {
    const structType = this.dict.get("S");
    const rawRole = isName(structType) ? structType.name : "";
    const { roleMap } = this.tree.root;
    return roleMap.has(rawRole) ? roleMap.get(rawRole) : rawRole;
  }

  /**
   * Fills `this.kids` from the `K` entry, which may be a single kid or an
   * array of kids. Kids that `parseKid` rejects are dropped.
   */
  parseKids() {
    const pageRef = this.dict.getRaw("Pg");
    const pageObjId = isRef(pageRef) ? pageRef.toString() : null;

    const kidsEntry = this.dict.get("K");
    const candidates = Array.isArray(kidsEntry) ? kidsEntry : [kidsEntry];
    for (const candidate of candidates) {
      const parsed = this.parseKid(pageObjId, candidate);
      if (parsed) {
        this.kids.push(parsed);
      }
    }
  }

  /**
   * Converts one entry of `K` into a `StructElement`.
   *
   * @param {string|null} pageObjId - Page id inherited from this element's
   *   `Pg` entry; a kid dictionary's own `Pg` reference takes precedence.
   * @param {*} kid - An integer mcid, a reference, or a dictionary.
   * @returns {StructElement|null} `null` for unusable kids and for content
   *   or object kids that belong to a different page.
   */
  parseKid(pageObjId, kid) {
    const belongsToPage = objId => this.tree.pageDict.objId === objId;

    // An integer kid is a marked content id that links directly to content.
    if (Number.isInteger(kid)) {
      return belongsToPage(pageObjId)
        ? new StructElement({
            type: StructElementType.PAGE_CONTENT,
            mcid: kid,
            pageObjId,
          })
        : null;
    }

    // Anything else has to resolve to a dictionary.
    let kidDict = null;
    if (isRef(kid)) {
      kidDict = this.dict.xref.fetch(kid);
    } else if (isDict(kid)) {
      kidDict = kid;
    }
    if (!kidDict) {
      return null;
    }

    const kidPageRef = kidDict.getRaw("Pg");
    const kidPageObjId = isRef(kidPageRef) ? kidPageRef.toString() : pageObjId;

    const typeEntry = kidDict.get("Type");
    const kidType = isName(typeEntry) ? typeEntry.name : null;

    switch (kidType) {
      case "MCR": {
        if (!belongsToPage(kidPageObjId)) {
          return null;
        }
        const streamRef = kidDict.getRaw("Stm");
        return new StructElement({
          type: StructElementType.STREAM_CONTENT,
          refObjId: isRef(streamRef) ? streamRef.toString() : null,
          pageObjId: kidPageObjId,
          mcid: kidDict.get("MCID"),
        });
      }
      case "OBJR": {
        if (!belongsToPage(kidPageObjId)) {
          return null;
        }
        const objectRef = kidDict.getRaw("Obj");
        return new StructElement({
          type: StructElementType.OBJECT,
          refObjId: isRef(objectRef) ? objectRef.toString() : null,
          pageObjId: kidPageObjId,
        });
      }
      default:
        // Every other dictionary is treated as a nested structure element.
        return new StructElement({
          type: StructElementType.ELEMENT,
          dict: kidDict,
        });
    }
  }
}
/**
 * Plain record describing one kid of a structure element.
 */
class StructElement {
  /**
   * @param {Object} options
   * @param {string} options.type - One of the `StructElementType` values.
   * @param {Dict|null} [options.dict] - Kid dictionary (nested elements).
   * @param {number|null} [options.mcid] - Marked content id.
   * @param {string|null} [options.pageObjId] - Id of the page the kid is on.
   * @param {string|null} [options.refObjId] - Id of a referenced object.
   */
  constructor(options) {
    const {
      type,
      dict = null,
      mcid = null,
      pageObjId = null,
      refObjId = null,
    } = options;

    Object.assign(this, {
      type,
      dict,
      mcid,
      pageObjId,
      refObjId,
      // Filled in later, once the node wrapper for this kid is known.
      parentNode: null,
    });
  }
}
/**
 * Page-scoped view of the document's structure tree.
 *
 * `parse` looks up this page's entry in the root's `ParentTree`, then loads
 * each referenced structure element together with its chain of parents.
 * `serializable` turns the result into plain objects.
 */
class StructTreePage {
  /**
   * @param {StructTreeRoot|null} structTreeRoot - Parsed structure tree root;
   *   when falsy, `parse` does nothing and the tree stays empty.
   * @param {Dict} pageDict - Dictionary of the page; its `StructParents`
   *   entry and `objId` are used.
   */
  constructor(structTreeRoot, pageDict) {
    this.root = structTreeRoot;
    this.rootDict = structTreeRoot ? structTreeRoot.dict : null;
    this.pageDict = pageDict;
    this.nodes = [];
  }

  /**
   * Loads the structure elements for this page into `this.nodes`.
   *
   * Returns silently, leaving the tree empty, when there is no root, no
   * `ParentTree`, no integer `StructParents` on the page, or when the
   * `ParentTree` entry for the page is not an array.
   */
  parse() {
    if (!this.root || !this.rootDict) {
      return;
    }

    const parentTree = this.rootDict.get("ParentTree");
    if (!parentTree) {
      return;
    }
    const id = this.pageDict.get("StructParents");
    if (!Number.isInteger(id)) {
      return;
    }
    const numberTree = new NumberTree(parentTree, this.rootDict.xref);
    const parentArray = numberTree.get(id);
    if (!Array.isArray(parentArray)) {
      return;
    }

    // Shared between all `addNode` calls so that a dictionary reached through
    // several kids is only wrapped once.
    const map = new Map();
    for (const ref of parentArray) {
      if (!isRef(ref)) {
        continue;
      }
      const dict = this.rootDict.xref.fetch(ref);
      // A reference that does not resolve to a dictionary cannot be a
      // structure element; skip it instead of failing the whole page.
      if (isDict(dict)) {
        this.addNode(dict, map);
      }
    }
  }

  /**
   * Wraps `dict` in a `StructElementNode` and links it into the tree by
   * first adding its parents, all the way up to the root.
   *
   * @param {Dict} dict - Structure element dictionary.
   * @param {Map<Dict, StructElementNode>} map - Nodes created so far.
   * @param {number} [level] - Current distance from the starting element.
   * @returns {StructElementNode|null} The node for `dict`, or `null` when
   *   `MAX_DEPTH` was exceeded.
   */
  addNode(dict, map, level = 0) {
    if (level > MAX_DEPTH) {
      warn("StructTree MAX_DEPTH reached.");
      return null;
    }

    if (map.has(dict)) {
      return map.get(dict);
    }

    const element = new StructElementNode(this, dict);
    map.set(dict, element);

    const parent = dict.get("P");

    // A missing or malformed (non-dictionary) parent is handled the same way
    // as a parent that is the root: the element can only be kept if the
    // root's `K` entry refers to it.
    if (!isDict(parent) || isName(parent.get("Type"), "StructTreeRoot")) {
      if (!this.addTopLevelNode(dict, element)) {
        map.delete(dict);
      }
      return element;
    }

    const parentNode = this.addNode(parent, map, level + 1);
    if (!parentNode) {
      return element;
    }

    let save = false;
    for (const kid of parentNode.kids) {
      if (kid.type === StructElementType.ELEMENT && kid.dict === dict) {
        // Despite its name, `parentNode` on a kid holds the node wrapper for
        // that kid; `serializable` follows it to descend into the tree.
        kid.parentNode = element;
        save = true;
      }
    }
    // The parent does not list this element among its kids: forget it.
    if (!save) {
      map.delete(dict);
    }
    return element;
  }

  /**
   * Stores `element` in `this.nodes` at the position(s) where the root's `K`
   * entry refers to `dict`.
   *
   * @param {Dict} dict - Structure element dictionary.
   * @param {StructElementNode} element - The node wrapping `dict`.
   * @returns {boolean} `false` when the root has no `K` entry or `K` does not
   *   refer to `dict`. Note that a `K` that is neither a dictionary nor an
   *   array yields `true` without storing anything.
   */
  addTopLevelNode(dict, element) {
    const obj = this.rootDict.get("K");
    if (!obj) {
      return false;
    }

    if (isDict(obj)) {
      if (obj.objId !== dict.objId) {
        return false;
      }
      this.nodes[0] = element;
      return true;
    }

    if (!Array.isArray(obj)) {
      return true;
    }

    let save = false;
    for (let i = 0; i < obj.length; i++) {
      const kidRef = obj[i];
      if (kidRef && kidRef.toString() === dict.objId) {
        // Keep the same index as in `K`; `serializable` skips the holes.
        this.nodes[i] = element;
        save = true;
      }
    }
    return save;
  }

  /**
   * Convert the tree structure into a simplified object literal that can
   * be sent to the main thread.
   * @returns {Object}
   */
  get serializable() {
    function nodeToSerializable(node, parent, level = 0) {
      if (level > MAX_DEPTH) {
        warn("StructTree too deep to be fully serialized.");
        return;
      }
      const obj = Object.create(null);
      obj.role = node.role;
      obj.children = [];
      parent.children.push(obj);

      const alt = node.dict.get("Alt");
      if (isString(alt)) {
        obj.alt = stringToPDFString(alt);
      }

      for (const kid of node.kids) {
        const kidElement =
          kid.type === StructElementType.ELEMENT ? kid.parentNode : null;
        if (kidElement) {
          nodeToSerializable(kidElement, obj, level + 1);
          continue;
        }
        if (
          kid.type === StructElementType.PAGE_CONTENT ||
          kid.type === StructElementType.STREAM_CONTENT
        ) {
          obj.children.push({
            type: "content",
            id: `page${kid.pageObjId}_mcid${kid.mcid}`,
          });
        } else if (kid.type === StructElementType.OBJECT) {
          obj.children.push({
            type: "object",
            id: kid.refObjId,
          });
        }
        // Element kids without a linked node were not loaded for this page
        // and are left out.
      }
    }

    const root = Object.create(null);
    root.children = [];
    root.role = "Root";
    for (const child of this.nodes) {
      // `nodes` can be sparse, see `addTopLevelNode`.
      if (!child) {
        continue;
      }
      nodeToSerializable(child, root);
    }
    return root;
  }
}
export { StructTreePage, StructTreeRoot };

View file

@ -717,6 +717,7 @@ class WorkerMessageHandler {
task,
sink,
normalizeWhitespace: data.normalizeWhitespace,
includeMarkedContent: data.includeMarkedContent,
combineTextItems: data.combineTextItems,
})
.then(
@ -745,6 +746,18 @@ class WorkerMessageHandler {
});
});
// Answers "GetStructTree" requests: loads the requested page, asks the PDF
// manager to run the page's `getStructTree`, and replies with the
// `serializable` form of the resulting tree.
handler.on("GetStructTree", function wphGetStructTree({ pageIndex }) {
  return pdfManager
    .getPage(pageIndex)
    .then(page => pdfManager.ensure(page, "getStructTree"))
    .then(structTree => structTree.serializable);
});
// Answers "FontFallback" requests by forwarding the font id, together with
// this handler, to the PDF manager.
handler.on("FontFallback", function ({ id }) {
  return pdfManager.fontFallback(id, handler);
});

View file

@ -1026,13 +1026,17 @@ class PDFDocumentProxy {
* whitespace with standard spaces (0x20). The default value is `false`.
* @property {boolean} disableCombineTextItems - Do not attempt to combine
* same line {@link TextItem}'s. The default value is `false`.
* @property {boolean} [includeMarkedContent] - When true include marked
* content items in the items array of TextContent. The default is `false`.
*/
/**
* Page text content.
*
* @typedef {Object} TextContent
* @property {Array<TextItem>} items - Array of {@link TextItem} objects.
* @property {Array<TextItem | TextMarkedContent>} items - Array of
* {@link TextItem} and {@link TextMarkedContent} objects. TextMarkedContent
* items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name.
*/
@ -1047,6 +1051,17 @@ class PDFDocumentProxy {
* @property {number} width - Width in device space.
* @property {number} height - Height in device space.
* @property {string} fontName - Font name used by PDF.js for converted font.
*
*/
/**
* Page text marked content part.
*
* @typedef {Object} TextMarkedContent
* @property {string} type - Either 'beginMarkedContent',
* 'beginMarkedContentProps', or 'endMarkedContent'.
* @property {string} id - The marked content identifier. Only used for type
* 'beginMarkedContentProps'.
*/
/**
@ -1103,6 +1118,25 @@ class PDFDocumentProxy {
* states set.
*/
/**
* Structure tree node. The root node will have a role "Root".
*
* @typedef {Object} StructTreeNode
* @property {Array<StructTreeNode | StructTreeContent>} children - Array of
* {@link StructTreeNode} and {@link StructTreeContent} objects.
* @property {string} role - element's role, already mapped if a role map exists
* in the PDF.
*/
/**
* Structure tree content.
*
* @typedef {Object} StructTreeContent
* @property {string} type - either "content" for page and stream structure
* elements or "object" for object references.
* @property {string} id - unique id that will map to the text layer.
*/
/**
* PDF page operator list.
*
@ -1435,6 +1469,7 @@ class PDFPageProxy {
streamTextContent({
normalizeWhitespace = false,
disableCombineTextItems = false,
includeMarkedContent = false,
} = {}) {
const TEXT_CONTENT_CHUNK_SIZE = 100;
@ -1444,6 +1479,7 @@ class PDFPageProxy {
pageIndex: this._pageIndex,
normalizeWhitespace: normalizeWhitespace === true,
combineTextItems: disableCombineTextItems !== true,
includeMarkedContent: includeMarkedContent === true,
},
{
highWaterMark: TEXT_CONTENT_CHUNK_SIZE,
@ -1484,6 +1520,16 @@ class PDFPageProxy {
});
}
/**
* @returns {Promise<StructTreeNode>} A promise that is resolved with a
* {@link StructTreeNode} object that represents the page's structure tree.
*/
getStructTree() {
return (this._structTreePromise ||= this._transport.getStructTree(
this._pageIndex
));
}
/**
* Destroys the page object.
* @private
@ -1513,6 +1559,7 @@ class PDFPageProxy {
this._annotationsPromise = null;
this._jsActionsPromise = null;
this._xfaPromise = null;
this._structTreePromise = null;
this.pendingCleanup = false;
return Promise.all(waitOn);
}
@ -1548,6 +1595,7 @@ class PDFPageProxy {
this._annotationsPromise = null;
this._jsActionsPromise = null;
this._xfaPromise = null;
this._structTreePromise = null;
if (resetStats && this._stats) {
this._stats = new StatTimer();
}
@ -2773,6 +2821,12 @@ class WorkerTransport {
});
}
getStructTree(pageIndex) {
return this.messageHandler.sendWithPromise("GetStructTree", {
pageIndex,
});
}
getOutline() {
return this.messageHandler.sendWithPromise("GetOutline", null);
}

View file

@ -638,6 +638,23 @@ const renderTextLayer = (function renderTextLayerClosure() {
_processItems(items, styleCache) {
for (let i = 0, len = items.length; i < len; i++) {
if (items[i].str === undefined) {
if (
items[i].type === "beginMarkedContentProps" ||
items[i].type === "beginMarkedContent"
) {
const parent = this._container;
this._container = document.createElement("span");
this._container.classList.add("markedContent");
if (items[i].id !== null) {
this._container.setAttribute("id", `${items[i].id}`);
}
parent.appendChild(this._container);
} else if (items[i].type === "endMarkedContent") {
this._container = this._container.parentNode;
}
continue;
}
this._textContentItemsStr.push(items[i].str);
appendText(this, items[i], styleCache, this._layoutTextCtx);
}