1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-22 16:18:08 +02:00

[api-minor] Include the document /Lang attribute in the textContent-data

- These changes will allow a simpler way of implementing PR 17770.

 - The /Lang attribute is fetched lazily, with the first `getTextContent` invocation. Given the existing worker-thread caching, this will thus only need to be done *once* per PDF document (and most PDFs don't include this data).

 - This makes the /Lang attribute *directly available* in the `textLayer`, which has the following advantages:
    - We don't need to block, and thus delay, overall viewer initialization on fetching it (nor pass it around throughout the viewer).

    - Third-party users of the `textLayer` will automatically benefit from this, once we start actually using the /Lang attribute in PR 17770.
      *Please note:* This also, importantly, means that the `text` reference-tests will then cover this code (which wouldn't otherwise have been the case).
This commit is contained in:
Jonas Jenwald 2024-04-15 12:30:09 +02:00
parent c0b5d93ef4
commit 6d523c316c
6 changed files with 56 additions and 41 deletions

View file

@ -166,7 +166,7 @@ class Catalog {
return shadow(
this,
"lang",
typeof lang === "string" ? stringToPDFString(lang) : null
lang && typeof lang === "string" ? stringToPDFString(lang) : null
);
}

View file

@ -395,10 +395,9 @@ class Page {
}
loadResources(keys) {
if (!this.resourcesPromise) {
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise = this.pdfManager.ensure(this, "resources");
}
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise ||= this.pdfManager.ensure(this, "resources");
return this.resourcesPromise.then(() => {
const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
return objectLoader.load();
@ -625,7 +624,7 @@ class Page {
});
}
extractTextContent({
async extractTextContent({
handler,
task,
includeMarkedContent,
@ -639,31 +638,35 @@ class Page {
"Properties",
"XObject",
]);
const langPromise = this.pdfManager.ensureCatalog("lang");
const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
return dataPromises.then(([contentStream]) => {
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});
const [contentStream, , lang] = await Promise.all([
contentStreamPromise,
resourcesPromise,
langPromise,
]);
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});
return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
});
return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
lang,
});
}

View file

@ -2307,6 +2307,7 @@ class PartialEvaluator {
sink,
seenStyles = new Set(),
viewBox,
lang = null,
markedContentData = null,
disableNormalization = false,
keepWhiteSpace = false,
@ -2323,6 +2324,7 @@ class PartialEvaluator {
const textContent = {
items: [],
styles: Object.create(null),
lang,
};
const textContentItem = {
initialized: false,
@ -3296,6 +3298,7 @@ class PartialEvaluator {
sink: sinkWrapper,
seenStyles,
viewBox,
lang,
markedContentData,
disableNormalization,
keepWhiteSpace,

View file

@ -1160,6 +1160,7 @@ class PDFDocumentProxy {
* items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name.
* @property {string | null} lang - The document /Lang attribute.
*/
/**
@ -1677,6 +1678,7 @@ class PDFPageProxy {
resolve(textContent);
return;
}
textContent.lang ??= value.lang;
Object.assign(textContent.styles, value.styles);
textContent.items.push(...value.items);
pump();
@ -1687,6 +1689,7 @@ class PDFPageProxy {
const textContent = {
items: [],
styles: Object.create(null),
lang: null,
};
pump();
});

View file

@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8;
const ascentCache = new Map();
let _canvasContext = null;
function getCtx() {
function getCtx(lang = null) {
if (!_canvasContext) {
// We don't use an OffscreenCanvas here because we use serif/sans serif
// fonts with it and they depends on the locale.
@ -89,13 +89,13 @@ function cleanupTextLayer() {
_canvasContext = null;
}
function getAscent(fontFamily) {
function getAscent(fontFamily, lang) {
const cachedAscent = ascentCache.get(fontFamily);
if (cachedAscent) {
return cachedAscent;
}
const ctx = getCtx();
const ctx = getCtx(lang);
const savedFont = ctx.font;
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
@ -162,7 +162,7 @@ function getAscent(fontFamily) {
return DEFAULT_FONT_ASCENT;
}
function appendText(task, geom) {
function appendText(task, geom, lang) {
// Initialize all used properties to keep the caches monomorphic.
const textDiv = document.createElement("span");
const textDivProperties = {
@ -184,7 +184,7 @@ function appendText(task, geom) {
const fontFamily =
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
const fontHeight = Math.hypot(tx[2], tx[3]);
const fontAscent = fontHeight * getAscent(fontFamily);
const fontAscent = fontHeight * getAscent(fontFamily, lang);
let left, top;
if (angle === 0) {
@ -324,7 +324,7 @@ class TextLayerRenderTask {
div: null,
scale: viewport.scale * (globalThis.devicePixelRatio || 1),
properties: null,
ctx: getCtx(),
ctx: null,
};
this._styleCache = Object.create(null);
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
@ -371,7 +371,11 @@ class TextLayerRenderTask {
/**
* @private
*/
_processItems(items) {
_processItems(items, lang) {
if (!this._layoutTextParams.ctx) {
this._textDivProperties.set(this._rootContainer, { lang });
this._layoutTextParams.ctx = getCtx(lang);
}
const textDivs = this._textDivs,
textContentItemsStr = this._textContentItemsStr;
@ -403,7 +407,7 @@ class TextLayerRenderTask {
continue;
}
textContentItemsStr.push(item.str);
appendText(this, item);
appendText(this, item, lang);
}
}
@ -440,7 +444,7 @@ class TextLayerRenderTask {
}
Object.assign(styleCache, value.styles);
this._processItems(value.items);
this._processItems(value.items, value.lang);
pump();
}, this._capability.reject);
};
@ -476,7 +480,7 @@ function updateTextLayer({
}
if (mustRescale) {
const ctx = getCtx();
const ctx = getCtx(textDivProperties.get(container)?.lang);
const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
const params = {
prevFontSize: null,