1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-19 22:58:07 +02:00

Merge pull request #17941 from Snuffleupagus/getTextContent-lang

[api-minor] Include the document /Lang attribute in the textContent-data
This commit is contained in:
Jonas Jenwald 2024-05-14 13:57:46 +02:00 committed by GitHub
commit bb9bb34721
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 56 additions and 41 deletions

View file

@ -166,7 +166,7 @@ class Catalog {
return shadow(
this,
"lang",
typeof lang === "string" ? stringToPDFString(lang) : null
lang && typeof lang === "string" ? stringToPDFString(lang) : null
);
}

View file

@ -395,10 +395,9 @@ class Page {
}
loadResources(keys) {
if (!this.resourcesPromise) {
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise = this.pdfManager.ensure(this, "resources");
}
// TODO: add async `_getInheritableProperty` and remove this.
this.resourcesPromise ||= this.pdfManager.ensure(this, "resources");
return this.resourcesPromise.then(() => {
const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
return objectLoader.load();
@ -625,7 +624,7 @@ class Page {
});
}
extractTextContent({
async extractTextContent({
handler,
task,
includeMarkedContent,
@ -639,31 +638,35 @@ class Page {
"Properties",
"XObject",
]);
const langPromise = this.pdfManager.ensureCatalog("lang");
const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
return dataPromises.then(([contentStream]) => {
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});
const [contentStream, , lang] = await Promise.all([
contentStreamPromise,
resourcesPromise,
langPromise,
]);
const partialEvaluator = new PartialEvaluator({
xref: this.xref,
handler,
pageIndex: this.pageIndex,
idFactory: this._localIdFactory,
fontCache: this.fontCache,
builtInCMapCache: this.builtInCMapCache,
standardFontDataCache: this.standardFontDataCache,
globalImageCache: this.globalImageCache,
systemFontCache: this.systemFontCache,
options: this.evaluatorOptions,
});
return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
});
return partialEvaluator.getTextContent({
stream: contentStream,
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
lang,
});
}

View file

@ -2307,6 +2307,7 @@ class PartialEvaluator {
sink,
seenStyles = new Set(),
viewBox,
lang = null,
markedContentData = null,
disableNormalization = false,
keepWhiteSpace = false,
@ -2323,6 +2324,7 @@ class PartialEvaluator {
const textContent = {
items: [],
styles: Object.create(null),
lang,
};
const textContentItem = {
initialized: false,
@ -3296,6 +3298,7 @@ class PartialEvaluator {
sink: sinkWrapper,
seenStyles,
viewBox,
lang,
markedContentData,
disableNormalization,
keepWhiteSpace,

View file

@ -1160,6 +1160,7 @@ class PDFDocumentProxy {
* items are included when includeMarkedContent is true.
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
* indexed by font name.
* @property {string | null} lang - The document /Lang attribute.
*/
/**
@ -1677,6 +1678,7 @@ class PDFPageProxy {
resolve(textContent);
return;
}
textContent.lang ??= value.lang;
Object.assign(textContent.styles, value.styles);
textContent.items.push(...value.items);
pump();
@ -1687,6 +1689,7 @@ class PDFPageProxy {
const textContent = {
items: [],
styles: Object.create(null),
lang: null,
};
pump();
});

View file

@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8;
const ascentCache = new Map();
let _canvasContext = null;
function getCtx() {
function getCtx(lang = null) {
if (!_canvasContext) {
// We don't use an OffscreenCanvas here because we use serif/sans serif
// fonts with it and they depend on the locale.
@ -89,13 +89,13 @@ function cleanupTextLayer() {
_canvasContext = null;
}
function getAscent(fontFamily) {
function getAscent(fontFamily, lang) {
const cachedAscent = ascentCache.get(fontFamily);
if (cachedAscent) {
return cachedAscent;
}
const ctx = getCtx();
const ctx = getCtx(lang);
const savedFont = ctx.font;
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
@ -162,7 +162,7 @@ function getAscent(fontFamily) {
return DEFAULT_FONT_ASCENT;
}
function appendText(task, geom) {
function appendText(task, geom, lang) {
// Initialize all used properties to keep the caches monomorphic.
const textDiv = document.createElement("span");
const textDivProperties = {
@ -184,7 +184,7 @@ function appendText(task, geom) {
const fontFamily =
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
const fontHeight = Math.hypot(tx[2], tx[3]);
const fontAscent = fontHeight * getAscent(fontFamily);
const fontAscent = fontHeight * getAscent(fontFamily, lang);
let left, top;
if (angle === 0) {
@ -324,7 +324,7 @@ class TextLayerRenderTask {
div: null,
scale: viewport.scale * (globalThis.devicePixelRatio || 1),
properties: null,
ctx: getCtx(),
ctx: null,
};
this._styleCache = Object.create(null);
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
@ -371,7 +371,11 @@ class TextLayerRenderTask {
/**
* @private
*/
_processItems(items) {
_processItems(items, lang) {
if (!this._layoutTextParams.ctx) {
this._textDivProperties.set(this._rootContainer, { lang });
this._layoutTextParams.ctx = getCtx(lang);
}
const textDivs = this._textDivs,
textContentItemsStr = this._textContentItemsStr;
@ -403,7 +407,7 @@ class TextLayerRenderTask {
continue;
}
textContentItemsStr.push(item.str);
appendText(this, item);
appendText(this, item, lang);
}
}
@ -440,7 +444,7 @@ class TextLayerRenderTask {
}
Object.assign(styleCache, value.styles);
this._processItems(value.items);
this._processItems(value.items, value.lang);
pump();
}, this._capability.reject);
};
@ -476,7 +480,7 @@ function updateTextLayer({
}
if (mustRescale) {
const ctx = getCtx();
const ctx = getCtx(textDivProperties.get(container)?.lang);
const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
const params = {
prevFontSize: null,

View file

@ -3128,10 +3128,11 @@ describe("api", function () {
});
it("gets text content", async function () {
const { items, styles } = await page.getTextContent();
const { items, styles, lang } = await page.getTextContent();
expect(items.length).toEqual(15);
expect(objectSize(styles)).toEqual(5);
expect(lang).toEqual("en");
const text = mergeText(items);
expect(text).toEqual(`Table Of Content
@ -3146,13 +3147,14 @@ page 1 / 3`);
);
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const { items, styles } = await pdfPage.getTextContent({
const { items, styles, lang } = await pdfPage.getTextContent({
disableNormalization: true,
});
expect(items.length).toEqual(1);
// Font name will be a random object id.
const fontName = items[0].fontName;
expect(Object.keys(styles)).toEqual([fontName]);
expect(lang).toEqual(null);
expect(items[0]).toEqual({
dir: "ltr",