diff --git a/src/core/catalog.js b/src/core/catalog.js index 5d77972aa..47d2812de 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -166,7 +166,7 @@ class Catalog { return shadow( this, "lang", - typeof lang === "string" ? stringToPDFString(lang) : null + lang && typeof lang === "string" ? stringToPDFString(lang) : null ); } diff --git a/src/core/document.js b/src/core/document.js index 8e6c2243b..c59d3752b 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -395,10 +395,9 @@ class Page { } loadResources(keys) { - if (!this.resourcesPromise) { - // TODO: add async `_getInheritableProperty` and remove this. - this.resourcesPromise = this.pdfManager.ensure(this, "resources"); - } + // TODO: add async `_getInheritableProperty` and remove this. + this.resourcesPromise ||= this.pdfManager.ensure(this, "resources"); + return this.resourcesPromise.then(() => { const objectLoader = new ObjectLoader(this.resources, keys, this.xref); return objectLoader.load(); @@ -625,7 +624,7 @@ class Page { }); } - extractTextContent({ + async extractTextContent({ handler, task, includeMarkedContent, @@ -639,31 +638,35 @@ class Page { "Properties", "XObject", ]); + const langPromise = this.pdfManager.ensureCatalog("lang"); - const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]); - return dataPromises.then(([contentStream]) => { - const partialEvaluator = new PartialEvaluator({ - xref: this.xref, - handler, - pageIndex: this.pageIndex, - idFactory: this._localIdFactory, - fontCache: this.fontCache, - builtInCMapCache: this.builtInCMapCache, - standardFontDataCache: this.standardFontDataCache, - globalImageCache: this.globalImageCache, - systemFontCache: this.systemFontCache, - options: this.evaluatorOptions, - }); + const [contentStream, , lang] = await Promise.all([ + contentStreamPromise, + resourcesPromise, + langPromise, + ]); + const partialEvaluator = new PartialEvaluator({ + xref: this.xref, + handler, + pageIndex: this.pageIndex, + idFactory: this._localIdFactory, + fontCache: this.fontCache, + builtInCMapCache: this.builtInCMapCache, + standardFontDataCache: this.standardFontDataCache, + globalImageCache: this.globalImageCache, + systemFontCache: this.systemFontCache, + options: this.evaluatorOptions, + }); - return partialEvaluator.getTextContent({ - stream: contentStream, - task, - resources: this.resources, - includeMarkedContent, - disableNormalization, - sink, - viewBox: this.view, - }); + return partialEvaluator.getTextContent({ + stream: contentStream, + task, + resources: this.resources, + includeMarkedContent, + disableNormalization, + sink, + viewBox: this.view, + lang, }); } diff --git a/src/core/evaluator.js b/src/core/evaluator.js index f78d2d2e9..e1bd19434 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2307,6 +2307,7 @@ class PartialEvaluator { sink, seenStyles = new Set(), viewBox, + lang = null, markedContentData = null, disableNormalization = false, keepWhiteSpace = false, @@ -2323,6 +2324,7 @@ class PartialEvaluator { const textContent = { items: [], styles: Object.create(null), + lang, }; const textContentItem = { initialized: false, @@ -3296,6 +3298,7 @@ class PartialEvaluator { sink: sinkWrapper, seenStyles, viewBox, + lang, markedContentData, disableNormalization, keepWhiteSpace, diff --git a/src/display/api.js b/src/display/api.js index 90819139d..aaedfcf3b 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -1160,6 +1160,7 @@ class PDFDocumentProxy { * items are included when includeMarkedContent is true. * @property {Object} styles - {@link TextStyle} objects, * indexed by font name. + * @property {string | null} lang - The document /Lang attribute. */ /** @@ -1677,6 +1678,7 @@ class PDFPageProxy { resolve(textContent); return; } + textContent.lang ??= value.lang; Object.assign(textContent.styles, value.styles); textContent.items.push(...value.items); pump(); @@ -1687,6 +1689,7 @@ class PDFPageProxy { const textContent = { items: [], styles: Object.create(null), + lang: null, }; pump(); }); diff --git a/src/display/text_layer.js b/src/display/text_layer.js index f36bf62a4..6b4f70656 100644 --- a/src/display/text_layer.js +++ b/src/display/text_layer.js @@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8; const ascentCache = new Map(); let _canvasContext = null; -function getCtx() { +function getCtx(lang = null) { if (!_canvasContext) { // We don't use an OffscreenCanvas here because we use serif/sans serif // fonts with it and they depends on the locale. @@ -89,13 +89,13 @@ function cleanupTextLayer() { _canvasContext = null; } -function getAscent(fontFamily) { +function getAscent(fontFamily, lang) { const cachedAscent = ascentCache.get(fontFamily); if (cachedAscent) { return cachedAscent; } - const ctx = getCtx(); + const ctx = getCtx(lang); const savedFont = ctx.font; ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE; @@ -162,7 +162,7 @@ function getAscent(fontFamily) { return DEFAULT_FONT_ASCENT; } -function appendText(task, geom) { +function appendText(task, geom, lang) { // Initialize all used properties to keep the caches monomorphic. const textDiv = document.createElement("span"); const textDivProperties = { @@ -184,7 +184,7 @@ function appendText(task, geom) { const fontFamily = (task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily; const fontHeight = Math.hypot(tx[2], tx[3]); - const fontAscent = fontHeight * getAscent(fontFamily); + const fontAscent = fontHeight * getAscent(fontFamily, lang); let left, top; if (angle === 0) { @@ -324,7 +324,7 @@ class TextLayerRenderTask { div: null, scale: viewport.scale * (globalThis.devicePixelRatio || 1), properties: null, - ctx: getCtx(), + ctx: null, }; this._styleCache = Object.create(null); const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims; @@ -371,7 +371,11 @@ class TextLayerRenderTask { /** * @private */ - _processItems(items) { + _processItems(items, lang) { + if (!this._layoutTextParams.ctx) { + this._textDivProperties.set(this._rootContainer, { lang }); + this._layoutTextParams.ctx = getCtx(lang); + } const textDivs = this._textDivs, textContentItemsStr = this._textContentItemsStr; @@ -403,7 +407,7 @@ class TextLayerRenderTask { continue; } textContentItemsStr.push(item.str); - appendText(this, item); + appendText(this, item, lang); } } @@ -440,7 +444,7 @@ class TextLayerRenderTask { } Object.assign(styleCache, value.styles); - this._processItems(value.items); + this._processItems(value.items, value.lang); pump(); }, this._capability.reject); }; @@ -476,7 +480,7 @@ function updateTextLayer({ } if (mustRescale) { - const ctx = getCtx(); + const ctx = getCtx(textDivProperties.get(container)?.lang); const scale = viewport.scale * (globalThis.devicePixelRatio || 1); const params = { prevFontSize: null, diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 04646e893..c1ab291df 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -3128,10 +3128,11 @@ describe("api", function () { }); it("gets text content", async function () { - const { items, styles } = await page.getTextContent(); + const { items, styles, lang } = await page.getTextContent(); expect(items.length).toEqual(15); expect(objectSize(styles)).toEqual(5); + expect(lang).toEqual("en"); const text = mergeText(items); expect(text).toEqual(`Table Of Content @@ -3146,13 +3147,14 @@ page 1 / 3`); ); const pdfDoc = await loadingTask.promise; const pdfPage = await pdfDoc.getPage(1); - const { items, styles } = await pdfPage.getTextContent({ + const { items, styles, lang } = await pdfPage.getTextContent({ disableNormalization: true, }); expect(items.length).toEqual(1); // Font name will be a random object id. const fontName = items[0].fontName; expect(Object.keys(styles)).toEqual([fontName]); + expect(lang).toEqual(null); expect(items[0]).toEqual({ dir: "ltr",