mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-19 22:58:07 +02:00
Merge pull request #17941 from Snuffleupagus/getTextContent-lang
[api-minor] Include the document /Lang attribute in the textContent-data
This commit is contained in:
commit
bb9bb34721
6 changed files with 56 additions and 41 deletions
|
@ -166,7 +166,7 @@ class Catalog {
|
|||
return shadow(
|
||||
this,
|
||||
"lang",
|
||||
typeof lang === "string" ? stringToPDFString(lang) : null
|
||||
lang && typeof lang === "string" ? stringToPDFString(lang) : null
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -395,10 +395,9 @@ class Page {
|
|||
}
|
||||
|
||||
loadResources(keys) {
|
||||
if (!this.resourcesPromise) {
|
||||
// TODO: add async `_getInheritableProperty` and remove this.
|
||||
this.resourcesPromise = this.pdfManager.ensure(this, "resources");
|
||||
}
|
||||
// TODO: add async `_getInheritableProperty` and remove this.
|
||||
this.resourcesPromise ||= this.pdfManager.ensure(this, "resources");
|
||||
|
||||
return this.resourcesPromise.then(() => {
|
||||
const objectLoader = new ObjectLoader(this.resources, keys, this.xref);
|
||||
return objectLoader.load();
|
||||
|
@ -625,7 +624,7 @@ class Page {
|
|||
});
|
||||
}
|
||||
|
||||
extractTextContent({
|
||||
async extractTextContent({
|
||||
handler,
|
||||
task,
|
||||
includeMarkedContent,
|
||||
|
@ -639,31 +638,35 @@ class Page {
|
|||
"Properties",
|
||||
"XObject",
|
||||
]);
|
||||
const langPromise = this.pdfManager.ensureCatalog("lang");
|
||||
|
||||
const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]);
|
||||
return dataPromises.then(([contentStream]) => {
|
||||
const partialEvaluator = new PartialEvaluator({
|
||||
xref: this.xref,
|
||||
handler,
|
||||
pageIndex: this.pageIndex,
|
||||
idFactory: this._localIdFactory,
|
||||
fontCache: this.fontCache,
|
||||
builtInCMapCache: this.builtInCMapCache,
|
||||
standardFontDataCache: this.standardFontDataCache,
|
||||
globalImageCache: this.globalImageCache,
|
||||
systemFontCache: this.systemFontCache,
|
||||
options: this.evaluatorOptions,
|
||||
});
|
||||
const [contentStream, , lang] = await Promise.all([
|
||||
contentStreamPromise,
|
||||
resourcesPromise,
|
||||
langPromise,
|
||||
]);
|
||||
const partialEvaluator = new PartialEvaluator({
|
||||
xref: this.xref,
|
||||
handler,
|
||||
pageIndex: this.pageIndex,
|
||||
idFactory: this._localIdFactory,
|
||||
fontCache: this.fontCache,
|
||||
builtInCMapCache: this.builtInCMapCache,
|
||||
standardFontDataCache: this.standardFontDataCache,
|
||||
globalImageCache: this.globalImageCache,
|
||||
systemFontCache: this.systemFontCache,
|
||||
options: this.evaluatorOptions,
|
||||
});
|
||||
|
||||
return partialEvaluator.getTextContent({
|
||||
stream: contentStream,
|
||||
task,
|
||||
resources: this.resources,
|
||||
includeMarkedContent,
|
||||
disableNormalization,
|
||||
sink,
|
||||
viewBox: this.view,
|
||||
});
|
||||
return partialEvaluator.getTextContent({
|
||||
stream: contentStream,
|
||||
task,
|
||||
resources: this.resources,
|
||||
includeMarkedContent,
|
||||
disableNormalization,
|
||||
sink,
|
||||
viewBox: this.view,
|
||||
lang,
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -2307,6 +2307,7 @@ class PartialEvaluator {
|
|||
sink,
|
||||
seenStyles = new Set(),
|
||||
viewBox,
|
||||
lang = null,
|
||||
markedContentData = null,
|
||||
disableNormalization = false,
|
||||
keepWhiteSpace = false,
|
||||
|
@ -2323,6 +2324,7 @@ class PartialEvaluator {
|
|||
const textContent = {
|
||||
items: [],
|
||||
styles: Object.create(null),
|
||||
lang,
|
||||
};
|
||||
const textContentItem = {
|
||||
initialized: false,
|
||||
|
@ -3296,6 +3298,7 @@ class PartialEvaluator {
|
|||
sink: sinkWrapper,
|
||||
seenStyles,
|
||||
viewBox,
|
||||
lang,
|
||||
markedContentData,
|
||||
disableNormalization,
|
||||
keepWhiteSpace,
|
||||
|
|
|
@ -1160,6 +1160,7 @@ class PDFDocumentProxy {
|
|||
* items are included when includeMarkedContent is true.
|
||||
* @property {Object<string, TextStyle>} styles - {@link TextStyle} objects,
|
||||
* indexed by font name.
|
||||
* @property {string | null} lang - The document /Lang attribute.
|
||||
*/
|
||||
|
||||
/**
|
||||
|
@ -1677,6 +1678,7 @@ class PDFPageProxy {
|
|||
resolve(textContent);
|
||||
return;
|
||||
}
|
||||
textContent.lang ??= value.lang;
|
||||
Object.assign(textContent.styles, value.styles);
|
||||
textContent.items.push(...value.items);
|
||||
pump();
|
||||
|
@ -1687,6 +1689,7 @@ class PDFPageProxy {
|
|||
const textContent = {
|
||||
items: [],
|
||||
styles: Object.create(null),
|
||||
lang: null,
|
||||
};
|
||||
pump();
|
||||
});
|
||||
|
|
|
@ -64,7 +64,7 @@ const DEFAULT_FONT_ASCENT = 0.8;
|
|||
const ascentCache = new Map();
|
||||
let _canvasContext = null;
|
||||
|
||||
function getCtx() {
|
||||
function getCtx(lang = null) {
|
||||
if (!_canvasContext) {
|
||||
// We don't use an OffscreenCanvas here because we use serif/sans serif
|
||||
// fonts with it and they depend on the locale.
|
||||
|
@ -89,13 +89,13 @@ function cleanupTextLayer() {
|
|||
_canvasContext = null;
|
||||
}
|
||||
|
||||
function getAscent(fontFamily) {
|
||||
function getAscent(fontFamily, lang) {
|
||||
const cachedAscent = ascentCache.get(fontFamily);
|
||||
if (cachedAscent) {
|
||||
return cachedAscent;
|
||||
}
|
||||
|
||||
const ctx = getCtx();
|
||||
const ctx = getCtx(lang);
|
||||
|
||||
const savedFont = ctx.font;
|
||||
ctx.canvas.width = ctx.canvas.height = DEFAULT_FONT_SIZE;
|
||||
|
@ -162,7 +162,7 @@ function getAscent(fontFamily) {
|
|||
return DEFAULT_FONT_ASCENT;
|
||||
}
|
||||
|
||||
function appendText(task, geom) {
|
||||
function appendText(task, geom, lang) {
|
||||
// Initialize all used properties to keep the caches monomorphic.
|
||||
const textDiv = document.createElement("span");
|
||||
const textDivProperties = {
|
||||
|
@ -184,7 +184,7 @@ function appendText(task, geom) {
|
|||
const fontFamily =
|
||||
(task._fontInspectorEnabled && style.fontSubstitution) || style.fontFamily;
|
||||
const fontHeight = Math.hypot(tx[2], tx[3]);
|
||||
const fontAscent = fontHeight * getAscent(fontFamily);
|
||||
const fontAscent = fontHeight * getAscent(fontFamily, lang);
|
||||
|
||||
let left, top;
|
||||
if (angle === 0) {
|
||||
|
@ -324,7 +324,7 @@ class TextLayerRenderTask {
|
|||
div: null,
|
||||
scale: viewport.scale * (globalThis.devicePixelRatio || 1),
|
||||
properties: null,
|
||||
ctx: getCtx(),
|
||||
ctx: null,
|
||||
};
|
||||
this._styleCache = Object.create(null);
|
||||
const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims;
|
||||
|
@ -371,7 +371,11 @@ class TextLayerRenderTask {
|
|||
/**
|
||||
* @private
|
||||
*/
|
||||
_processItems(items) {
|
||||
_processItems(items, lang) {
|
||||
if (!this._layoutTextParams.ctx) {
|
||||
this._textDivProperties.set(this._rootContainer, { lang });
|
||||
this._layoutTextParams.ctx = getCtx(lang);
|
||||
}
|
||||
const textDivs = this._textDivs,
|
||||
textContentItemsStr = this._textContentItemsStr;
|
||||
|
||||
|
@ -403,7 +407,7 @@ class TextLayerRenderTask {
|
|||
continue;
|
||||
}
|
||||
textContentItemsStr.push(item.str);
|
||||
appendText(this, item);
|
||||
appendText(this, item, lang);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -440,7 +444,7 @@ class TextLayerRenderTask {
|
|||
}
|
||||
|
||||
Object.assign(styleCache, value.styles);
|
||||
this._processItems(value.items);
|
||||
this._processItems(value.items, value.lang);
|
||||
pump();
|
||||
}, this._capability.reject);
|
||||
};
|
||||
|
@ -476,7 +480,7 @@ function updateTextLayer({
|
|||
}
|
||||
|
||||
if (mustRescale) {
|
||||
const ctx = getCtx();
|
||||
const ctx = getCtx(textDivProperties.get(container)?.lang);
|
||||
const scale = viewport.scale * (globalThis.devicePixelRatio || 1);
|
||||
const params = {
|
||||
prevFontSize: null,
|
||||
|
|
|
@ -3128,10 +3128,11 @@ describe("api", function () {
|
|||
});
|
||||
|
||||
it("gets text content", async function () {
|
||||
const { items, styles } = await page.getTextContent();
|
||||
const { items, styles, lang } = await page.getTextContent();
|
||||
|
||||
expect(items.length).toEqual(15);
|
||||
expect(objectSize(styles)).toEqual(5);
|
||||
expect(lang).toEqual("en");
|
||||
|
||||
const text = mergeText(items);
|
||||
expect(text).toEqual(`Table Of Content
|
||||
|
@ -3146,13 +3147,14 @@ page 1 / 3`);
|
|||
);
|
||||
const pdfDoc = await loadingTask.promise;
|
||||
const pdfPage = await pdfDoc.getPage(1);
|
||||
const { items, styles } = await pdfPage.getTextContent({
|
||||
const { items, styles, lang } = await pdfPage.getTextContent({
|
||||
disableNormalization: true,
|
||||
});
|
||||
expect(items.length).toEqual(1);
|
||||
// Font name will be a random object id.
|
||||
const fontName = items[0].fontName;
|
||||
expect(Object.keys(styles)).toEqual([fontName]);
|
||||
expect(lang).toEqual(null);
|
||||
|
||||
expect(items[0]).toEqual({
|
||||
dir: "ltr",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue