From 049848ba009014d5056dd79676028cec91bd2a5c Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sun, 21 Apr 2024 11:29:03 +0200 Subject: [PATCH 1/3] Unify the `ReadableStream` and `TextContent` code-paths in `src/display/text_layer.js` The only reason that this code still accepts `TextContent` is for backward-compatibility purposes, so we can simplify the implementation by always using a `ReadableStream` internally. --- src/display/text_layer.js | 83 ++++++++++++++++++------------------ test/driver.js | 14 +++--- test/unit/text_layer_spec.js | 42 ++++++++++++++++++ 3 files changed, 88 insertions(+), 51 deletions(-) diff --git a/src/display/text_layer.js b/src/display/text_layer.js index 5394f0dc2..03a139cdf 100644 --- a/src/display/text_layer.js +++ b/src/display/text_layer.js @@ -250,9 +250,7 @@ function appendText(task, geom, styles) { textDivProperties.canvasWidth = style.vertical ? geom.height : geom.width; } task._textDivProperties.set(textDiv, textDivProperties); - if (task._isReadableStream) { - task._layoutText(textDiv); - } + task._layoutText(textDiv); } function layout(params) { @@ -298,16 +296,14 @@ function render(task) { capability.resolve(); return; } - - if (!task._isReadableStream) { - for (const textDiv of textDivs) { - task._layoutText(textDiv); - } - } capability.resolve(); } class TextLayerRenderTask { + #reader = null; + + #textContentSource = null; + constructor({ textContentSource, container, @@ -316,14 +312,26 @@ class TextLayerRenderTask { textDivProperties, textContentItemsStr, }) { - this._textContentSource = textContentSource; - this._isReadableStream = textContentSource instanceof ReadableStream; + if (textContentSource instanceof ReadableStream) { + this.#textContentSource = textContentSource; + } else if ( + (typeof PDFJSDev === "undefined" || PDFJSDev.test("GENERIC")) && + typeof textContentSource === "object" + ) { + this.#textContentSource = new ReadableStream({ + start(controller) { + controller.enqueue(textContentSource); + controller.close(); + }, + }); + } else { + throw new Error('No "textContentSource" parameter specified.'); + } this._container = this._rootContainer = container; this._textDivs = textDivs || []; this._textContentItemsStr = textContentItemsStr || []; this._fontInspectorEnabled = !!globalThis.FontInspector?.enabled; - this._reader = null; this._textDivProperties = textDivProperties || new WeakMap(); this._canceled = false; this._capability = Promise.withResolvers(); @@ -365,15 +373,14 @@ class TextLayerRenderTask { */ cancel() { this._canceled = true; - if (this._reader) { - this._reader - .cancel(new AbortException("TextLayer task cancelled.")) - .catch(() => { - // Avoid "Uncaught promise" messages in the console. - }); - this._reader = null; - } - this._capability.reject(new AbortException("TextLayer task cancelled.")); + const abortEx = new AbortException("TextLayer task cancelled."); + + this.#reader?.cancel(abortEx).catch(() => { + // Avoid "Uncaught promise" messages in the console. + }); + this.#reader = null; + + this._capability.reject(abortEx); } /** @@ -429,29 +436,21 @@ class TextLayerRenderTask { const { promise, resolve, reject } = Promise.withResolvers(); let styleCache = Object.create(null); - if (this._isReadableStream) { - const pump = () => { - this._reader.read().then(({ value, done }) => { - if (done) { - resolve(); - return; - } + const pump = () => { + this.#reader.read().then(({ value, done }) => { + if (done) { + resolve(); + return; + } - Object.assign(styleCache, value.styles); - this._processItems(value.items, styleCache); - pump(); - }, reject); - }; + Object.assign(styleCache, value.styles); + this._processItems(value.items, styleCache); + pump(); + }, reject); + }; - this._reader = this._textContentSource.getReader(); - pump(); - } else if (this._textContentSource) { - const { items, styles } = this._textContentSource; - this._processItems(items, styles); - resolve(); - } else { - throw new Error('No "textContentSource" parameter specified.'); - } + this.#reader = this.#textContentSource.getReader(); + pump(); promise.then(() => { styleCache = null; diff --git a/test/driver.js b/test/driver.js index e47a48a5d..d0a91ce48 100644 --- a/test/driver.js +++ b/test/driver.js @@ -335,15 +335,11 @@ class Rasterize { await task.promise; - const { _pageWidth, _pageHeight, _textContentSource, _textDivs } = task; + const { _pageWidth, _pageHeight, _textDivs } = task; const boxes = []; - let posRegex; - for ( - let i = 0, j = 0, ii = _textContentSource.items.length; - i < ii; - i++ - ) { - const { width, height, type } = _textContentSource.items[i]; + let j = 0, + posRegex; + for (const { width, height, type } of textContent.items) { if (type) { continue; } @@ -396,7 +392,7 @@ class Rasterize { drawLayer.destroy(); } catch (reason) { - throw new Error(`Rasterize.textLayer: "${reason?.message}".`); + throw new Error(`Rasterize.highlightLayer: "${reason?.message}".`); } } diff --git a/test/unit/text_layer_spec.js b/test/unit/text_layer_spec.js index ca4a400e9..b6d93eab7 100644 --- a/test/unit/text_layer_spec.js +++ b/test/unit/text_layer_spec.js @@ -58,5 +58,47 @@ describe("textLayer", function () { "", "page 1 / 3", ]); + + await loadingTask.destroy(); + }); + + it("creates textLayer from TextContent", async function () { + if (isNodeJS) { + pending("document.createElement is not supported in Node.js."); + } + const loadingTask = getDocument(buildGetDocumentParams("basicapi.pdf")); + const pdfDocument = await loadingTask.promise; + const page = await pdfDocument.getPage(1); + + const textContentItemsStr = []; + + const textLayerRenderTask = renderTextLayer({ + textContentSource: await page.getTextContent(), + container: document.createElement("div"), + viewport: page.getViewport({ scale: 1 }), + textContentItemsStr, + }); + expect(textLayerRenderTask instanceof TextLayerRenderTask).toEqual(true); + + await textLayerRenderTask.promise; + expect(textContentItemsStr).toEqual([ + "Table Of Content", + "", + "Chapter 1", + " ", + "..........................................................", + " ", + "2", + "", + "Paragraph 1.1", + " ", + "......................................................", + " ", + "3", + "", + "page 1 / 3", + ]); + + await loadingTask.destroy(); }); }); From 30840e411eacfeb83b7acdd7787c110ff44b5f8a Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 7 May 2024 10:47:05 +0200 Subject: [PATCH 2/3] Ensure that the textLayer `styleCache` is always cleared, even on failure By also moving it to the `TextLayerRenderTask`-instance, we can avoid a bit of manual parameter passing. --- src/display/text_layer.js | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/display/text_layer.js b/src/display/text_layer.js index 03a139cdf..f302ddb7d 100644 --- a/src/display/text_layer.js +++ b/src/display/text_layer.js @@ -162,7 +162,7 @@ function getAscent(fontFamily) { return DEFAULT_FONT_ASCENT; } -function appendText(task, geom, styles) { +function appendText(task, geom) { // Initialize all used properties to keep the caches monomorphic. const textDiv = document.createElement("span"); const textDivProperties = { @@ -176,7 +176,7 @@ function appendText(task, geom, styles) { const tx = Util.transform(task._transform, geom.transform); let angle = Math.atan2(tx[1], tx[0]); - const style = styles[geom.fontName]; + const style = task._styleCache[geom.fontName]; if (style.vertical) { angle += Math.PI / 2; } @@ -343,6 +343,7 @@ class TextLayerRenderTask { properties: null, ctx: getCtx(), }; + this._styleCache = Object.create(null); const { pageWidth, pageHeight, pageX, pageY } = viewport.rawDims; this._transform = [1, 0, 0, -1, -pageX, pageY + pageHeight]; this._pageWidth = pageWidth; @@ -354,6 +355,7 @@ class TextLayerRenderTask { this._capability.promise .finally(() => { this._layoutTextParams = null; + this._styleCache = null; }) .catch(() => { // Avoid "Uncaught promise" messages in the console. @@ -386,7 +388,7 @@ class TextLayerRenderTask { /** * @private */ - _processItems(items, styleCache) { + _processItems(items) { for (const item of items) { if (item.str === undefined) { if ( @@ -406,7 +408,7 @@ class TextLayerRenderTask { continue; } this._textContentItemsStr.push(item.str); - appendText(this, item, styleCache); + appendText(this, item); } } @@ -434,7 +436,7 @@ class TextLayerRenderTask { */ _render() { const { promise, resolve, reject } = Promise.withResolvers(); - let styleCache = Object.create(null); + const styleCache = this._styleCache; const pump = () => { this.#reader.read().then(({ value, done }) => { @@ -444,7 +446,7 @@ class TextLayerRenderTask { } Object.assign(styleCache, value.styles); - this._processItems(value.items, styleCache); + this._processItems(value.items); pump(); }, reject); }; @@ -453,7 +455,6 @@ class TextLayerRenderTask { pump(); promise.then(() => { - styleCache = null; render(this); }, this._capability.reject); } From 8d86e18a32144180f04a96e209f92b7f628018ee Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 7 May 2024 11:37:07 +0200 Subject: [PATCH 3/3] Restore the `MAX_TEXT_DIVS_TO_RENDER` limit in the textLayer This limit is currently completely non-functional, since the check happens *after* the entire textLayer has been parsed and appended to the DOM. It seems that this has been *accidentally* broken ever since the introduction of `ReadableStream` support. The reason that this hasn't caused noticeable textLayer-related performance issues in practice is probably because we nowadays manage to coalesce the textLayer into fewer overall DOM elements, whereas years ago many PDF documents ended up with one DOM element *per* glyph. By moving this check, and thus restoring the functionality, we're also able to remove the `render` helper function and simplify the code. --- src/display/text_layer.js | 43 +++++++++++++++------------------------ 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/src/display/text_layer.js b/src/display/text_layer.js index f302ddb7d..f36bf62a4 100644 --- a/src/display/text_layer.js +++ b/src/display/text_layer.js @@ -16,7 +16,7 @@ /** @typedef {import("./display_utils").PageViewport} PageViewport */ /** @typedef {import("./api").TextContent} TextContent */ -import { AbortException, Util } from "../shared/util.js"; +import { AbortException, Util, warn } from "../shared/util.js"; import { setLayerDimensions } from "./display_utils.js"; /** @@ -282,23 +282,6 @@ function layout(params) { } } -function render(task) { - if (task._canceled) { - return; - } - const textDivs = task._textDivs; - const capability = task._capability; - const textDivsLength = textDivs.length; - - // No point in rendering many divs as it would make the browser - // unusable even after the divs are rendered. - if (textDivsLength > MAX_TEXT_DIVS_TO_RENDER) { - capability.resolve(); - return; - } - capability.resolve(); -} - class TextLayerRenderTask { #reader = null; @@ -389,7 +372,19 @@ class TextLayerRenderTask { * @private */ _processItems(items) { + const textDivs = this._textDivs, + textContentItemsStr = this._textContentItemsStr; + for (const item of items) { + // No point in rendering many divs as it would make the browser + // unusable even after the divs are rendered. + if (textDivs.length > MAX_TEXT_DIVS_TO_RENDER) { + warn("Ignoring additional textDivs for performance reasons."); + + this._processItems = () => {}; // Avoid multiple warnings for one page. + return; + } + if (item.str === undefined) { if ( item.type === "beginMarkedContentProps" || @@ -407,7 +402,7 @@ class TextLayerRenderTask { } continue; } - this._textContentItemsStr.push(item.str); + textContentItemsStr.push(item.str); appendText(this, item); } } @@ -435,28 +430,22 @@ class TextLayerRenderTask { * @private */ _render() { - const { promise, resolve, reject } = Promise.withResolvers(); const styleCache = this._styleCache; const pump = () => { this.#reader.read().then(({ value, done }) => { if (done) { - resolve(); + this._capability.resolve(); return; } Object.assign(styleCache, value.styles); this._processItems(value.items); pump(); - }, reject); + }, this._capability.reject); }; - this.#reader = this.#textContentSource.getReader(); pump(); - - promise.then(() => { - render(this); - }, this._capability.reject); } }