[api-minor] Add a parameter to PDFPageProxy_getTextContent that enables replacing of all whitespace with standard spaces in the textLayer (issue 6612)

This patch goes a bit further than issue 6612 requires, and replaces all kinds of whitespace with standard spaces. When testing this locally, it actually seemed to slightly improve two existing test-cases (`tracemonkey-text` and `taro-text`). Fixes #6612.
2025-04-22 16:18:08 +02:00 · 2015-11-23 16:57:43 +01:00 · 2015-11-23 16:57:43 +01:00 · 6dfe53b976
commit 6dfe53b976
parent c2dfe9e9a9
12 changed files with 75 additions and 24 deletions
--- a/src/core/core.js
+++ b/src/core/core.js
@ -218,7 +218,8 @@ var Page = (function PageClosure() {
      });
    },

-    extractTextContent: function Page_extractTextContent(task) {
+    extractTextContent: function Page_extractTextContent(task,
+                                                         normalizeWhitespace) {
      var handler = {
        on: function nullHandlerOn() {},
        send: function nullHandlerSend() {}
@ -248,7 +249,9 @@ var Page = (function PageClosure() {

        return partialEvaluator.getTextContent(contentStream,
                                               task,
-                                               self.resources);
+                                               self.resources,
+                                               /* stateManager = */ null,
+                                               normalizeWhitespace);
      });
    },

--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -908,12 +908,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
      });
    },

-    getTextContent: function PartialEvaluator_getTextContent(stream, task,
-                                                             resources,
-                                                             stateManager) {
+    getTextContent:
+        function PartialEvaluator_getTextContent(stream, task, resources,
+                                                 stateManager,
+                                                 normalizeWhitespace) {

      stateManager = (stateManager || new StateManager(new TextState()));

+      var WhitespaceRegexp = /\s/g;
+
      var textContent = {
        items: [],
        styles: Object.create(null)
@ -1027,11 +1030,23 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
        return textContentItem;
      }

+      function replaceWhitespace(str) {
+        // Normalizes every whitespace character to a standard space (0x20),
+        // so that e.g. tabs cannot cause alignment issues between the
+        // textLayer and the canvas (fixes issue6612.pdf).
+        //
+        // Fast path: a character in the range 0x20-0x7F is either already a
+        // standard space or not whitespace at all, hence the regexp is only
+        // run once a character outside of that range has been found.
+        for (var pos = 0, len = str.length; pos < len; pos++) {
+          var charCode = str.charCodeAt(pos);
+          if (charCode < 0x20 || charCode > 0x7F) {
+            return str.replace(WhitespaceRegexp, ' ');
+          }
+        }
+        return str;
+      }
+
      function runBidiTransform(textChunk) {
        var str = textChunk.str.join('');
        var bidiResult = PDFJS.bidi(str, -1, textChunk.vertical);
        return {
-          str: bidiResult.str,
+          str: (normalizeWhitespace ? replaceWhitespace(bidiResult.str) :
+                                      bidiResult.str),
          dir: bidiResult.dir,
          width: textChunk.width,
          height: textChunk.height,
@ -1352,8 +1367,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
              }

              return self.getTextContent(xobj, task,
-                xobj.dict.get('Resources') || resources, stateManager).
-                then(function (formTextContent) {
+                xobj.dict.get('Resources') || resources, stateManager,
+                normalizeWhitespace).then(function (formTextContent) {
                  Util.appendToArray(textContent.items, formTextContent.items);
                  Util.extendObj(textContent.styles, formTextContent.styles);
                  stateManager.restore();
--- a/src/core/worker.js
+++ b/src/core/worker.js
@ -517,12 +517,14 @@ var WorkerMessageHandler = PDFJS.WorkerMessageHandler = {

    handler.on('GetTextContent', function wphExtractText(data) {
      var pageIndex = data.pageIndex;
+      var normalizeWhitespace = data.normalizeWhitespace;
      return pdfManager.getPage(pageIndex).then(function(page) {
        var task = new WorkerTask('GetTextContent: page ' + pageIndex);
        startWorkerTask(task);
        var pageNum = pageIndex + 1;
        var start = Date.now();
-        return page.extractTextContent(task).then(function(textContent) {
+        return page.extractTextContent(task, normalizeWhitespace).then(
+            function(textContent) {
          finishWorkerTask(task);
          info('text indexing: page=' + pageNum + ' - time=' +
               (Date.now() - start) + 'ms');
--- a/src/display/api.js
+++ b/src/display/api.js
@ -708,6 +708,14 @@ var PDFDocumentProxy = (function PDFDocumentProxyClosure() {
  return PDFDocumentProxy;
 })();

+/**
+ * Page getTextContent parameters.
+ *
+ * @typedef {Object} getTextContentParameters
+ * @property {boolean} normalizeWhitespace - replaces all occurrences of
+ *   whitespace with standard spaces (0x20). The default value is `false`.
+ */
+
 /**
 * Page text content.
 *
@ -986,12 +994,16 @@ var PDFPageProxy = (function PDFPageProxyClosure() {
    },

    /**
+     * @param {getTextContentParameters} params - getTextContent parameters.
     * @return {Promise} That is resolved a {@link TextContent}
     * object that represent the page text content.
     */
-    getTextContent: function PDFPageProxy_getTextContent() {
+    getTextContent: function PDFPageProxy_getTextContent(params) {
+      // `params` is optional; a missing object or a missing/falsy option
+      // both result in `false`. The option is coerced to a strict boolean
+      // because it becomes part of the 'GetTextContent' message payload
+      // (presumably posted to the worker -- keep it a plain primitive).
+      var normalizeWhitespace = !!(params && params.normalizeWhitespace);
+
      return this.transport.messageHandler.sendWithPromise('GetTextContent', {
-        pageIndex: this.pageNumber - 1
+        pageIndex: this.pageNumber - 1,
+        normalizeWhitespace: normalizeWhitespace
      });
    },