1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-22 16:18:08 +02:00

Merge pull request #14703 from calixteman/14627

[text selection] Add the whitespaces present in the pdf in the text chunk
This commit is contained in:
Jonas Jenwald 2022-03-27 15:20:19 +02:00 committed by GitHub
commit 0dd6bc9a85
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 94 additions and 23 deletions

View file

@ -2187,17 +2187,62 @@ class PartialEvaluator {
spaceInFlowMax: 0,
trackingSpaceMin: Infinity,
negativeSpaceMax: -Infinity,
notASpace: -Infinity,
transform: null,
fontName: null,
hasEOL: false,
};
// Use a circular buffer (length === 2) to save the last chars in the
// text stream.
// This implementation of the circular buffer is using a fixed array
// and the position of the next element:
// function addElement(x) {
// buffer[pos] = x;
// pos = (pos + 1) % buffer.length;
// }
// It's way faster than:
// function addElement(x) {
// buffer.push(x);
// buffer.shift();
// }
//
// It's useful to know when we need to add a whitespace in the
// text chunk.
// Two-slot ring buffer holding the last two chars seen in the text stream.
// `twoLastCharsPos` is the index of the slot that will be overwritten next,
// i.e. the slot currently holding the older of the two chars.
const twoLastChars = [" ", " "];
let twoLastCharsPos = 0;

/**
 * Record a char in the two-slot history.
 * @param {string} char - The char to store.
 * @returns {boolean} true when, before storing `char`, the history was a
 *   non-whitespace followed by a whitespace.
 */
function saveLastChar(char) {
  // With only two slots, the "other" slot is always 1 - current.
  const olderIdx = twoLastCharsPos;
  const newerIdx = 1 - olderIdx;
  const wasCharThenSpace =
    twoLastChars[olderIdx] !== " " && twoLastChars[newerIdx] === " ";

  // Overwrite the older entry; the other slot becomes the next one to replace.
  twoLastChars[olderIdx] = char;
  twoLastCharsPos = newerIdx;
  return wasCharThenSpace;
}

/**
 * Forget the saved history: both slots become a space and the write
 * position goes back to the first slot.
 */
function resetLastChars() {
  twoLastChars.fill(" ");
  twoLastCharsPos = 0;
}
// Used in addFakeSpaces.
// A white gap (the advance between two consecutive chars) that is
// <= fontSize * TRACKING_SPACE_FACTOR is a tracking space,
// so it doesn't count as a space.
const TRACKING_SPACE_FACTOR = 0.1;
// When a white gap is <= fontSize * NOT_A_SPACE_FACTOR, there is no space
// even if one is present in the text stream: the gap is thin enough to be
// considered a non-space.
const NOT_A_SPACE_FACTOR = 0.03;
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
// a break (a new chunk of text is created).
// It doesn't change anything when the text is copied but
@ -2299,6 +2344,7 @@ class PartialEvaluator {
textContentItem.trackingSpaceMin =
textState.fontSize * TRACKING_SPACE_FACTOR;
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
textContentItem.negativeSpaceMax =
textState.fontSize * NEGATIVE_SPACE_FACTOR;
textContentItem.spaceInFlowMin =
@ -2483,6 +2529,7 @@ class PartialEvaluator {
return true;
}
resetLastChars();
flushTextContentItem();
return true;
}
@ -2491,6 +2538,13 @@ class PartialEvaluator {
appendEOL();
return true;
}
if (advanceY <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.height += advanceY;
} else if (
@ -2501,6 +2555,7 @@ class PartialEvaluator {
)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
@ -2532,6 +2587,10 @@ class PartialEvaluator {
appendEOL();
return true;
}
// We're moving back so in case the last char was a whitespace
// we cancel it: it doesn't make sense to insert it.
resetLastChars();
flushTextContentItem();
return true;
}
@ -2541,12 +2600,19 @@ class PartialEvaluator {
return true;
}
if (advanceX <= textOrientation * textContentItem.notASpace) {
// The real spacing between 2 consecutive chars is thin enough to be
// considered a non-space.
resetLastChars();
}
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.width += advanceX;
} else if (
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
) {
if (textContentItem.str.length === 0) {
resetLastChars();
textContent.items.push({
str: " ",
dir: "ltr",
@ -2600,14 +2666,7 @@ class PartialEvaluator {
}
let scaledDim = glyphWidth * scale;
if (
glyph.isWhitespace &&
(i === 0 ||
i + 1 === ii ||
glyphs[i - 1].isWhitespace ||
glyphs[i + 1].isWhitespace ||
extraSpacing)
) {
if (glyph.isWhitespace) {
// Don't push a " " in the textContentItem
// (except when it's between two non-spaces chars),
// it will be done (if required) in next call to
@ -2623,6 +2682,7 @@ class PartialEvaluator {
charSpacing += -scaledDim + textState.wordSpacing;
textState.translateTextMatrix(0, -charSpacing);
}
saveLastChar(" ");
continue;
}
@ -2653,17 +2713,18 @@ class PartialEvaluator {
textChunk.prevTransform = getCurrentTextTransform();
}
if (glyph.isWhitespace) {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
if (saveLastChar(glyphUnicode)) {
// The two last chars are a non-whitespace followed by a whitespace
// and then this non-whitespace, so we insert a whitespace here.
// Replaces all whitespaces with standard spaces (0x20), to avoid
// alignment issues between the textLayer and the canvas if the text
// contains e.g. tabs (fixes issue6612.pdf).
textChunk.str.push(" ");
} else {
let glyphUnicode = glyph.unicode;
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
textChunk.str.push(glyphUnicode);
}
textChunk.str.push(glyphUnicode);
if (charSpacing) {
if (!font.vertical) {
@ -2679,6 +2740,7 @@ class PartialEvaluator {
}
function appendEOL() {
resetLastChars();
if (textContentItem.initialized) {
textContentItem.hasEOL = true;
flushTextContentItem();
@ -2701,6 +2763,7 @@ class PartialEvaluator {
width <= textOrientation * textContentItem.spaceInFlowMax
) {
if (textContentItem.initialized) {
resetLastChars();
textContentItem.str.push(" ");
}
return false;
@ -2715,6 +2778,7 @@ class PartialEvaluator {
}
flushTextContentItem();
resetLastChars();
textContent.items.push({
str: " ",
// TODO: check if using the orientation from last chunk is