mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
Merge pull request #14703 from calixteman/14627
[text selection] Add the whitespaces present in the pdf in the text chunk
This commit is contained in:
commit
0dd6bc9a85
6 changed files with 94 additions and 23 deletions
|
@ -2187,17 +2187,62 @@ class PartialEvaluator {
|
|||
spaceInFlowMax: 0,
|
||||
trackingSpaceMin: Infinity,
|
||||
negativeSpaceMax: -Infinity,
|
||||
notASpace: -Infinity,
|
||||
transform: null,
|
||||
fontName: null,
|
||||
hasEOL: false,
|
||||
};
|
||||
|
||||
// Use a circular buffer (length === 2) to save the last chars in the
|
||||
// text stream.
|
||||
// This implementation of the circular buffer is using a fixed array
|
||||
// and the position of the next element:
|
||||
// function addElement(x) {
|
||||
// buffer[pos] = x;
|
||||
// pos = (pos + 1) % buffer.length;
|
||||
// }
|
||||
// It's way faster than:
|
||||
// function addElement(x) {
|
||||
// buffer.push(x);
|
||||
// buffer.shift();
|
||||
// }
|
||||
//
|
||||
// It's useful to know when we need to add a whitespace in the
|
||||
// text chunk.
|
||||
// Two-slot circular buffer holding the most recent chars of the text stream.
// `twoLastCharsPos` is the slot the next char will be written to, which is
// also the slot currently holding the older of the two saved chars.
const twoLastChars = [" ", " "];
let twoLastCharsPos = 0;

/**
 * Save the last char.
 * @param {string} char
 * @returns {boolean} true when the two last chars before adding the new one
 *   are a non-whitespace followed by a whitespace.
 */
function saveLastChar(char) {
  const nextPos = (twoLastCharsPos + 1) % 2;
  // Before the write, the slot at `twoLastCharsPos` holds the older char and
  // the slot at `nextPos` holds the most recent one.
  const ret =
    twoLastChars[twoLastCharsPos] !== " " && twoLastChars[nextPos] === " ";
  // Overwrite the older char and make its neighbour the next one to replace.
  twoLastChars[twoLastCharsPos] = char;
  twoLastCharsPos = nextPos;

  return ret;
}

/**
 * Put the buffer back in its initial state: both slots contain a space and
 * the write position is 0, so the next call to `saveLastChar` returns false.
 */
function resetLastChars() {
  twoLastChars[0] = twoLastChars[1] = " ";
  twoLastCharsPos = 0;
}
|
||||
|
||||
// Used in addFakeSpaces.
|
||||
|
||||
// A white <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
|
||||
// so it doesn't count as a space.
|
||||
const TRACKING_SPACE_FACTOR = 0.1;
|
||||
|
||||
// When a white <= fontSize * NOT_A_SPACE_FACTOR, there is no space
|
||||
// even if one is present in the text stream.
|
||||
const NOT_A_SPACE_FACTOR = 0.03;
|
||||
|
||||
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
|
||||
// a break (a new chunk of text is created).
|
||||
// It doesn't change anything when the text is copied but
|
||||
|
@ -2299,6 +2344,7 @@ class PartialEvaluator {
|
|||
|
||||
textContentItem.trackingSpaceMin =
|
||||
textState.fontSize * TRACKING_SPACE_FACTOR;
|
||||
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
|
||||
textContentItem.negativeSpaceMax =
|
||||
textState.fontSize * NEGATIVE_SPACE_FACTOR;
|
||||
textContentItem.spaceInFlowMin =
|
||||
|
@ -2483,6 +2529,7 @@ class PartialEvaluator {
|
|||
return true;
|
||||
}
|
||||
|
||||
resetLastChars();
|
||||
flushTextContentItem();
|
||||
return true;
|
||||
}
|
||||
|
@ -2491,6 +2538,13 @@ class PartialEvaluator {
|
|||
appendEOL();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (advanceY <= textOrientation * textContentItem.notASpace) {
|
||||
// The real spacing between 2 consecutive chars is thin enough to be
|
||||
// considered a non-space.
|
||||
resetLastChars();
|
||||
}
|
||||
|
||||
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||
textContentItem.height += advanceY;
|
||||
} else if (
|
||||
|
@ -2501,6 +2555,7 @@ class PartialEvaluator {
|
|||
)
|
||||
) {
|
||||
if (textContentItem.str.length === 0) {
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
dir: "ltr",
|
||||
|
@ -2532,6 +2587,10 @@ class PartialEvaluator {
|
|||
appendEOL();
|
||||
return true;
|
||||
}
|
||||
|
||||
// We're moving back so in case the last char was a whitespace
|
||||
// we cancel it: it doesn't make sense to insert it.
|
||||
resetLastChars();
|
||||
flushTextContentItem();
|
||||
return true;
|
||||
}
|
||||
|
@ -2541,12 +2600,19 @@ class PartialEvaluator {
|
|||
return true;
|
||||
}
|
||||
|
||||
if (advanceX <= textOrientation * textContentItem.notASpace) {
|
||||
// The real spacing between 2 consecutive chars is thin enough to be
|
||||
// considered a non-space.
|
||||
resetLastChars();
|
||||
}
|
||||
|
||||
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
|
||||
textContentItem.width += advanceX;
|
||||
} else if (
|
||||
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
|
||||
) {
|
||||
if (textContentItem.str.length === 0) {
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
dir: "ltr",
|
||||
|
@ -2600,14 +2666,7 @@ class PartialEvaluator {
|
|||
}
|
||||
let scaledDim = glyphWidth * scale;
|
||||
|
||||
if (
|
||||
glyph.isWhitespace &&
|
||||
(i === 0 ||
|
||||
i + 1 === ii ||
|
||||
glyphs[i - 1].isWhitespace ||
|
||||
glyphs[i + 1].isWhitespace ||
|
||||
extraSpacing)
|
||||
) {
|
||||
if (glyph.isWhitespace) {
|
||||
// Don't push a " " in the textContentItem
|
||||
// (except when it's between two non-spaces chars),
|
||||
// it will be done (if required) in next call to
|
||||
|
@ -2623,6 +2682,7 @@ class PartialEvaluator {
|
|||
charSpacing += -scaledDim + textState.wordSpacing;
|
||||
textState.translateTextMatrix(0, -charSpacing);
|
||||
}
|
||||
saveLastChar(" ");
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -2653,17 +2713,18 @@ class PartialEvaluator {
|
|||
textChunk.prevTransform = getCurrentTextTransform();
|
||||
}
|
||||
|
||||
if (glyph.isWhitespace) {
|
||||
let glyphUnicode = glyph.unicode;
|
||||
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
||||
glyphUnicode = reverseIfRtl(glyphUnicode);
|
||||
if (saveLastChar(glyphUnicode)) {
|
||||
// The two last chars are a non-whitespace followed by a whitespace
|
||||
// and then this non-whitespace, so we insert a whitespace here.
|
||||
// Replaces all whitespaces with standard spaces (0x20), to avoid
|
||||
// alignment issues between the textLayer and the canvas if the text
|
||||
// contains e.g. tabs (fixes issue6612.pdf).
|
||||
textChunk.str.push(" ");
|
||||
} else {
|
||||
let glyphUnicode = glyph.unicode;
|
||||
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
|
||||
glyphUnicode = reverseIfRtl(glyphUnicode);
|
||||
textChunk.str.push(glyphUnicode);
|
||||
}
|
||||
textChunk.str.push(glyphUnicode);
|
||||
|
||||
if (charSpacing) {
|
||||
if (!font.vertical) {
|
||||
|
@ -2679,6 +2740,7 @@ class PartialEvaluator {
|
|||
}
|
||||
|
||||
function appendEOL() {
|
||||
resetLastChars();
|
||||
if (textContentItem.initialized) {
|
||||
textContentItem.hasEOL = true;
|
||||
flushTextContentItem();
|
||||
|
@ -2701,6 +2763,7 @@ class PartialEvaluator {
|
|||
width <= textOrientation * textContentItem.spaceInFlowMax
|
||||
) {
|
||||
if (textContentItem.initialized) {
|
||||
resetLastChars();
|
||||
textContentItem.str.push(" ");
|
||||
}
|
||||
return false;
|
||||
|
@ -2715,6 +2778,7 @@ class PartialEvaluator {
|
|||
}
|
||||
|
||||
flushTextContentItem();
|
||||
resetLastChars();
|
||||
textContent.items.push({
|
||||
str: " ",
|
||||
// TODO: check if using the orientation from last chunk is
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue