1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-25 09:38:06 +02:00

Fix issues in text selection

- PR #13257 fixed a lot of issues, but not all of them; this patch aims to fix almost all of the remaining ones.
  - the idea in this new patch is to compare the position of a new glyph with the last position where a glyph was drawn;
    - spaces are never "drawn": a space just moves the cursor and isn't added to the chunk;
    - this way, a space followed by a cursor move can be treated as a single space, which helps to merge all consecutive spaces into one.
  - to distinguish real spaces from tracking ones, we previously used a factor of the space width (from the font)
    - it was a pretty good idea in general, but it fails with some fonts where the space is too big;
    - Poppler uses a factor of the font size instead, which is an excellent idea (a gap <= 0.1 * fontSize is treated as a tracking space).
This commit is contained in:
Calixte Denizet 2021-05-23 19:03:53 +02:00
parent f5b79be0b7
commit 61d1063276
10 changed files with 361 additions and 143 deletions

View file

@ -2169,6 +2169,8 @@ class PartialEvaluator {
stateManager = stateManager || new StateManager(new TextState());
const WhitespaceRegexp = /\s/g;
const DiacriticRegExp = new RegExp("^\\p{Mn}$", "u");
const NormalizedUnicodes = getNormalizedUnicodes();
const textContent = {
items: [],
@ -2182,34 +2184,37 @@ class PartialEvaluator {
width: 0,
height: 0,
vertical: false,
lastCharSize: 0,
prevTransform: null,
textAdvanceScale: 0,
spaceWidth: 0,
spaceInFlowMin: 0,
spaceInFlowMax: 0,
trackingSpaceMin: Infinity,
negativeSpaceMax: -Infinity,
transform: null,
fontName: null,
hasEOL: false,
isLastCharWhiteSpace: false,
};
// Used in addFakeSpaces.
// wsw stands for whitespace width.
// A white <= wsw * TRACKING_SPACE_FACTOR is a tracking space
// A white <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
// so it doesn't count as a space.
const TRACKING_SPACE_FACTOR = 0.3;
const TRACKING_SPACE_FACTOR = 0.1;
// A white with a width in [wsw * MIN_FACTOR; wsw * MAX_FACTOR]
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
// a break (a new chunk of text is created).
// It doesn't change anything when the text is copied, but
// it reduces potential mismatches between the text layer and the canvas.
const NEGATIVE_SPACE_FACTOR = -0.2;
// A white with a width in [fontSize * MIN_FACTOR; fontSize * MAX_FACTOR]
// is a space which will be inserted in the current flow of words.
// If the width is outside of this range then the flow is broken
// (which means a new span in the text layer).
// It's useful for matching the span in the layer as closely as possible
// to what is displayed in the canvas.
const SPACE_IN_FLOW_MIN_FACTOR = 0.3;
const SPACE_IN_FLOW_MAX_FACTOR = 1.3;
const SPACE_IN_FLOW_MIN_FACTOR = 0.1;
const SPACE_IN_FLOW_MAX_FACTOR = 0.6;
const self = this;
const xref = this.xref;
@ -2294,18 +2299,15 @@ class PartialEvaluator {
);
const scaleCtmX = Math.hypot(textState.ctm[0], textState.ctm[1]);
textContentItem.textAdvanceScale = scaleCtmX * scaleLineX;
textContentItem.lastCharSize = textContentItem.lastCharSize || 0;
const spaceWidth = (font.spaceWidth / 1000) * textState.fontSize;
if (spaceWidth) {
textContentItem.spaceWidth = spaceWidth;
textContentItem.trackingSpaceMin = spaceWidth * TRACKING_SPACE_FACTOR;
textContentItem.spaceInFlowMin = spaceWidth * SPACE_IN_FLOW_MIN_FACTOR;
textContentItem.spaceInFlowMax = spaceWidth * SPACE_IN_FLOW_MAX_FACTOR;
} else {
textContentItem.spaceWidth = 0;
textContentItem.trackingSpaceMin = Infinity;
}
textContentItem.trackingSpaceMin =
textState.fontSize * TRACKING_SPACE_FACTOR;
textContentItem.negativeSpaceMax =
textState.fontSize * NEGATIVE_SPACE_FACTOR;
textContentItem.spaceInFlowMin =
textState.fontSize * SPACE_IN_FLOW_MIN_FACTOR;
textContentItem.spaceInFlowMax =
textState.fontSize * SPACE_IN_FLOW_MAX_FACTOR;
textContentItem.hasEOL = false;
@ -2395,7 +2397,7 @@ class PartialEvaluator {
});
}
function compareWithLastPosition(fontSize) {
function compareWithLastPosition() {
if (
!combineTextItems ||
!textState.font ||
@ -2405,36 +2407,76 @@ class PartialEvaluator {
}
const currentTransform = getCurrentTextTransform();
const posX = currentTransform[4];
const posY = currentTransform[5];
const lastPosX = textContentItem.prevTransform[4];
const lastPosY = textContentItem.prevTransform[5];
let posX = currentTransform[4];
let posY = currentTransform[5];
let lastPosX = textContentItem.prevTransform[4];
let lastPosY = textContentItem.prevTransform[5];
if (lastPosX === posX && lastPosY === posY) {
return;
}
const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale;
const advanceY = (posY - lastPosY) / textContentItem.textAdvanceScale;
const HALF_LAST_CHAR = -0.5 * textContentItem.lastCharSize;
let rotate = 0;
// Take into account the rotation in the current transform.
// Only rotations with an angle of 0, 90, 180 or 270 are considered.
if (
currentTransform[0] &&
currentTransform[1] === 0 &&
currentTransform[2] === 0
) {
rotate = currentTransform[0] > 0 ? 0 : 180;
} else if (
currentTransform[1] &&
currentTransform[0] === 0 &&
currentTransform[3] === 0
) {
rotate += currentTransform[1] > 0 ? 90 : 270;
}
if (rotate !== 0) {
switch (rotate) {
case 90:
[posX, posY] = [posY, posX];
[lastPosX, lastPosY] = [lastPosY, lastPosX];
break;
case 180:
[posX, posY, lastPosX, lastPosY] = [
-posX,
-posY,
-lastPosX,
-lastPosY,
];
break;
case 270:
[posX, posY] = [-posY, -posX];
[lastPosX, lastPosY] = [-lastPosY, -lastPosX];
break;
}
}
if (textState.font.vertical) {
if (
Math.abs(advanceX) >
textContentItem.width /
textContentItem.textAdvanceScale /* not the same column */
) {
const advanceY = (lastPosY - posY) / textContentItem.textAdvanceScale;
const advanceX = posX - lastPosX;
if (advanceY < textContentItem.negativeSpaceMax) {
if (
Math.abs(advanceX) >
0.5 * textContentItem.width /* not the same column */
) {
appendEOL();
return;
}
flushTextContentItem();
return;
}
if (Math.abs(advanceX) > textContentItem.height) {
appendEOL();
return;
}
if (HALF_LAST_CHAR > advanceY) {
return;
}
if (advanceY > textContentItem.trackingSpaceMin) {
if (advanceY <= textContentItem.trackingSpaceMin) {
textContentItem.height += advanceY;
} else if (!addFakeSpaces(advanceY, 0, textContentItem.prevTransform)) {
} else if (!addFakeSpaces(advanceY, textContentItem.prevTransform)) {
if (textContentItem.str.length === 0) {
textContent.items.push({
str: " ",
@ -2445,7 +2487,6 @@ class PartialEvaluator {
fontName: textContentItem.fontName,
hasEOL: false,
});
textContentItem.isLastCharWhiteSpace = true;
} else {
textContentItem.height += advanceY;
}
@ -2454,22 +2495,28 @@ class PartialEvaluator {
return;
}
if (
Math.abs(advanceY) >
textContentItem.height /
textContentItem.textAdvanceScale /* not the same line */
) {
appendEOL();
const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale;
const advanceY = posY - lastPosY;
if (advanceX < textContentItem.negativeSpaceMax) {
if (
Math.abs(advanceY) >
0.5 * textContentItem.height /* not the same line */
) {
appendEOL();
return;
}
flushTextContentItem();
return;
}
if (HALF_LAST_CHAR > advanceX) {
if (Math.abs(advanceY) > textContentItem.height) {
appendEOL();
return;
}
if (advanceX <= textContentItem.trackingSpaceMin) {
textContentItem.width += advanceX;
} else if (!addFakeSpaces(advanceX, 0, textContentItem.prevTransform)) {
} else if (!addFakeSpaces(advanceX, textContentItem.prevTransform)) {
if (textContentItem.str.length === 0) {
textContent.items.push({
str: " ",
@ -2480,14 +2527,13 @@ class PartialEvaluator {
fontName: textContentItem.fontName,
hasEOL: false,
});
textContentItem.isLastCharWhiteSpace = true;
} else {
textContentItem.width += advanceX;
}
}
}
function buildTextContentItem({ chars, extraSpacing, isFirstChunk }) {
function buildTextContentItem({ chars, extraSpacing }) {
const font = textState.font;
if (!chars) {
// Just move according to the space we have.
@ -2499,87 +2545,91 @@ class PartialEvaluator {
0
);
} else {
textState.translateTextMatrix(0, charSpacing);
textState.translateTextMatrix(0, -charSpacing);
}
}
return;
}
const NormalizedUnicodes = getNormalizedUnicodes();
const glyphs = font.charsToGlyphs(chars);
const scale = textState.fontMatrix[0] * textState.fontSize;
if (isFirstChunk) {
compareWithLastPosition(scale);
}
let textChunk = ensureTextContentItem();
let size = 0;
let lastCharSize = 0;
for (let i = 0, ii = glyphs.length; i < ii; i++) {
const glyph = glyphs[i];
let charSpacing =
textState.charSpacing + (i === ii - 1 ? extraSpacing : 0);
textState.charSpacing + (i + 1 === ii ? extraSpacing : 0);
let glyphWidth = glyph.width;
if (font.vertical) {
glyphWidth = glyph.vmetric ? glyph.vmetric[0] : -glyphWidth;
}
let scaledDim = glyphWidth * scale;
let glyphUnicode = glyph.unicode;
if (glyph.isSpace) {
charSpacing += textState.wordSpacing;
textChunk.isLastCharWhiteSpace = true;
} else {
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
textChunk.isLastCharWhiteSpace = false;
if (
glyphUnicode === " " &&
(i === 0 ||
i + 1 === ii ||
glyphs[i - 1].unicode === " " ||
glyphs[i + 1].unicode === " ")
) {
// Don't push a " " in the textContentItem
// (except when it's between two non-spaces chars),
// it will be done (if required) in next call to
// compareWithLastPosition.
// This way we can merge real spaces and spaces due to cursor moves.
if (!font.vertical) {
charSpacing += scaledDim + textState.wordSpacing;
textState.translateTextMatrix(
charSpacing * textState.textHScale,
0
);
} else {
charSpacing += -scaledDim + textState.wordSpacing;
textState.translateTextMatrix(0, -charSpacing);
}
continue;
}
textChunk.str.push(glyphUnicode);
const glyphWidth =
font.vertical && glyph.vmetric ? glyph.vmetric[0] : glyph.width;
compareWithLastPosition();
// Must be called after compareWithLastPosition because
// the textContentItem could have been flushed.
const textChunk = ensureTextContentItem();
if (DiacriticRegExp.test(glyph.unicode)) {
scaledDim = 0;
}
let scaledDim = glyphWidth * scale;
if (!font.vertical) {
scaledDim *= textState.textHScale;
textState.translateTextMatrix(scaledDim, 0);
textChunk.width += scaledDim;
} else {
textState.translateTextMatrix(0, scaledDim);
scaledDim = Math.abs(scaledDim);
textChunk.height += scaledDim;
}
size += scaledDim;
if (scaledDim) {
// Save the position of the last visible character.
textChunk.prevTransform = getCurrentTextTransform();
}
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
glyphUnicode = reverseIfRtl(glyphUnicode);
textChunk.str.push(glyphUnicode);
if (charSpacing) {
if (!font.vertical) {
charSpacing *= textState.textHScale;
}
scaledDim += charSpacing;
const wasSplit =
charSpacing > textContentItem.trackingSpaceMin &&
addFakeSpaces(charSpacing, size);
if (!font.vertical) {
textState.translateTextMatrix(charSpacing, 0);
textState.translateTextMatrix(
charSpacing * textState.textHScale,
0
);
} else {
textState.translateTextMatrix(0, charSpacing);
}
if (wasSplit) {
textChunk = ensureTextContentItem();
size = 0;
} else {
size += charSpacing;
textState.translateTextMatrix(0, -charSpacing);
}
}
lastCharSize = scaledDim;
}
textChunk.lastCharSize = lastCharSize;
if (!font.vertical) {
textChunk.width += size;
} else {
textChunk.height += size;
}
textChunk.prevTransform = getCurrentTextTransform();
}
function appendEOL() {
@ -2597,19 +2647,15 @@ class PartialEvaluator {
hasEOL: true,
});
}
textContentItem.isLastCharWhiteSpace = false;
textContentItem.lastCharSize = 0;
}
function addFakeSpaces(width, size, transf = null) {
function addFakeSpaces(width, transf) {
if (
textContentItem.spaceInFlowMin <= width &&
width <= textContentItem.spaceInFlowMax
) {
if (textContentItem.initialized) {
textContentItem.str.push(" ");
textContentItem.isLastCharWhiteSpace = true;
}
return false;
}
@ -2617,22 +2663,12 @@ class PartialEvaluator {
const fontName = textContentItem.fontName;
let height = 0;
width *= textContentItem.textAdvanceScale;
if (!textContentItem.vertical) {
textContentItem.width += size;
} else {
textContentItem.height += size;
if (textContentItem.vertical) {
height = width;
width = 0;
}
flushTextContentItem();
if (textContentItem.isLastCharWhiteSpace) {
return true;
}
textContentItem.isLastCharWhiteSpace = true;
textContent.items.push({
str: " ",
// TODO: check if using the orientation from last chunk is
@ -2640,7 +2676,7 @@ class PartialEvaluator {
dir: "ltr",
width,
height,
transform: transf ? transf : getCurrentTextTransform(),
transform: transf || getCurrentTextTransform(),
fontName,
hasEOL: false,
});
@ -2731,15 +2767,12 @@ class PartialEvaluator {
next(handleSetFont(fontNameArg, null));
return;
case OPS.setTextRise:
flushTextContentItem();
textState.textRise = args[0];
break;
case OPS.setHScale:
flushTextContentItem();
textState.textHScale = args[0] / 100;
break;
case OPS.setLeading:
flushTextContentItem();
textState.leading = args[0];
break;
case OPS.moveText:
@ -2747,13 +2780,11 @@ class PartialEvaluator {
textState.textMatrix = textState.textLineMatrix.slice();
break;
case OPS.setLeadingMoveText:
flushTextContentItem();
textState.leading = -args[1];
textState.translateTextLineMatrix(args[0], args[1]);
textState.textMatrix = textState.textLineMatrix.slice();
break;
case OPS.nextLine:
appendEOL();
textState.carriageReturn();
break;
case OPS.setTextMatrix:
@ -2782,7 +2813,6 @@ class PartialEvaluator {
textState.wordSpacing = args[0];
break;
case OPS.beginText:
flushTextContentItem();
textState.textMatrix = IDENTITY_MATRIX.slice();
textState.textLineMatrix = IDENTITY_MATRIX.slice();
break;
@ -2795,7 +2825,6 @@ class PartialEvaluator {
const spaceFactor =
((textState.font.vertical ? 1 : -1) * textState.fontSize) / 1000;
const elements = args[0];
let isFirstChunk = true;
for (let i = 0, ii = elements.length; i < ii - 1; i++) {
const item = elements[i];
if (typeof item === "string") {
@ -2814,11 +2843,7 @@ class PartialEvaluator {
buildTextContentItem({
chars: str,
extraSpacing: item * spaceFactor,
isFirstChunk,
});
if (str && isFirstChunk) {
isFirstChunk = false;
}
}
}
@ -2833,7 +2858,6 @@ class PartialEvaluator {
buildTextContentItem({
chars: str,
extraSpacing: 0,
isFirstChunk,
});
}
break;
@ -2842,11 +2866,9 @@ class PartialEvaluator {
self.ensureStateFont(stateManager.state);
continue;
}
buildTextContentItem({
chars: args[0],
extraSpacing: 0,
isFirstChunk: true,
});
break;
case OPS.nextLineShowText:
@ -2854,13 +2876,10 @@ class PartialEvaluator {
self.ensureStateFont(stateManager.state);
continue;
}
textContentItem.hasEOL = true;
flushTextContentItem();
textState.carriageReturn();
buildTextContentItem({
chars: args[0],
extraSpacing: 0,
isFirstChunk: true,
});
break;
case OPS.nextLineSetSpacingShowText:
@ -2868,15 +2887,12 @@ class PartialEvaluator {
self.ensureStateFont(stateManager.state);
continue;
}
textContentItem.hasEOL = true;
flushTextContentItem();
textState.wordSpacing = args[0];
textState.charSpacing = args[1];
textState.carriageReturn();
buildTextContentItem({
chars: args[2],
extraSpacing: 0,
isFirstChunk: true,
});
break;
case OPS.paintXObject: