mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-22 16:18:08 +02:00
Merge pull request #15395 from calixteman/15340
Don't replace a line break with a white space when the last char on the line is an ideographic char
This commit is contained in:
commit
72375ed652
4 changed files with 35 additions and 4 deletions
|
@ -126,7 +126,7 @@ function normalize(text) {
|
|||
} else {
|
||||
// Compile the regular expression for text normalization once.
|
||||
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
||||
const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`;
|
||||
const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
|
||||
|
||||
if (syllablePositions.length === 0) {
|
||||
// Most of the syllables belong to Hangul so there is no need
|
||||
|
@ -188,7 +188,7 @@ function normalize(text) {
|
|||
|
||||
normalized = normalized.replace(
|
||||
normalizationRegex,
|
||||
(match, p1, p2, p3, p4, p5, i) => {
|
||||
(match, p1, p2, p3, p4, p5, p6, i) => {
|
||||
i -= shiftOrigin;
|
||||
if (p1) {
|
||||
// Maybe fractions or quotations mark...
|
||||
|
@ -248,6 +248,15 @@ function normalize(text) {
|
|||
}
|
||||
|
||||
if (p4) {
|
||||
// An ideographic at the end of a line doesn't imply adding an extra
|
||||
// white space.
|
||||
positions.push([i - shift + 1, shift]);
|
||||
shiftOrigin += 1;
|
||||
eol += 1;
|
||||
return p4.charAt(0);
|
||||
}
|
||||
|
||||
if (p5) {
|
||||
// eol is replaced by space: "foo\nbar" is likely equivalent to
|
||||
// "foo bar".
|
||||
positions.push([i - shift + 1, shift - 1]);
|
||||
|
@ -257,7 +266,7 @@ function normalize(text) {
|
|||
return " ";
|
||||
}
|
||||
|
||||
// p5
|
||||
// p6
|
||||
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
|
||||
// A syllable (1 char) is replaced with several chars (n) so
|
||||
// newCharsLen = n - 1.
|
||||
|
@ -269,7 +278,7 @@ function normalize(text) {
|
|||
shift -= newCharLen;
|
||||
shiftOrigin += newCharLen;
|
||||
}
|
||||
return p5;
|
||||
return p6;
|
||||
}
|
||||
);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue