mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-23 08:38:06 +02:00
Merge pull request #17331 from calixteman/lang_marker
Remove language codes from text strings.
This commit is contained in:
commit
9d863f5180
2 changed files with 47 additions and 2 deletions
|
@ -905,12 +905,21 @@ const PDFStringTranslateTable = [
|
|||
];
|
||||
|
||||
function stringToPDFString(str) {
|
||||
// See section 7.9.2.2 Text String Type.
|
||||
// The string can contain some language codes bracketed with 0x1b,
|
||||
// so we must remove them.
|
||||
if (str[0] >= "\xEF") {
|
||||
let encoding;
|
||||
if (str[0] === "\xFE" && str[1] === "\xFF") {
|
||||
encoding = "utf-16be";
|
||||
if (str.length % 2 === 1) {
|
||||
str = str.slice(0, -1);
|
||||
}
|
||||
} else if (str[0] === "\xFF" && str[1] === "\xFE") {
|
||||
encoding = "utf-16le";
|
||||
if (str.length % 2 === 1) {
|
||||
str = str.slice(0, -1);
|
||||
}
|
||||
} else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") {
|
||||
encoding = "utf-8";
|
||||
}
|
||||
|
@ -919,7 +928,11 @@ function stringToPDFString(str) {
|
|||
try {
|
||||
const decoder = new TextDecoder(encoding, { fatal: true });
|
||||
const buffer = stringToBytes(str);
|
||||
return decoder.decode(buffer);
|
||||
const decoded = decoder.decode(buffer);
|
||||
if (!decoded.includes("\x1b")) {
|
||||
return decoded;
|
||||
}
|
||||
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
|
||||
} catch (ex) {
|
||||
warn(`stringToPDFString: "${ex}".`);
|
||||
}
|
||||
|
@ -928,7 +941,13 @@ function stringToPDFString(str) {
|
|||
// ISO Latin 1
|
||||
const strBuf = [];
|
||||
for (let i = 0, ii = str.length; i < ii; i++) {
|
||||
const code = PDFStringTranslateTable[str.charCodeAt(i)];
|
||||
const charCode = str.charCodeAt(i);
|
||||
if (charCode === 0x1b) {
|
||||
// eslint-disable-next-line no-empty
|
||||
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
|
||||
continue;
|
||||
}
|
||||
const code = PDFStringTranslateTable[charCode];
|
||||
strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
|
||||
}
|
||||
return strBuf.join("");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue