1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-23 08:38:06 +02:00

Merge pull request #17331 from calixteman/lang_marker

Remove language codes from text strings.
This commit is contained in:
calixteman 2023-11-25 16:43:37 +01:00 committed by GitHub
commit 9d863f5180
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 2 deletions

View file

@ -905,12 +905,21 @@ const PDFStringTranslateTable = [
];
function stringToPDFString(str) {
// See section 7.9.2.2 Text String Type.
// The string can contain some language codes bracketed with 0x1b,
// so we must remove them.
if (str[0] >= "\xEF") {
let encoding;
if (str[0] === "\xFE" && str[1] === "\xFF") {
encoding = "utf-16be";
if (str.length % 2 === 1) {
str = str.slice(0, -1);
}
} else if (str[0] === "\xFF" && str[1] === "\xFE") {
encoding = "utf-16le";
if (str.length % 2 === 1) {
str = str.slice(0, -1);
}
} else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") {
encoding = "utf-8";
}
@ -919,7 +928,11 @@ function stringToPDFString(str) {
try {
const decoder = new TextDecoder(encoding, { fatal: true });
const buffer = stringToBytes(str);
return decoder.decode(buffer);
const decoded = decoder.decode(buffer);
if (!decoded.includes("\x1b")) {
return decoded;
}
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
} catch (ex) {
warn(`stringToPDFString: "${ex}".`);
}
@ -928,7 +941,13 @@ function stringToPDFString(str) {
// ISO Latin 1
const strBuf = [];
for (let i = 0, ii = str.length; i < ii; i++) {
const code = PDFStringTranslateTable[str.charCodeAt(i)];
const charCode = str.charCodeAt(i);
if (charCode === 0x1b) {
// eslint-disable-next-line no-empty
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
continue;
}
const code = PDFStringTranslateTable[charCode];
strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
}
return strBuf.join("");