1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-19 22:58:07 +02:00

Fix autolinking errors due to regex and email validation

Fix some edge cases in the autolinking regex, validate email domains
before creating `mailto:` links, and add unit tests for these cases.

Fixes: https://github.com/mozilla/pdf.js/issues/19462
This commit is contained in:
Ujjwal Sharma 2025-02-10 16:29:43 +01:00
parent a857ca3261
commit 70e2873430
2 changed files with 23 additions and 13 deletions

View file

@ -96,31 +96,37 @@ class Autolinker {
static #regex;
static findLinks(text) {
// Regex can be tested and verified at https://regex101.com/r/zgDwPE/1.
// Regex can be tested and verified at https://regex101.com/r/rXoLiT/2.
this.#regex ??=
/\b(?:https?:\/\/|mailto:|www\.)(?:[[\S--\[]--\p{P}]|\/|[\p{P}--\[]+[[\S--\[]--\p{P}])+|\b[[\S--@]--\{]+@[\S--.]+\.[[\S--\[]--\p{P}]{2,}/gmv;
/\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
const [normalizedText, diffs] = normalize(text);
const matches = normalizedText.matchAll(this.#regex);
const links = [];
for (const match of matches) {
const raw =
match[0].startsWith("www.") ||
match[0].startsWith("mailto:") ||
match[0].startsWith("http://") ||
match[0].startsWith("https://")
? match[0]
: `mailto:${match[0]}`;
const url = createValidAbsoluteUrl(raw, null, {
const [url, emailDomain] = match;
let raw;
if (
url.startsWith("www.") ||
url.startsWith("http://") ||
url.startsWith("https://")
) {
raw = url;
} else if (URL.canParse(`http://${emailDomain}`)) {
raw = url.startsWith("mailto:") ? url : `mailto:${url}`;
} else {
continue;
}
const absoluteURL = createValidAbsoluteUrl(raw, null, {
addDefaultProtocol: true,
});
if (url) {
if (absoluteURL) {
const [index, length] = getOriginalIndex(
diffs,
match.index,
match[0].length
url.length
);
links.push({ url: url.href, index, length });
links.push({ url: absoluteURL.href, index, length });
}
}
return links;