diff --git a/src/core/catalog.js b/src/core/catalog.js index 3ca66bace..10ec83db9 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -13,6 +13,13 @@ * limitations under the License. */ +import { + addDefaultProtocolToUrl, + collectActions, + MissingDataException, + toRomanNumerals, + tryConvertUrlEncoding, +} from "./core_utils.js"; import { clearPrimitiveCaches, Dict, @@ -24,11 +31,6 @@ import { RefSet, RefSetCache, } from "./primitives.js"; -import { - collectActions, - MissingDataException, - toRomanNumerals, -} from "./core_utils.js"; import { createPromiseCapability, createValidAbsoluteUrl, @@ -1283,21 +1285,6 @@ class Catalog { * @param {ParseDestDictionaryParameters} params */ static parseDestDictionary(params) { - // Lets URLs beginning with 'www.' default to using the 'http://' protocol. - function addDefaultProtocolToUrl(url) { - return url.startsWith("www.") ? `http://${url}` : url; - } - - // According to ISO 32000-1:2008, section 12.6.4.7, URIs should be encoded - // in 7-bit ASCII. Some bad PDFs use UTF-8 encoding; see Bugzilla 1122280. - function tryConvertUrlEncoding(url) { - try { - return stringToUTF8String(url); - } catch (e) { - return url; - } - } - const destDict = params.destDict; if (!isDict(destDict)) { warn("parseDestDictionary: `destDict` must be a dictionary."); diff --git a/src/core/core_utils.js b/src/core/core_utils.js index d91455296..93e8c22fb 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -18,6 +18,7 @@ import { BaseException, objectSize, stringToPDFString, + stringToUTF8String, warn, } from "../shared/util.js"; import { Dict, isName, isRef, isStream, RefSet } from "./primitives.js"; @@ -451,7 +452,23 @@ function validateCSSFont(cssFontInfo) { return true; } +// Let URLs beginning with 'www.' default to using the 'http://' protocol. +function addDefaultProtocolToUrl(url) { + return url.startsWith("www.") ? `http://${url}` : url; +} + +// According to ISO 32000-1:2008, section 12.6.4.7, URIs should be encoded +// in 7-bit ASCII. Some bad PDFs use UTF-8 encoding; see Bugzilla 1122280. +function tryConvertUrlEncoding(url) { + try { + return stringToUTF8String(url); + } catch (e) { + return url; + } +} + export { + addDefaultProtocolToUrl, collectActions, encodeToXmlString, escapePDFName, @@ -467,6 +484,7 @@ export { readUint16, readUint32, toRomanNumerals, + tryConvertUrlEncoding, validateCSSFont, XRefEntryException, XRefParseException, diff --git a/src/core/xfa/xhtml.js b/src/core/xfa/xhtml.js index 8f13cf947..51aa7730a 100644 --- a/src/core/xfa/xhtml.js +++ b/src/core/xfa/xhtml.js @@ -29,8 +29,13 @@ import { XmlObject, } from "./xfa_object.js"; import { $buildXFAObject, NamespaceIds } from "./namespaces.js"; +import { + addDefaultProtocolToUrl, + tryConvertUrlEncoding, +} from "../core_utils.js"; import { fixTextIndent, measureToString, setFontFamily } from "./html_utils.js"; import { getMeasurement, HTMLResult, stripQuotes } from "./utils.js"; +import { createValidAbsoluteUrl } from "../../shared/util.js"; const XHTML_NS_ID = NamespaceIds.xhtml.id; @@ -321,7 +326,16 @@ class XhtmlObject extends XmlObject { class A extends XhtmlObject { constructor(attributes) { super(attributes, "a"); - this.href = attributes.href || ""; + let href = ""; + if (typeof attributes.href === "string") { + let url = addDefaultProtocolToUrl(attributes.href); + url = tryConvertUrlEncoding(url); + const absoluteUrl = createValidAbsoluteUrl(url); + if (absoluteUrl) { + href = absoluteUrl.href; + } + } + this.href = href; } } diff --git a/test/unit/xfa_tohtml_spec.js b/test/unit/xfa_tohtml_spec.js index cb96d6469..bf3cd8f19 100644 --- a/test/unit/xfa_tohtml_spec.js +++ b/test/unit/xfa_tohtml_spec.js @@ -522,4 +522,70 @@ describe("XFAFactory", function () { expect(field1).not.toEqual(null); expect(field1.attributes.value).toEqual("123"); }); + + it("should parse URLs correctly", function () { + function getXml(href) { + return ` + + + + + + + + + `; + } + let factory, pages, a; + + // A valid, and complete, URL. + factory = new XFAFactory({ "xdp:xdp": getXml("https://www.example.com/") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("https://www.example.com/"); + expect(a.attributes.href).toEqual("https://www.example.com/"); + + // A valid, but incomplete, URL. + factory = new XFAFactory({ "xdp:xdp": getXml("www.example.com/") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("www.example.com/"); + expect(a.attributes.href).toEqual("http://www.example.com/"); + + // A valid email-address. + factory = new XFAFactory({ "xdp:xdp": getXml("mailto:test@example.com") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("mailto:test@example.com"); + expect(a.attributes.href).toEqual("mailto:test@example.com"); + + // Not a valid URL. + factory = new XFAFactory({ "xdp:xdp": getXml("qwerty/") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("qwerty/"); + expect(a.attributes.href).toEqual(""); + }); });