From 81a1c1cef73ef98041b5ec07ecd3412f0aee4a89 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 20 Sep 2021 14:36:43 +0200 Subject: [PATCH] Correctly validate URLs in XFA documents (bug 1731240) With this patch we'll ensure that only valid absolute URLs can be used in XFA documents, similar to the existing validation done for "regular" PDF documents. Furthermore, we'll also attempt to add a default protocol (i.e. `http`) to URLs beginning with "www." in XFA documents as well; this on its own is enough to fix https://bugzilla.mozilla.org/show_bug.cgi?id=1731240 --- src/core/catalog.js | 27 ++++----------- src/core/core_utils.js | 18 ++++++++++ src/core/xfa/xhtml.js | 16 ++++++++- test/unit/xfa_tohtml_spec.js | 66 ++++++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 21 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index 3ca66bace..10ec83db9 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -13,6 +13,13 @@ * limitations under the License. */ +import { + addDefaultProtocolToUrl, + collectActions, + MissingDataException, + toRomanNumerals, + tryConvertUrlEncoding, +} from "./core_utils.js"; import { clearPrimitiveCaches, Dict, @@ -24,11 +31,6 @@ import { RefSet, RefSetCache, } from "./primitives.js"; -import { - collectActions, - MissingDataException, - toRomanNumerals, -} from "./core_utils.js"; import { createPromiseCapability, createValidAbsoluteUrl, @@ -1283,21 +1285,6 @@ class Catalog { * @param {ParseDestDictionaryParameters} params */ static parseDestDictionary(params) { - // Lets URLs beginning with 'www.' default to using the 'http://' protocol. - function addDefaultProtocolToUrl(url) { - return url.startsWith("www.") ? `http://${url}` : url; - } - - // According to ISO 32000-1:2008, section 12.6.4.7, URIs should be encoded - // in 7-bit ASCII. Some bad PDFs use UTF-8 encoding; see Bugzilla 1122280. - function tryConvertUrlEncoding(url) { - try { - return stringToUTF8String(url); - } catch (e) { - return url; - } - } - const destDict = params.destDict; if (!isDict(destDict)) { warn("parseDestDictionary: `destDict` must be a dictionary."); diff --git a/src/core/core_utils.js b/src/core/core_utils.js index d91455296..93e8c22fb 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -18,6 +18,7 @@ import { BaseException, objectSize, stringToPDFString, + stringToUTF8String, warn, } from "../shared/util.js"; import { Dict, isName, isRef, isStream, RefSet } from "./primitives.js"; @@ -451,7 +452,23 @@ function validateCSSFont(cssFontInfo) { return true; } +// Let URLs beginning with 'www.' default to using the 'http://' protocol. +function addDefaultProtocolToUrl(url) { + return url.startsWith("www.") ? `http://${url}` : url; +} + +// According to ISO 32000-1:2008, section 12.6.4.7, URIs should be encoded +// in 7-bit ASCII. Some bad PDFs use UTF-8 encoding; see Bugzilla 1122280. +function tryConvertUrlEncoding(url) { + try { + return stringToUTF8String(url); + } catch (e) { + return url; + } +} + export { + addDefaultProtocolToUrl, collectActions, encodeToXmlString, escapePDFName, @@ -467,6 +484,7 @@ export { readUint16, readUint32, toRomanNumerals, + tryConvertUrlEncoding, validateCSSFont, XRefEntryException, XRefParseException, diff --git a/src/core/xfa/xhtml.js b/src/core/xfa/xhtml.js index 8f13cf947..51aa7730a 100644 --- a/src/core/xfa/xhtml.js +++ b/src/core/xfa/xhtml.js @@ -29,8 +29,13 @@ import { XmlObject, } from "./xfa_object.js"; import { $buildXFAObject, NamespaceIds } from "./namespaces.js"; +import { + addDefaultProtocolToUrl, + tryConvertUrlEncoding, +} from "../core_utils.js"; import { fixTextIndent, measureToString, setFontFamily } from "./html_utils.js"; import { getMeasurement, HTMLResult, stripQuotes } from "./utils.js"; +import { createValidAbsoluteUrl } from "../../shared/util.js"; const XHTML_NS_ID = NamespaceIds.xhtml.id; @@ -321,7 +326,16 @@ class XhtmlObject extends XmlObject { class A extends XhtmlObject { constructor(attributes) { super(attributes, "a"); - this.href = attributes.href || ""; + let href = ""; + if (typeof attributes.href === "string") { + let url = addDefaultProtocolToUrl(attributes.href); + url = tryConvertUrlEncoding(url); + const absoluteUrl = createValidAbsoluteUrl(url); + if (absoluteUrl) { + href = absoluteUrl.href; + } + } + this.href = href; } } diff --git a/test/unit/xfa_tohtml_spec.js b/test/unit/xfa_tohtml_spec.js index cb96d6469..bf3cd8f19 100644 --- a/test/unit/xfa_tohtml_spec.js +++ b/test/unit/xfa_tohtml_spec.js @@ -522,4 +522,70 @@ describe("XFAFactory", function () { expect(field1).not.toEqual(null); expect(field1.attributes.value).toEqual("123"); }); + + it("should parse URLs correctly", function () { + function getXml(href) { + return ` + + + + + + + + + `; + } + let factory, pages, a; + + // A valid, and complete, URL. + factory = new XFAFactory({ "xdp:xdp": getXml("https://www.example.com/") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("https://www.example.com/"); + expect(a.attributes.href).toEqual("https://www.example.com/"); + + // A valid, but incomplete, URL. + factory = new XFAFactory({ "xdp:xdp": getXml("www.example.com/") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("www.example.com/"); + expect(a.attributes.href).toEqual("http://www.example.com/"); + + // A valid email-address. + factory = new XFAFactory({ "xdp:xdp": getXml("mailto:test@example.com") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("mailto:test@example.com"); + expect(a.attributes.href).toEqual("mailto:test@example.com"); + + // Not a valid URL. + factory = new XFAFactory({ "xdp:xdp": getXml("qwerty/") }); + expect(factory.numberPages).toEqual(1); + pages = factory.getPages(); + a = searchHtmlNode(pages, "name", "a"); + expect(a.value).toEqual("qwerty/"); + expect(a.attributes.href).toEqual(""); + }); });