1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-19 14:48:08 +02:00
pdf.js/web/autolinker.js
Jonas Jenwald b85f0903ca Add new bounding-box helpers in Util to reduce code duplication
Currently we have a `Util`-helper for computing the bounding-box of a Bézier curve, however for simple points and rectangles we repeat virtually identical code in many spots throughout the code-base.

 - Introduce new `Util.pointBoundingBox` and `Util.rectBoundingBox` helpers.

 - Remove the "fallback" from `Util.bezierBoundingBox` and only support passing in a `minMax`-array, since there's only a single call-site using the other format and it could be easily updated.
2025-03-23 19:20:02 +01:00

180 lines
5.7 KiB
JavaScript

/* Copyright 2025 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { AnnotationType, createValidAbsoluteUrl, Util } from "pdfjs-lib";
import { getOriginalIndex, normalize } from "./pdf_find_controller.js";
function DOMRectToPDF({ width, height, left, top }, pdfPageView) {
if (width === 0 || height === 0) {
return null;
}
const pageBox = pdfPageView.textLayer.div.getBoundingClientRect();
const bottomLeft = pdfPageView.getPagePoint(
left - pageBox.left,
top - pageBox.top
);
const topRight = pdfPageView.getPagePoint(
left - pageBox.left + width,
top - pageBox.top + height
);
return Util.normalizeRect([
bottomLeft[0],
bottomLeft[1],
topRight[0],
topRight[1],
]);
}
function calculateLinkPosition(range, pdfPageView) {
const rangeRects = range.getClientRects();
if (rangeRects.length === 1) {
return { rect: DOMRectToPDF(rangeRects[0], pdfPageView) };
}
const rect = [Infinity, Infinity, -Infinity, -Infinity];
const quadPoints = [];
let i = 0;
for (const domRect of rangeRects) {
const normalized = DOMRectToPDF(domRect, pdfPageView);
if (normalized === null) {
continue;
}
quadPoints[i] = quadPoints[i + 4] = normalized[0];
quadPoints[i + 1] = quadPoints[i + 3] = normalized[3];
quadPoints[i + 2] = quadPoints[i + 6] = normalized[2];
quadPoints[i + 5] = quadPoints[i + 7] = normalized[1];
Util.rectBoundingBox(...normalized, rect);
i += 8;
}
return { quadPoints, rect };
}
/**
* Given a DOM node `container` and an index into its text contents `offset`,
* returns a pair consisting of text node that the `offset` actually points
* to, together with the offset relative to that text node.
* When the offset points at the boundary between two node, the result will
* point to the first text node in depth-first traversal order.
*
* For example, given this DOM:
* <p>abc<span>def</span>ghi</p>
*
* textPosition(p, 0) -> [#text "abc", 0] (before `a`)
* textPosition(p, 2) -> [#text "abc", 2] (between `b` and `c`)
* textPosition(p, 3) -> [#text "abc", 3] (after `c`)
* textPosition(p, 5) -> [#text "def", 2] (between `e` and `f`)
* textPosition(p, 6) -> [#text "def", 3] (after `f`)
*/
function textPosition(container, offset) {
let currentContainer = container;
do {
if (currentContainer.nodeType === Node.TEXT_NODE) {
const currentLength = currentContainer.textContent.length;
if (offset <= currentLength) {
return [currentContainer, offset];
}
offset -= currentLength;
} else if (currentContainer.firstChild) {
currentContainer = currentContainer.firstChild;
continue;
}
while (!currentContainer.nextSibling && currentContainer !== container) {
currentContainer = currentContainer.parentNode;
}
if (currentContainer !== container) {
currentContainer = currentContainer.nextSibling;
}
} while (currentContainer !== container);
throw new Error("Offset is bigger than container's contents length.");
}
function createLinkAnnotation({ url, index, length }, pdfPageView, id) {
const highlighter = pdfPageView._textHighlighter;
const [{ begin, end }] = highlighter._convertMatches([index], [length]);
const range = new Range();
range.setStart(
...textPosition(highlighter.textDivs[begin.divIdx], begin.offset)
);
range.setEnd(...textPosition(highlighter.textDivs[end.divIdx], end.offset));
return {
id: `inferred_link_${id}`,
unsafeUrl: url,
url,
annotationType: AnnotationType.LINK,
rotation: 0,
...calculateLinkPosition(range, pdfPageView),
// Populated in the annotationLayer to avoid unnecessary object creation,
// since most inferred links overlap existing LinkAnnotations:
borderStyle: null,
};
}
class Autolinker {
static #index = 0;
static #regex;
static findLinks(text) {
// Regex can be tested and verified at https://regex101.com/r/rXoLiT/2.
this.#regex ??=
/\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
const [normalizedText, diffs] = normalize(text);
const matches = normalizedText.matchAll(this.#regex);
const links = [];
for (const match of matches) {
const [url, emailDomain] = match;
let raw;
if (
url.startsWith("www.") ||
url.startsWith("http://") ||
url.startsWith("https://")
) {
raw = url;
} else if (URL.canParse(`http://${emailDomain}`)) {
raw = url.startsWith("mailto:") ? url : `mailto:${url}`;
} else {
continue;
}
const absoluteURL = createValidAbsoluteUrl(raw, null, {
addDefaultProtocol: true,
});
if (absoluteURL) {
const [index, length] = getOriginalIndex(
diffs,
match.index,
url.length
);
links.push({ url: absoluteURL.href, index, length });
}
}
return links;
}
static processLinks(pdfPageView) {
return this.findLinks(
pdfPageView._textHighlighter.textContentItemsStr.join("\n")
).map(link => createLinkAnnotation(link, pdfPageView, this.#index++));
}
}
export { Autolinker };