mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-26 01:58:06 +02:00
[api-minor] Decode all JPEG images with the built-in PDF.js decoder in src/core/jpg.js
Currently some JPEG images are decoded by the built-in PDF.js decoder in `src/core/jpg.js`, while others attempt to use the browser JPEG decoder. This inconsistency seems unfortunate for a number of reasons:
- It adds, compared to the other image formats supported in the PDF specification, a fair amount of code/complexity to the image handling in the PDF.js library.
- The PDF specification supports JPEG images with features, e.g. certain ColorSpaces, that browsers are unable to decode natively. Hence, determining if a JPEG image is possible to decode natively in the browser requires a non-trivial amount of parsing. In particular, we're parsing (part of) the raw JPEG data to extract certain marker data and we also need to parse the ColorSpace for the JPEG image.
- While some JPEG images may, for all intents and purposes, appear to be natively supported there's still cases where the browser may fail to decode some JPEG images. In order to support those cases, we've had to implement a fallback to the PDF.js JPEG decoder if there's any issues during the native decoding. This also means that it's no longer possible to simply send the JPEG image to the main-thread and continue parsing, but you now need to actually wait for the main-thread to indicate success/failure first.
In practice this means that there's a code-path where the worker-thread is forced to wait for the main-thread, while the reverse should *always* be the case.
- The native decoding, for anything except the *simplest* of JPEG images, results in increased peak memory usage because there's a handful of short-lived copies of the JPEG data (see PR 11707).
Furthermore this also leads to data being *parsed* on the main-thread, rather than the worker-thread, which you usually want to avoid for e.g. performance and UI-responsiveness reasons.
- Not all environments, e.g. Node.js, fully support native JPEG decoding. This has, historically, led to some issues and support requests.
- Different browsers may use different JPEG decoders, possibly leading to images being rendered slightly differently depending on the platform/browser where the PDF.js library is used.
Originally the implementation in `src/core/jpg.js` was unable to handle all of the JPEG images in the test-suite, but over the last couple of years I've fixed (hopefully) all of those issues.
At this point in time, there are two kinds of failures with this patch:
- Changes which are basically imperceptible to the naked eye, where some pixels in the images are essentially off-by-one (in all components), which could probably be attributed to things such as different rounding behaviour in the browser/PDF.js JPEG decoder.
This type of "failure" accounts for the *vast* majority of the total number of changes in the reference tests.
- Changes where the JPEG images now looks *ever so slightly* blurrier than with the native browser decoder. For quite some time I've just assumed that this pointed to a general deficiency in the `src/core/jpg.js` implementation, however I've discovered when comparing two viewers side-by-side that the differences vanish at higher zoom levels (usually around 200% is enough).
Basically if you disable [this downscaling in canvas.js](8fb82e939c/src/display/canvas.js (L2356-L2395)
), which is what happens when zooming in, the differences simply vanish!
Hence I'm pretty satisfied that there's no significant problems with the `src/core/jpg.js` implementation, and the problems are rather tied to the general quality of the downscaling algorithm used. It could even be seen as a positive that *all* images now share the same downscaling behaviour, since this actually fixes one old bug; see issue 7041.
This commit is contained in:
parent
4a3a24b002
commit
0351852d74
15 changed files with 32 additions and 602 deletions
|
@ -13,17 +13,14 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import { createObjectURL, shadow } from "../shared/util.js";
|
||||
import { DecodeStream } from "./stream.js";
|
||||
import { isDict } from "./primitives.js";
|
||||
import { JpegImage } from "./jpg.js";
|
||||
import { shadow } from "../shared/util.js";
|
||||
|
||||
/**
|
||||
* Depending on the type of JPEG a JpegStream is handled in different ways. For
|
||||
* JPEG's that are supported natively such as DeviceGray and DeviceRGB the image
|
||||
* data is stored and then loaded by the browser. For unsupported JPEG's we use
|
||||
* a library to decode these images and the stream behaves like all the other
|
||||
* DecodeStreams.
|
||||
* For JPEG's we use a library to decode these images and the stream behaves
|
||||
* like all the other DecodeStreams.
|
||||
*/
|
||||
const JpegStream = (function JpegStreamClosure() {
|
||||
// eslint-disable-next-line no-shadow
|
||||
|
@ -110,150 +107,6 @@ const JpegStream = (function JpegStreamClosure() {
|
|||
this.eof = true;
|
||||
};
|
||||
|
||||
/**
 * Lazily-computed boolean getter: scans the raw JPEG marker stream for a
 * SOF{n} (Start Of Frame) marker and sanity-checks its dimensions against the
 * image XObject dictionary, to decide whether the image dimensions can be
 * trusted. The stream position is always restored before returning, and the
 * result is cached on the instance via `shadow` (hence `configurable: true`).
 */
Object.defineProperty(JpegStream.prototype, "maybeValidDimensions", {
  get: function JpegStream_maybeValidDimensions() {
    const { dict, stream } = this;
    const dictHeight = dict.get("Height", "H");
    // Remember where we started, so the position can be restored below.
    const startPos = stream.pos;

    let validDimensions = true,
      foundSOF = false,
      b;
    // Walk the JPEG data one byte at a time; markers are 0xFF followed by a
    // marker-type byte. getByte() returning -1 signals end of data.
    while ((b = stream.getByte()) !== -1) {
      if (b !== 0xff) {
        // Not a valid marker.
        continue;
      }
      switch (stream.getByte()) {
        case 0xc0: // SOF0
        case 0xc1: // SOF1
        case 0xc2: // SOF2
          // These three SOF{n} markers are the only ones that the built-in
          // PDF.js JPEG decoder currently supports.
          foundSOF = true;

          stream.pos += 2; // Skip marker length.
          stream.pos += 1; // Skip precision.
          const scanLines = stream.getUint16();
          const samplesPerLine = stream.getUint16();

          // Letting the browser handle the JPEG decoding, on the main-thread,
          // will cause a *large* increase in peak memory usage since there's
          // a handful of short-lived copies of the image data. For very big
          // JPEG images, always let the PDF.js image decoder handle them to
          // reduce overall memory usage during decoding (see issue 11694).
          if (scanLines * samplesPerLine > 1e6) {
            validDimensions = false;
            break;
          }

          // The "normal" case, where the image data and dictionary agrees.
          if (scanLines === dictHeight) {
            break;
          }
          // A DNL (Define Number of Lines) marker is expected,
          // which browsers (usually) cannot decode natively.
          if (scanLines === 0) {
            validDimensions = false;
            break;
          }
          // The dimensions of the image, among other properties, should
          // always be taken from the image data *itself* rather than the
          // XObject dictionary. However there's cases of corrupt images that
          // browsers cannot decode natively, for example:
          //  - JPEG images with DNL markers, where the SOF `scanLines`
          //    parameter has an unexpected value (see issue 8614).
          //  - JPEG images with too large SOF `scanLines` parameter, where
          //    the EOI marker is encountered prematurely (see issue 10880).
          // In an attempt to handle these kinds of corrupt images, compare
          // the dimensions in the image data with the dictionary and *always*
          // let the PDF.js JPEG decoder (rather than the browser) handle the
          // image if the difference is larger than one order of magnitude
          // (since that would generally suggest that something is off).
          if (scanLines > dictHeight * 10) {
            validDimensions = false;
            break;
          }
          break;

        // SOF markers that the built-in decoder does not handle; finding one
        // still ends the scan (validDimensions keeps its current value).
        case 0xc3: // SOF3
        /* falls through */
        case 0xc5: // SOF5
        case 0xc6: // SOF6
        case 0xc7: // SOF7
        /* falls through */
        case 0xc9: // SOF9
        case 0xca: // SOF10
        case 0xcb: // SOF11
        /* falls through */
        case 0xcd: // SOF13
        case 0xce: // SOF14
        case 0xcf: // SOF15
          foundSOF = true;
          break;

        // Length-prefixed marker segments that precede the frame header;
        // skip over their payload to reach the next marker quickly.
        case 0xc4: // DHT
        case 0xcc: // DAC
        /* falls through */
        case 0xda: // SOS
        case 0xdb: // DQT
        case 0xdc: // DNL
        case 0xdd: // DRI
        case 0xde: // DHP
        case 0xdf: // EXP
        /* falls through */
        case 0xe0: // APP0
        case 0xe1: // APP1
        case 0xe2: // APP2
        case 0xe3: // APP3
        case 0xe4: // APP4
        case 0xe5: // APP5
        case 0xe6: // APP6
        case 0xe7: // APP7
        case 0xe8: // APP8
        case 0xe9: // APP9
        case 0xea: // APP10
        case 0xeb: // APP11
        case 0xec: // APP12
        case 0xed: // APP13
        case 0xee: // APP14
        case 0xef: // APP15
        /* falls through */
        case 0xfe: // COM
          const markerLength = stream.getUint16();
          if (markerLength > 2) {
            stream.skip(markerLength - 2); // Jump to the next marker.
          } else {
            // The marker length is invalid, resetting the stream position.
            stream.skip(-2);
          }
          break;

        case 0xff: // Fill byte.
          // Avoid skipping a valid marker, resetting the stream position.
          stream.skip(-1);
          break;

        case 0xd9: // EOI
          // End of image reached without a usable SOF; stop scanning
          // (validDimensions remains true in this case).
          foundSOF = true;
          break;
      }
      if (foundSOF) {
        break;
      }
    }
    // Finally, don't forget to reset the stream position.
    stream.pos = startPos;

    return shadow(this, "maybeValidDimensions", validDimensions);
  },
  configurable: true,
});
|
||||
|
||||
/**
 * Returns the image's IR (internal representation): an object URL wrapping
 * the raw JPEG bytes of this stream.
 * @param {boolean} [forceDataSchema=false] - forwarded to `createObjectURL`;
 *   presumably forces a `data:` URI rather than a blob URL — confirm against
 *   `createObjectURL` in ../shared/util.js.
 * @returns the URL string produced by `createObjectURL`.
 */
JpegStream.prototype.getIR = function (forceDataSchema = false) {
  const contentType = "image/jpeg";
  return createObjectURL(this.bytes, contentType, forceDataSchema);
};
|
||||
|
||||
return JpegStream;
|
||||
})();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue