diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 17a73f569..cdac17bf9 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -512,7 +512,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { this.xref, resources, this.pdfFunctionFactory - ) + ) && + image.maybeValidDimensions ) { // These JPEGs don't need any more processing so we can just send it. return this.handler diff --git a/src/core/image_utils.js b/src/core/image_utils.js index d12862ddf..294ec14a2 100644 --- a/src/core/image_utils.js +++ b/src/core/image_utils.js @@ -41,7 +41,8 @@ class NativeImageDecoder { this.xref, this.resources, this.pdfFunctionFactory - ) + ) && + image.maybeValidDimensions ); } diff --git a/src/core/jpeg_stream.js b/src/core/jpeg_stream.js index 582e194f3..068f75e6e 100644 --- a/src/core/jpeg_stream.js +++ b/src/core/jpeg_stream.js @@ -109,6 +109,135 @@ const JpegStream = (function JpegStreamClosure() { this.eof = true; }; + Object.defineProperty(JpegStream.prototype, "maybeValidDimensions", { + get: function JpegStream_maybeValidDimensions() { + const { dict, stream } = this; + const dictHeight = dict.get("Height", "H"); + const startPos = stream.pos; + + let validDimensions = true, + foundSOF = false, + b; + while ((b = stream.getByte()) !== -1) { + if (b !== 0xff) { + // Not a valid marker. + continue; + } + switch (stream.getByte()) { + case 0xc0: // SOF0 + case 0xc1: // SOF1 + case 0xc2: // SOF2 + // These three SOF{n} markers are the only ones that the built-in + // PDF.js JPEG decoder currently supports. + foundSOF = true; + + stream.pos += 2; // Skip marker length. + stream.pos += 1; // Skip precision. + const scanLines = stream.getUint16(); + + // The "normal" case, where the image data and dictionary agree. + if (scanLines === dictHeight) { + break; + } + // A DNL (Define Number of Lines) marker is expected, + // which browsers (usually) cannot decode natively. 
+ if (scanLines === 0) { + validDimensions = false; + break; + } + // The dimensions of the image, among other properties, should + // always be taken from the image data *itself* rather than the + // XObject dictionary. However, there are cases of corrupt images that + // browsers cannot decode natively, for example: + // - JPEG images with DNL markers, where the SOF `scanLines` + // parameter has an unexpected value (see issue 8614). + // - JPEG images with a too large SOF `scanLines` parameter, where + // the EOI marker is encountered prematurely (see issue 10880). + // In an attempt to handle these kinds of corrupt images, compare + // the dimensions in the image data with the dictionary and *always* + // let the PDF.js JPEG decoder (rather than the browser) handle the + // image if the difference is larger than one order of magnitude + // (since that would generally suggest that something is off). + if (scanLines > dictHeight * 10) { + validDimensions = false; + break; + } + break; + + case 0xc3: // SOF3 + /* falls through */ + case 0xc5: // SOF5 + case 0xc6: // SOF6 + case 0xc7: // SOF7 + /* falls through */ + case 0xc9: // SOF9 + case 0xca: // SOF10 + case 0xcb: // SOF11 + /* falls through */ + case 0xcd: // SOF13 + case 0xce: // SOF14 + case 0xcf: // SOF15 + foundSOF = true; + break; + + case 0xc4: // DHT + case 0xcc: // DAC + /* falls through */ + case 0xda: // SOS + case 0xdb: // DQT + case 0xdc: // DNL + case 0xdd: // DRI + case 0xde: // DHP + case 0xdf: // EXP + /* falls through */ + case 0xe0: // APP0 + case 0xe1: // APP1 + case 0xe2: // APP2 + case 0xe3: // APP3 + case 0xe4: // APP4 + case 0xe5: // APP5 + case 0xe6: // APP6 + case 0xe7: // APP7 + case 0xe8: // APP8 + case 0xe9: // APP9 + case 0xea: // APP10 + case 0xeb: // APP11 + case 0xec: // APP12 + case 0xed: // APP13 + case 0xee: // APP14 + case 0xef: // APP15 + /* falls through */ + case 0xfe: // COM + const markerLength = stream.getUint16(); + if (markerLength > 2) { + stream.skip(markerLength - 2); 
// Jump to the next marker. + } else { + // The marker length is invalid, resetting the stream position. + stream.skip(-2); + } + break; + + case 0xff: // Fill byte. + // Avoid skipping a valid marker, resetting the stream position. + stream.skip(-1); + break; + + case 0xd9: // EOI + foundSOF = true; + break; + } + if (foundSOF) { + break; + } + } + // Finally, don't forget to reset the stream position. + stream.pos = startPos; + + return shadow(this, "maybeValidDimensions", validDimensions); + }, + configurable: true, + }); + JpegStream.prototype.getIR = function(forceDataSchema = false) { return createObjectURL(this.bytes, "image/jpeg", forceDataSchema); }; diff --git a/src/core/jpg.js b/src/core/jpg.js index c11321e78..b80739a7d 100644 --- a/src/core/jpg.js +++ b/src/core/jpg.js @@ -148,7 +148,7 @@ var JpegImage = (function JpegImageClosure() { if (bitsData === 0xff) { var nextByte = data[offset++]; if (nextByte) { - if (nextByte === 0xdc && parseDNLMarker) { + if (nextByte === /* DNL = */ 0xdc && parseDNLMarker) { offset += 2; // Skip marker length. const scanLines = readUint16(data, offset); @@ -159,7 +159,22 @@ var JpegImage = (function JpegImageClosure() { scanLines ); } - } else if (nextByte === 0xd9) { + } else if (nextByte === /* EOI = */ 0xd9) { + if (parseDNLMarker) { + // NOTE: only 8-bit JPEG images are supported in this decoder. + const maybeScanLines = blockRow * 8; + // Heuristic to attempt to handle corrupt JPEG images with too + // large `scanLines` parameter, by falling back to the currently + // parsed number of scanLines when it's at least one order of + // magnitude smaller than expected (fixes issue10880.pdf). 
+ if (maybeScanLines > 0 && maybeScanLines < frame.scanLines / 10) { + throw new DNLMarkerError( + "Found EOI marker (0xFFD9) while parsing scan data, " + + "possibly caused by incorrect `scanLines` parameter", + maybeScanLines + ); + } + } throw new EOIMarkerError( "Found EOI marker (0xFFD9) while parsing scan data" ); @@ -337,17 +352,18 @@ var JpegImage = (function JpegImageClosure() { } } + let blockRow = 0; function decodeMcu(component, decode, mcu, row, col) { var mcuRow = (mcu / mcusPerLine) | 0; var mcuCol = mcu % mcusPerLine; - var blockRow = mcuRow * component.v + row; + blockRow = mcuRow * component.v + row; var blockCol = mcuCol * component.h + col; var offset = getBlockBufferOffset(component, blockRow, blockCol); decode(component, offset); } function decodeBlock(component, decode, mcu) { - var blockRow = (mcu / component.blocksPerLine) | 0; + blockRow = (mcu / component.blocksPerLine) | 0; var blockCol = mcu % component.blocksPerLine; var offset = getBlockBufferOffset(component, blockRow, blockCol); decode(component, offset); diff --git a/test/pdfs/issue10880.pdf.link b/test/pdfs/issue10880.pdf.link new file mode 100644 index 000000000..10f4e7b79 --- /dev/null +++ b/test/pdfs/issue10880.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/3247065/B3-T-G5-50.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index 3da6a1a12..b94ab450c 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -3634,6 +3634,15 @@ "lastPage": 1, "type": "eq" }, + { "id": "issue10880", + "file": "pdfs/issue10880.pdf", + "md5": "244ee5ee3ab88db8d8eb51d4416e2c97", + "rounds": 1, + "link": true, + "firstPage": 7, + "lastPage": 7, + "type": "eq" + }, { "id": "issue9650", "file": "pdfs/issue9650.pdf", "md5": "20d50bda6b1080b6d9088811299c791e",