diff --git a/src/core/parser.js b/src/core/parser.js index f7efc4d2d..a9b826f40 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -609,12 +609,27 @@ class Parser { return imageStream; } - _findStreamLength(startPos, signature) { + #findStreamLength(startPos) { const { stream } = this.lexer; stream.pos = startPos; const SCAN_BLOCK_LENGTH = 2048; - const signatureLength = signature.length; + const signatureLength = "endstream".length; + + const END_SIGNATURE = new Uint8Array([0x65, 0x6e, 0x64]); + const endLength = END_SIGNATURE.length; + + // Ideally we'd directly search for "endstream", however there are corrupt + // PDF documents where the command is incomplete; hence we search for: + // 1. The normal case. + // 2. The misspelled case (fixes issue18122.pdf). + // 3. The truncated case (fixes issue10004.pdf). + const PARTIAL_SIGNATURE = [ + new Uint8Array([0x73, 0x74, 0x72, 0x65, 0x61, 0x6d]), // "stream" + new Uint8Array([0x73, 0x74, 0x65, 0x61, 0x6d]), // "steam", + new Uint8Array([0x73, 0x74, 0x72, 0x65, 0x61]), // "strea" + ]; + const normalLength = signatureLength - endLength; while (stream.pos < stream.end) { const scanBytes = stream.peekBytes(SCAN_BLOCK_LENGTH); @@ -626,13 +641,43 @@ class Parser { let pos = 0; while (pos < scanLength) { let j = 0; - while (j < signatureLength && scanBytes[pos + j] === signature[j]) { + while (j < endLength && scanBytes[pos + j] === END_SIGNATURE[j]) { j++; } - if (j >= signatureLength) { - // `signature` found. - stream.pos += pos; - return stream.pos - startPos; + if (j >= endLength) { + // "end" found, find the complete command. + let found = false; + for (const part of PARTIAL_SIGNATURE) { + const partLen = part.length; + let k = 0; + while (k < partLen && scanBytes[pos + j + k] === part[k]) { + k++; + } + if (k >= normalLength) { + // Found "endstream" command. + found = true; + break; + } + if (k >= partLen) { + // Found "endsteam" or "endstea" command. + // Ensure that the byte immediately following the corrupt + // endstream command is a space, to prevent false positives. + const lastByte = scanBytes[pos + j + k]; + if (isWhiteSpace(lastByte)) { + info( + `Found "${bytesToString([...END_SIGNATURE, ...part])}" when ` + + "searching for endstream command." + ); + found = true; + } + break; + } + } + + if (found) { + stream.pos += pos; + return stream.pos - startPos; + } } pos++; } @@ -665,43 +710,9 @@ class Parser { this.shift(); // 'stream' } else { // Bad stream length, scanning for endstream command. - const ENDSTREAM_SIGNATURE = new Uint8Array([ - 0x65, 0x6e, 0x64, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, - ]); - let actualLength = this._findStreamLength(startPos, ENDSTREAM_SIGNATURE); + const actualLength = this.#findStreamLength(startPos); if (actualLength < 0) { - // Only allow limited truncation of the endstream signature, - // to prevent false positives. - const MAX_TRUNCATION = 1; - // Check if the PDF generator included truncated endstream commands, - // such as e.g. "endstrea" (fixes issue10004.pdf). - for (let i = 1; i <= MAX_TRUNCATION; i++) { - const end = ENDSTREAM_SIGNATURE.length - i; - const TRUNCATED_SIGNATURE = ENDSTREAM_SIGNATURE.slice(0, end); - - const maybeLength = this._findStreamLength( - startPos, - TRUNCATED_SIGNATURE - ); - if (maybeLength >= 0) { - // Ensure that the byte immediately following the truncated - // endstream command is a space, to prevent false positives. - const lastByte = stream.peekBytes(end + 1)[end]; - if (!isWhiteSpace(lastByte)) { - break; - } - info( - `Found "${bytesToString(TRUNCATED_SIGNATURE)}" when ` + - "searching for endstream command." - ); - actualLength = maybeLength; - break; - } - } - - if (actualLength < 0) { - throw new FormatError("Missing endstream command."); - } + throw new FormatError("Missing endstream command."); } length = actualLength; diff --git a/test/pdfs/issue18122.pdf.link b/test/pdfs/issue18122.pdf.link new file mode 100644 index 000000000..875c8225b --- /dev/null +++ b/test/pdfs/issue18122.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/15385405/Factuur_1105490.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index 74bc215d9..49ed28779 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -5420,6 +5420,15 @@ "rounds": 1, "type": "text" }, + { + "id": "issue18122", + "file": "pdfs/issue18122.pdf", + "md5": "b9be483e6bc0f35994f536d8f6a685c1", + "rounds": 1, + "link": true, + "lastPage": 1, + "type": "eq" + }, { "id": "issue4890", "file": "pdfs/issue4890.pdf",