From 1dc54ddb400bc9f72c6e7b39bfd222646a8251f6 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 8 Dec 2017 16:37:12 +0100 Subject: [PATCH] Handle PDF files with missing 'endobj' operators, by searching for the "obj" string rather than "endobj" in `XRef.indexObjects` (issue 9105) This patch refactors the searching for 'endobj', to try and find the next occurrence of "obj" and then check if it was in fact an 'endobj' and continue searching otherwise. This approach is used to avoid having to first find 'endobj', and then re-check the entire contents of the object and having to run (potentially expensive) regular expressions on arbitrary long strings. Fixes 9105. --- src/core/obj.js | 38 +++++++++++++++-- test/pdfs/.gitignore | 1 + test/pdfs/issue9105_reduced.pdf | 74 +++++++++++++++++++++++++++++++++ test/test_manifest.json | 7 ++++ 4 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 test/pdfs/issue9105_reduced.pdf diff --git a/src/core/obj.js b/src/core/obj.js index 6af60f1f3..c7058689e 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -1102,10 +1102,14 @@ var XRef = (function XRefClosure() { return skipped; } var objRegExp = /^(\d+)\s+(\d+)\s+obj\b/; + const endobjRegExp = /\bendobj[\b\s]$/; + const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s])$/; + const CHECK_CONTENT_LENGTH = 25; + var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]); var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114, 101, 102]); - var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]); + const objBytes = new Uint8Array([111, 98, 106]); var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]); // Clear out any existing entries, since they may be bogus. 
@@ -1147,8 +1151,36 @@ var XRef = (function XRefClosure() { uncompressed: true, }; } - var contentLength = skipUntil(buffer, position, endobjBytes) + 7; - var content = buffer.subarray(position, position + contentLength); + let contentLength, startPos = position + token.length; + + // Find the next "obj" string, rather than "endobj", to ensure that + // we won't skip over a new 'obj' operator in corrupt files where + // 'endobj' operators are missing (fixes issue9105_reduced.pdf). + while (startPos < buffer.length) { + let endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4; + contentLength = endPos - position; + + let checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos); + let tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); + + // Check if the current object ends with an 'endobj' operator. + if (endobjRegExp.test(tokenStr)) { + break; + } else { + // Check if an "obj" occurrence is actually a new object, + // i.e. the current object is missing the 'endobj' operator. 
+ let objToken = nestedObjRegExp.exec(tokenStr); + + if (objToken && objToken[1]) { + warn('indexObjects: Found new "obj" inside of another "obj", ' + + 'caused by missing "endobj" -- trying to recover.'); + contentLength -= objToken[1].length; + break; + } + } + startPos += contentLength; + } + let content = buffer.subarray(position, position + contentLength); // checking XRef stream suspect // (it shall have '/XRef' and next char is not a letter) diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 6ec4eeb40..9b2378ecd 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -64,6 +64,7 @@ !issue8798r.pdf !issue8823.pdf !issue9084.pdf +!issue9105_reduced.pdf !bad-PageLabels.pdf !filled-background.pdf !ArabicCIDTrueType.pdf diff --git a/test/pdfs/issue9105_reduced.pdf b/test/pdfs/issue9105_reduced.pdf new file mode 100644 index 000000000..39412e285 --- /dev/null +++ b/test/pdfs/issue9105_reduced.pdf @@ -0,0 +1,74 @@ +%PDF-1.7 +%âãÏÓ +1 0 obj +<< +/Title (Issue 9105) +/Author (Snuffleupagus) +>> +2 0 obj +<< +/Pages 3 0 R +/Type /Catalog +>> +endobj +3 0 obj +<< +/Kids [4 0 R] +/Count 1 +/Type /Pages +>> +endobj +4 0 obj +<< +/Parent 3 0 R +/MediaBox [0 0 200 50] +/Resources +<< +/Font +<< +/F1 5 0 R +>> +>> +/Contents 6 0 R +/Type /Page +>> +endobj +5 0 obj +<< +/BaseFont /Times-Roman +/Subtype /Type1 +/Encoding /WinAnsiEncoding +/Type /Font +>> +endobj +6 0 obj +<< +/Length 41 +>> +stream +BT +10 20 TD +/F1 20 Tf +(Issue 9105) Tj +ET + +endstream +endobj xref +0 7 +0000000000 65535 f +0000000001 00000 n +0000000002 00000 n +0000000003 00000 n +0000000004 00000 n +0000000005 00000 n +0000000006 00000 n +trailer + +<< +/Info 1 0 R +/Root 2 0 R +/Size 7 +>> +startxref +491 +%%EOF diff --git a/test/test_manifest.json b/test/test_manifest.json index 8c8a85eea..f766ccd62 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -741,6 +741,13 @@ "lastPage": 1, "type": "eq" }, + { "id": "issue9105", + "file": "pdfs/issue9105_reduced.pdf", + 
"md5": "f3889f7c7b60e1ab998aac430cc7e08e", + "rounds": 1, + "link": false, + "type": "eq" + }, { "id": "issue6289", "file": "pdfs/issue6289.pdf", "md5": "0869f3d147c734ec484ffd492104095d",