From 502f7cb81b0060f7a7e6afcc938d3692f7ebc2bf Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Fri, 23 Sep 2011 20:50:21 -0500 Subject: [PATCH 1/3] Recovering from misplaced/bad XRef --- pdf.js | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 2 deletions(-) diff --git a/pdf.js b/pdf.js index 6c66b84c4..d70885416 100644 --- a/pdf.js +++ b/pdf.js @@ -3161,6 +3161,110 @@ var XRef = (function xRefXRef() { this.readXRef(prev); return streamParameters; }, + indexObjects: function indexObjects() { + // Simple scan through the PDF content to find objects, + // trailers and XRef streams. + function readToken(data, offset) { + var token = '', ch = data[offset]; + while (ch !== 13 && ch !== 10) { + if (++offset >= data.length) + break; + token += String.fromCharCode(ch); + ch = data[offset]; + } + return token; + } + function skipUntil(data, offset, what) { + var length = what.length, dataLength = data.length; + var bytes = new Uint8Array(length); + var i, skipped = 0; + for (i = 0; i < length; i++) + bytes[i] = what.charCodeAt(i); + // finding byte sequence + while(offset < dataLength) { + var i = 0; + while (i < length && data[offset + i] == bytes[i]) + ++i; + if (i >= length) + break; // sequnce found + + offset++; + skipped++; + } + return skipped; + } + var stream = this.stream; + stream.pos = 0; + var buffer = stream.getBytes(); + var position = 0, length = buffer.length; + var trailers = [], xrefStms = []; + var state = 0; + var currentToken; + while (position < length) { + var ch = buffer[position]; + if (ch === 32 || ch === 9 || ch === 13 || ch === 10) { + ++position; + continue; + } + if (ch === 37) { // %-comment + do { + ++position; + ch = buffer[position]; + } while (ch !== 13 && ch !== 10); + continue; + } + var token = readToken(buffer, position); + var m; + if (token === 'xref') { + position += skipUntil(buffer, position, 'trailer'); + trailers.push(position); + position += skipUntil(buffer, position, 'startxref'); + } else if ((m = /^(\d+)\s+(\d+)\s+obj\b/.exec(token))) { + this.entries[m[1]] = { + offset: position, + gen: m[2] | 0, + uncompressed: true + }; + + var contentLength = skipUntil(buffer, position, 'endobj') + 7; + var content = buffer.subarray(position, position + contentLength); + + // checking XRef stream suspect + // (it shall have '/XRef' and next char is not a letter) + var xrefTagOffset = skipUntil(content, 0, '/XRef'); + if (xrefTagOffset < contentLength && + content[xrefTagOffset + 5] < 64) { + xrefStms.push(position); + this.xrefstms[position] = 1; // don't read it recursively + } + + position += contentLength; + } else + position += token.length + 1; + } + // reading XRef streams + for (var i = 0; i < xrefStms.length; ++i) { + this.readXRef(xrefStms[i]); + } + // finding main trailer + for (var i = 0; i < trailers.length; ++i) { + stream.pos = trailers[i]; + var parser = new Parser(new Lexer(stream), true); + var obj = parser.getObj(); + if (!IsCmd(obj, 'trailer')) + continue; + // read the trailer dictionary + var dict; + if (!IsDict(dict = parser.getObj())) + continue; + // taking the first one with 'ID' + if (dict.has('ID')) + return dict; + } + // nothing helps + error('Invalid PDF structure'); + return null; + }, readXRef: function readXref(startXRef) { var stream = this.stream; stream.pos = startXRef; @@ -3178,8 +3282,7 @@ var XRef = (function xRefXRef() { } return this.readXRefStream(obj); } - error('Invalid XRef'); - return null; + return this.indexObjects(); }, getEntry: function xRefGetEntry(i) { var e = this.entries[i]; From eca6c82bbc338bcae72956069519c44e0ccdf03d Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Fri, 23 Sep 2011 20:59:07 -0500 Subject: [PATCH 2/3] Pattern match optimization --- pdf.js | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pdf.js b/pdf.js index d70885416..0a0bfaf12 100644 --- a/pdf.js +++ b/pdf.js @@ -3176,23 +3176,26 @@ var XRef = (function xRefXRef() { } function skipUntil(data, offset, what) { var length = what.length, dataLength = data.length; - var bytes = new Uint8Array(length); - var i, skipped = 0; - for (i = 0; i < length; i++) - bytes[i] = what.charCodeAt(i); + var skipped = 0; // finding byte sequence - while(offset < dataLength) { + while (offset < dataLength) { var i = 0; - while (i < length && data[offset + i] == bytes[i]) + while (i < length && data[offset + i] == what[i]) ++i; if (i >= length) - break; // sequnce found + break; // sequence found offset++; skipped++; } return skipped; } + var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]); + var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114, + 101, 102]); + var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]); + var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]); + var stream = this.stream; stream.pos = 0; var buffer = stream.getBytes(); @@ -3216,9 +3219,9 @@ var XRef = (function xRefXRef() { var token = readToken(buffer, position); var m; if (token === 'xref') { - position += skipUntil(buffer, position, 'trailer'); + position += skipUntil(buffer, position, trailerBytes); trailers.push(position); - position += skipUntil(buffer, position, 'startxref'); + position += skipUntil(buffer, position, startxrefBytes); } else if ((m = /^(\d+)\s+(\d+)\s+obj\b/.exec(token))) { this.entries[m[1]] = { offset: position, @@ -3226,12 +3229,12 @@ var XRef = (function xRefXRef() { uncompressed: true }; - var contentLength = skipUntil(buffer, position, 'endobj') + 7; + var contentLength = skipUntil(buffer, position, endobjBytes) + 7; var content = buffer.subarray(position, position + contentLength); // checking XRef stream suspect // (it shall have '/XRef' and next char is not a letter) - var xrefTagOffset = skipUntil(content, 0, '/XRef'); + var xrefTagOffset = skipUntil(content, 0, xrefBytes); if (xrefTagOffset < contentLength && content[xrefTagOffset + 5] < 64) { xrefStms.push(position); From 91a5f73708d1c15f50f6b9543ec2ad015828e450 Mon Sep 17 00:00:00 2001 From: notmasteryet Date: Sat, 24 Sep 2011 09:44:50 -0500 Subject: [PATCH 3/3] Implement "skipPages" feature for reftests --- test/driver.js | 7 +++++++ test/pdfs/ibwa-bad.pdf.link | 1 + test/test_manifest.json | 7 +++++++ 3 files changed, 15 insertions(+) create mode 100644 test/pdfs/ibwa-bad.pdf.link diff --git a/test/driver.js b/test/driver.js index 7d6c54509..76093ed98 100644 --- a/test/driver.js +++ b/test/driver.js @@ -107,6 +107,13 @@ function nextPage(task, loadError) { } } + if (task.skipPages && task.skipPages.indexOf(task.pageNum) >= 0) { + log(' skipping page ' + task.pageNum + '/' + task.pdfDoc.numPages + + '... '); + snapshotCurrentPage(task, ''); + return; + } + var page = null; if (!failure) { diff --git a/test/pdfs/ibwa-bad.pdf.link b/test/pdfs/ibwa-bad.pdf.link new file mode 100644 index 000000000..feec66ea2 --- /dev/null +++ b/test/pdfs/ibwa-bad.pdf.link @@ -0,0 +1 @@ +http://www.bottledwater.org/public/pdf/IBWA05ModelCode_Mar2.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index 231857fa8..50f3c4c16 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -151,5 +151,12 @@ "link": true, "rounds": 1, "type": "load" + }, + { "id": "ibwa-bad", + "file": "pdfs/ibwa-bad.pdf", + "link": true, + "rounds": 1, + "skipPages": [ 16 ], + "type": "load" } ]