1
0
Fork 0
mirror of https://github.com/mozilla/pdf.js.git synced 2025-04-26 10:08:06 +02:00

Implement progressive loading of PDFs

This commit is contained in:
Mack Duan 2013-02-06 15:19:29 -08:00
parent added3da8f
commit ef423ef30c
25 changed files with 2110 additions and 586 deletions

View file

@ -17,7 +17,8 @@
/* globals assertWellFormed, bytesToString, CipherTransformFactory, error, info,
InvalidPDFException, isArray, isCmd, isDict, isInt, isName, isRef,
isStream, JpegStream, Lexer, log, Page, Parser, Promise, shadow,
stringToPDFString, stringToUTF8String, warn, isString */
stringToPDFString, stringToUTF8String, warn, isString, assert, PDFJS,
MissingDataException, XRefParseException */
'use strict';
@ -152,9 +153,18 @@ var RefSet = (function RefSetClosure() {
var Catalog = (function CatalogClosure() {
function Catalog(xref) {
this.xref = xref;
var obj = xref.getCatalogObj();
assertWellFormed(isDict(obj), 'catalog object is not a dictionary');
this.catDict = obj;
this.catDict = xref.getCatalogObj();
assertWellFormed(isDict(this.catDict),
'catalog object is not a dictionary');
// Stores state as we traverse the pages catalog so that we can resume
// parsing if an exception is thrown
this.traversePagesQueue = [{
pagesDict: this.toplevelPagesDict,
posInKids: 0
}];
this.pagePromises = [];
this.currPageIndex = 0;
}
Catalog.prototype = {
@ -258,27 +268,6 @@ var Catalog = (function CatalogClosure() {
// shadow the prototype getter
return shadow(this, 'num', obj);
},
traverseKids: function Catalog_traverseKids(pagesDict) {
var pageCache = this.pageCache;
var kids = pagesDict.get('Kids');
assertWellFormed(isArray(kids),
'page dictionary kids object is not an array');
for (var i = 0, ii = kids.length; i < ii; ++i) {
var kid = kids[i];
assertWellFormed(isRef(kid),
'page dictionary kid is not a reference');
var obj = this.xref.fetch(kid);
if (isDict(obj, 'Page') || (isDict(obj) && !obj.has('Kids'))) {
pageCache.push(new Page(this.xref, pageCache.length, obj, kid));
} else { // must be a child page dictionary
assertWellFormed(
isDict(obj),
'page dictionary kid reference points to wrong type of object'
);
this.traverseKids(obj);
}
}
},
get destinations() {
function fetchDestination(dest) {
return isDict(dest) ? dest.get('D') : dest;
@ -346,13 +335,53 @@ var Catalog = (function CatalogClosure() {
}
return shadow(this, 'javaScript', javaScript);
},
getPage: function Catalog_getPage(n) {
var pageCache = this.pageCache;
if (!pageCache) {
pageCache = this.pageCache = [];
this.traverseKids(this.toplevelPagesDict);
getPage: function Catalog_getPage(pageIndex) {
if (!(pageIndex in this.pagePromises)) {
this.pagePromises[pageIndex] = new PDFJS.Promise();
}
return this.pagePromises[pageIndex];
},
// Traverses pages in DFS order so that pages are processed in increasing
// order
traversePages: function Catalog_traversePages() {
var queue = this.traversePagesQueue;
while (queue.length) {
var queueItem = queue[queue.length - 1];
var pagesDict = queueItem.pagesDict;
var kids = pagesDict.get('Kids');
assert(isArray(kids), 'page dictionary kids object is not an array');
if (queueItem.posInKids >= kids.length) {
queue.pop();
continue;
}
var kidRef = kids[queueItem.posInKids];
assert(isRef(kidRef), 'page dictionary kid is not a reference');
var kid = this.xref.fetch(kidRef);
if (isDict(kid, 'Page') || (isDict(kid) && !kid.has('Kids'))) {
var pageIndex = this.currPageIndex++;
var page = new Page(this.xref, pageIndex, kid, kidRef);
if (!(pageIndex in this.pagePromises)) {
this.pagePromises[pageIndex] = new PDFJS.Promise();
}
this.pagePromises[pageIndex].resolve(page);
} else { // must be a child page dictionary
assert(
isDict(kid),
'page dictionary kid reference points to wrong type of object'
);
queue.push({
pagesDict: kid,
posInKids: 0
});
}
++queueItem.posInKids;
}
return this.pageCache[n - 1];
}
};
@ -360,75 +389,60 @@ var Catalog = (function CatalogClosure() {
})();
var XRef = (function XRefClosure() {
function XRef(stream, startXRef, mainXRefEntriesOffset, password) {
function XRef(stream, password) {
this.stream = stream;
this.entries = [];
this.xrefstms = {};
var trailerDict = this.readXRef(startXRef);
trailerDict.assignXref(this);
this.trailer = trailerDict;
// prepare the XRef cache
this.cache = [];
var encrypt = trailerDict.get('Encrypt');
if (encrypt) {
var ids = trailerDict.get('ID');
var fileId = (ids && ids.length) ? ids[0] : '';
this.encrypt = new CipherTransformFactory(encrypt, fileId, password);
}
// get the root dictionary (catalog) object
if (!(this.root = trailerDict.get('Root')))
error('Invalid root reference');
this.password = password;
}
XRef.prototype = {
readXRefTable: function XRef_readXRefTable(parser) {
// Example of cross-reference table:
// xref
// 0 1 <-- subsection header (first obj #, obj count)
// 0000000000 65535 f <-- actual object (offset, generation #, f/n)
// 23 2 <-- subsection header ... and so on ...
// 0000025518 00002 n
// 0000025635 00000 n
// trailer
// ...
setStartXRef: function XRef_setStartXRef(startXRef) {
// Store the starting positions of xref tables as we process them
// so we can recover from missing data errors
this.startXRefQueue = [startXRef];
},
// Outer loop is over subsection headers
var obj;
while (!isCmd(obj = parser.getObj(), 'trailer')) {
var first = obj,
count = parser.getObj();
if (!isInt(first) || !isInt(count))
error('Invalid XRef table: wrong types in subsection header');
// Inner loop is over objects themselves
for (var i = 0; i < count; i++) {
var entry = {};
entry.offset = parser.getObj();
entry.gen = parser.getObj();
var type = parser.getObj();
if (isCmd(type, 'f'))
entry.free = true;
else if (isCmd(type, 'n'))
entry.uncompressed = true;
// Validate entry obj
if (!isInt(entry.offset) || !isInt(entry.gen) ||
!(entry.free || entry.uncompressed)) {
error('Invalid entry in XRef subsection: ' + first + ', ' + count);
}
if (!this.entries[i + first])
this.entries[i + first] = entry;
}
parse: function XRef_parse(recoveryMode) {
var trailerDict;
if (!recoveryMode) {
trailerDict = this.readXRef();
} else {
warn('Indexing all PDF objects');
trailerDict = this.indexObjects();
}
trailerDict.assignXref(this);
this.trailer = trailerDict;
var encrypt = trailerDict.get('Encrypt');
if (encrypt) {
var ids = trailerDict.get('ID');
var fileId = (ids && ids.length) ? ids[0] : '';
this.encrypt = new CipherTransformFactory(
encrypt, fileId, this.password);
}
// Sanity check: as per spec, first object must be free
if (this.entries[0] && !this.entries[0].free)
error('Invalid XRef table: unexpected first object');
// get the root dictionary (catalog) object
if (!(this.root = trailerDict.get('Root'))) {
error('Invalid root reference');
}
},
processXRefTable: function XRef_processXRefTable(parser) {
if (!('tableState' in this)) {
// Stores state of the table as we process it so we can resume
// from middle of table in case of missing data error
this.tableState = {
entryNum: 0,
streamPos: parser.lexer.stream.pos,
parserBuf1: parser.buf1,
parserBuf2: parser.buf2
};
}
var obj = this.readXRefTable(parser);
// Sanity check
if (!isCmd(obj, 'trailer'))
@ -447,27 +461,140 @@ var XRef = (function XRefClosure() {
if (!isDict(dict))
error('Invalid XRef table: could not parse trailer dictionary');
delete this.tableState;
return dict;
},
readXRefTable: function XRef_readXRefTable(parser) {
// Example of cross-reference table:
// xref
// 0 1 <-- subsection header (first obj #, obj count)
// 0000000000 65535 f <-- actual object (offset, generation #, f/n)
// 23 2 <-- subsection header ... and so on ...
// 0000025518 00002 n
// 0000025635 00000 n
// trailer
// ...
var stream = parser.lexer.stream;
var tableState = this.tableState;
stream.pos = tableState.streamPos;
parser.buf1 = tableState.parserBuf1;
parser.buf2 = tableState.parserBuf2;
// Outer loop is over subsection headers
var obj;
while (true) {
if (!('firstEntryNum' in tableState) || !('entryCount' in tableState)) {
if (isCmd(obj = parser.getObj(), 'trailer')) {
break;
}
tableState.firstEntryNum = obj;
tableState.entryCount = parser.getObj();
}
var first = tableState.firstEntryNum;
var count = tableState.entryCount;
if (!isInt(first) || !isInt(count))
error('Invalid XRef table: wrong types in subsection header');
// Inner loop is over objects themselves
for (var i = tableState.entryNum; i < count; i++) {
tableState.streamPos = stream.pos;
tableState.entryNum = i;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
var entry = {};
entry.offset = parser.getObj();
entry.gen = parser.getObj();
var type = parser.getObj();
if (isCmd(type, 'f'))
entry.free = true;
else if (isCmd(type, 'n'))
entry.uncompressed = true;
// Validate entry obj
if (!isInt(entry.offset) || !isInt(entry.gen) ||
!(entry.free || entry.uncompressed)) {
console.log(entry.offset, entry.gen, entry.free,
entry.uncompressed);
error('Invalid entry in XRef subsection: ' + first + ', ' + count);
}
if (!this.entries[i + first])
this.entries[i + first] = entry;
}
tableState.entryNum = 0;
tableState.streamPos = stream.pos;
tableState.parserBuf1 = parser.buf1;
tableState.parserBuf2 = parser.buf2;
delete tableState.firstEntryNum;
delete tableState.entryCount;
}
// Sanity check: as per spec, first object must be free
if (this.entries[0] && !this.entries[0].free)
error('Invalid XRef table: unexpected first object');
return obj;
},
processXRefStream: function XRef_processXRefStream(stream) {
if (!('streamState' in this)) {
// Stores state of the stream as we process it so we can resume
// from middle of stream in case of missing data error
var streamParameters = stream.parameters;
var byteWidths = streamParameters.get('W');
var range = streamParameters.get('Index');
if (!range) {
range = [0, streamParameters.get('Size')];
}
this.streamState = {
entryRanges: range,
byteWidths: byteWidths,
entryNum: 0,
streamPos: stream.pos
};
}
this.readXRefStream(stream);
delete this.streamState;
return stream.parameters;
},
readXRefStream: function XRef_readXRefStream(stream) {
var streamParameters = stream.parameters;
var byteWidths = streamParameters.get('W');
var range = streamParameters.get('Index');
if (!range)
range = [0, streamParameters.get('Size')];
var i, j;
while (range.length > 0) {
var first = range[0], n = range[1];
var streamState = this.streamState;
stream.pos = streamState.streamPos;
var byteWidths = streamState.byteWidths;
var typeFieldWidth = byteWidths[0];
var offsetFieldWidth = byteWidths[1];
var generationFieldWidth = byteWidths[2];
var entryRanges = streamState.entryRanges;
while (entryRanges.length > 0) {
var first = entryRanges[0];
var n = entryRanges[1];
if (!isInt(first) || !isInt(n))
error('Invalid XRef range fields: ' + first + ', ' + n);
var typeFieldWidth = byteWidths[0];
var offsetFieldWidth = byteWidths[1];
var generationFieldWidth = byteWidths[2];
if (!isInt(typeFieldWidth) || !isInt(offsetFieldWidth) ||
!isInt(generationFieldWidth)) {
error('Invalid XRef entry fields length: ' + first + ', ' + n);
}
for (i = 0; i < n; ++i) {
for (i = streamState.entryNum; i < n; ++i) {
streamState.entryNum = i;
streamState.streamPos = stream.pos;
var type = 0, offset = 0, generation = 0;
for (j = 0; j < typeFieldWidth; ++j)
type = (type << 8) | stream.getByte();
@ -496,9 +623,11 @@ var XRef = (function XRefClosure() {
if (!this.entries[first + i])
this.entries[first + i] = entry;
}
range.splice(0, 2);
streamState.entryNum = 0;
streamState.streamPos = stream.pos;
entryRanges.splice(0, 2);
}
return streamParameters;
},
indexObjects: function XRef_indexObjects() {
// Simple scan through the PDF content to find objects,
@ -586,7 +715,8 @@ var XRef = (function XRefClosure() {
}
// reading XRef streams
for (var i = 0, ii = xrefStms.length; i < ii; ++i) {
this.readXRef(xrefStms[i], true);
this.startXRefQueue.push(xrefStms[i]);
this.readXRef(/* recoveryMode */ true);
}
// finding main trailer
var dict;
@ -610,64 +740,84 @@ var XRef = (function XRefClosure() {
// calling error() would reject worker with an UnknownErrorException.
throw new InvalidPDFException('Invalid PDF structure');
},
readXRef: function XRef_readXRef(startXRef, recoveryMode) {
readXRef: function XRef_readXRef(recoveryMode) {
var stream = this.stream;
stream.pos = startXRef;
try {
var parser = new Parser(new Lexer(stream), true, null);
var obj = parser.getObj();
var dict;
while (this.startXRefQueue.length) {
var startXRef = this.startXRefQueue[0];
// Get dictionary
if (isCmd(obj, 'xref')) {
// Parse end-of-file XRef
dict = this.readXRefTable(parser);
stream.pos = startXRef;
// Recursively get other XRefs 'XRefStm', if any
obj = dict.get('XRefStm');
if (isInt(obj)) {
var pos = obj;
// ignore previously loaded xref streams
// (possible infinite recursion)
if (!(pos in this.xrefstms)) {
this.xrefstms[pos] = 1;
this.readXRef(pos);
var parser = new Parser(new Lexer(stream), true, null);
var obj = parser.getObj();
var dict;
// Get dictionary
if (isCmd(obj, 'xref')) {
// Parse end-of-file XRef
dict = this.processXRefTable(parser);
if (!this.topDict) {
this.topDict = dict;
}
// Recursively get other XRefs 'XRefStm', if any
obj = dict.get('XRefStm');
if (isInt(obj)) {
var pos = obj;
// ignore previously loaded xref streams
// (possible infinite recursion)
if (!(pos in this.xrefstms)) {
this.xrefstms[pos] = 1;
this.startXRefQueue.push(pos);
}
}
} else if (isInt(obj)) {
// Parse in-stream XRef
if (!isInt(parser.getObj()) ||
!isCmd(parser.getObj(), 'obj') ||
!isStream(obj = parser.getObj())) {
error('Invalid XRef stream');
}
dict = this.processXRefStream(obj);
if (!this.topDict) {
this.topDict = dict;
}
if (!dict)
error('Failed to read XRef stream');
}
} else if (isInt(obj)) {
// Parse in-stream XRef
if (!isInt(parser.getObj()) ||
!isCmd(parser.getObj(), 'obj') ||
!isStream(obj = parser.getObj())) {
error('Invalid XRef stream');
// Recursively get previous dictionary, if any
obj = dict.get('Prev');
if (isInt(obj)) {
this.startXRefQueue.push(obj);
} else if (isRef(obj)) {
// The spec says Prev must not be a reference, i.e. "/Prev NNN"
// This is a fallback for non-compliant PDFs, i.e. "/Prev NNN 0 R"
this.startXRefQueue.push(obj.num);
}
dict = this.readXRefStream(obj);
if (!dict)
error('Failed to read XRef stream');
this.startXRefQueue.shift();
}
// Recursively get previous dictionary, if any
obj = dict.get('Prev');
if (isInt(obj))
this.readXRef(obj, recoveryMode);
else if (isRef(obj)) {
// The spec says Prev must not be a reference, i.e. "/Prev NNN"
// This is a fallback for non-compliant PDFs, i.e. "/Prev NNN 0 R"
this.readXRef(obj.num, recoveryMode);
}
return dict;
return this.topDict;
} catch (e) {
if (e instanceof MissingDataException) {
throw e;
}
log('(while reading XRef): ' + e);
}
if (recoveryMode)
return;
warn('Indexing all PDF objects');
return this.indexObjects();
throw new XRefParseException();
},
getEntry: function XRef_getEntry(i) {
var e = this.entries[i];
if (e === null)