mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-26 01:58:06 +02:00
Text char codes extraction
This commit is contained in:
parent
853f16085f
commit
3b72c6063c
4 changed files with 152 additions and 13 deletions
50
src/core.js
50
src/core.js
|
@ -200,10 +200,12 @@ var Page = (function PageClosure() {
|
|||
if (isArray(content)) {
|
||||
// fetching items
|
||||
var i, n = content.length;
|
||||
var streams = [];
|
||||
for (i = 0; i < n; ++i)
|
||||
content[i] = xref.fetchIfRef(content[i]);
|
||||
content = new StreamsSequenceStream(content);
|
||||
}
|
||||
streams.push(xref.fetchIfRef(content[i]));
|
||||
content = new StreamsSequenceStream(streams);
|
||||
} else if (isStream(content))
|
||||
content.pos = 0;
|
||||
|
||||
var pe = this.pe = new PartialEvaluator(
|
||||
xref, handler, 'p' + this.pageNumber + '_');
|
||||
|
@ -212,6 +214,36 @@ var Page = (function PageClosure() {
|
|||
dependency));
|
||||
},
|
||||
|
||||
extractTextContent: function pageExtractPageContent() {
|
||||
if ('textContent' in this) {
|
||||
// text content was extracted
|
||||
return this.textContent;
|
||||
}
|
||||
|
||||
var handler = {
|
||||
on: function () {},
|
||||
send: function() {}
|
||||
};
|
||||
|
||||
var xref = this.xref;
|
||||
var content = xref.fetchIfRef(this.content);
|
||||
var resources = xref.fetchIfRef(this.resources);
|
||||
if (isArray(content)) {
|
||||
// fetching items
|
||||
var i, n = content.length;
|
||||
var streams = [];
|
||||
for (i = 0; i < n; ++i)
|
||||
streams.push(xref.fetchIfRef(content[i]));
|
||||
content = new StreamsSequenceStream(streams);
|
||||
} else if (isStream(content))
|
||||
content.pos = 0;
|
||||
|
||||
var pe = new PartialEvaluator(
|
||||
xref, handler, 'p' + this.pageNumber + '_');
|
||||
var text = pe.getTextContent(content, resources);
|
||||
return (this.textContent = text);
|
||||
},
|
||||
|
||||
ensureFonts: function pageEnsureFonts(fonts, callback) {
|
||||
// Convert the font names to the corresponding font obj.
|
||||
for (var i = 0, ii = fonts.length; i < ii; i++) {
|
||||
|
@ -614,6 +646,12 @@ var PDFDoc = (function PDFDocClosure() {
|
|||
throw data.error;
|
||||
}, this);
|
||||
|
||||
// When a 'text_extracted' message arrives, pass its `index` field to the
// optional `textExtracted` callback on this object. Nothing happens if no
// such callback has been set.
// `this` is handed to `on` as the third argument, presumably as the scope
// the callback runs in -- confirm against the message handler.
messageHandler.on('text_extracted', function pdfDocTextExtracted(data) {
  var index = data.index;
  if (this.textExtracted) {
    this.textExtracted(index);
  }
}, this);
|
||||
|
||||
setTimeout(function pdfDocFontReadySetTimeout() {
|
||||
messageHandler.send('doc', this.data);
|
||||
this.workerReadyPromise.resolve(true);
|
||||
|
@ -643,6 +681,12 @@ var PDFDoc = (function PDFDocClosure() {
|
|||
return (this.pageCache[n] = page);
|
||||
},
|
||||
|
||||
extractText: function pdfDocExtractExtractText() {
|
||||
this.workerReadyPromise.then(function pdfDocStartRenderingThen() {
|
||||
this.messageHandler.send('extract_text');
|
||||
}.bind(this));
|
||||
},
|
||||
|
||||
destroy: function pdfDocDestroy() {
|
||||
if (this.worker)
|
||||
this.worker.terminate();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue