mirror of
https://github.com/mozilla/pdf.js.git
synced 2025-04-26 01:58:06 +02:00
Text char codes extraction
This commit is contained in:
parent
853f16085f
commit
3b72c6063c
4 changed files with 152 additions and 13 deletions
|
@ -160,6 +160,28 @@ var WorkerMessageHandler = {
|
|||
|
||||
handler.send('font_ready', [objId, obj]);
|
||||
});
|
||||
|
||||
// Handles the 'extract_text' message: walks every page of the document,
// collects each page's extracted text content into an array (one entry per
// page, in page order) and sends the array back in a 'text_extracted'
// message as `{ index: [...] }`.
handler.on('extract_text', function wphExtractText() {
  var numPages = pdfDoc.numPages;
  var index = [];
  // Start the clock once, before the loop, so the logged duration covers the
  // whole run and stays well-defined even when the document has no pages.
  var start = Date.now();

  for (var i = 0; i < numPages; i++) {
    // A failing page keeps an empty entry so that index[i] still lines up
    // with page number i + 1.
    var textContent = '';
    try {
      var page = pdfDoc.getPage(i + 1);
      textContent = page.extractTextContent();
    } catch (e) {
      // Best effort: skip errored pages, but leave a trace of the failure.
      // The error is logged as a string rather than as the error object.
      console.log('text indexing: page %d failed: %s', i + 1, String(e));
    }
    index.push(textContent);
  }

  console.log('text indexing: time=%dms', Date.now() - start);

  handler.send('text_extracted', { index: index });
});
|
||||
}
|
||||
};
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue