PDFParser - return at least the first 3 pages of a PDF

Fix for PDF parsing returning no parsed text when the parsing thread is
interrupted by the timeout.
pull/1/head
reger 13 years ago committed by Michael Peter Christen
parent d2ca33ccd7
commit 32104360ce

@ -127,22 +127,31 @@ public class pdfParser extends AbstractParser implements Parser {
docTitle = MultiProtocolURI.unescape(location.getFileName()); docTitle = MultiProtocolURI.unescape(location.getFileName());
} }
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = UTF8.getBytes("");
try { try {
// create a writer for output // create a writer for output
final PDFTextStripper stripper = new PDFTextStripper(); final PDFTextStripper stripper = new PDFTextStripper();
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
contentBytes = UTF8.getBytes(writer.toString()); // remember text in case of interrupting thread
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated // we start the pdf parsing in a separate thread to ensure that it can be terminated
final Thread t = new Thread() { final Thread t = new Thread() {
@Override @Override
public void run() { public void run() {
try { try {
stripper.writeText(pdfDoc, writer); // may throw a NPE writer.append(stripper.getText(pdfDoc));
} catch (final Throwable e) {} } catch (final Throwable e) {}
} }
}; };
t.start(); t.start();
t.join(3000); t.join(3000);
if (t.isAlive()) t.interrupt(); if (t.isAlive()) t.interrupt();
pdfDoc.close(); pdfDoc.close();
contentBytes = UTF8.getBytes(writer.toString()); // get final text before closing writer
writer.close(); writer.close();
} catch (final IOException e) { } catch (final IOException e) {
// close the writer // close the writer
@ -168,9 +177,7 @@ public class pdfParser extends AbstractParser implements Parser {
docTitle = docSubject; docTitle = docSubject;
} }
byte[] contentBytes;
contentBytes = UTF8.getBytes(writer.toString());
// clear resources in pdfbox. they say that is resolved but it's not. see: // clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351 // https://issues.apache.org/jira/browse/PDFBOX-351

Loading…
Cancel
Save