|
|
|
@ -107,8 +107,10 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
|
|
|
|
|
}
|
|
|
|
|
final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
|
|
|
|
|
if (perm == null || !perm.canExtractContent())
|
|
|
|
|
if (perm == null || !perm.canExtractContent()) {
|
|
|
|
|
try {pdfDoc.close();} catch (final IOException ee) {}
|
|
|
|
|
throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// extracting some metadata
|
|
|
|
@ -131,16 +133,16 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
if (docTitle == null || docTitle.isEmpty()) {
|
|
|
|
|
docTitle = MultiProtocolURL.unescape(location.getFileName());
|
|
|
|
|
}
|
|
|
|
|
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
|
|
|
|
|
byte[] contentBytes = new byte[0];
|
|
|
|
|
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
|
|
|
|
|
byte[] contentBytes = new byte[0];
|
|
|
|
|
try {
|
|
|
|
|
// create a writer for output
|
|
|
|
|
final PDFTextStripper stripper = new PDFTextStripper();
|
|
|
|
|
final PDFTextStripper stripper = new PDFTextStripper();
|
|
|
|
|
|
|
|
|
|
stripper.setEndPage(3); // get first 3 pages (always)
|
|
|
|
|
writer.append(stripper.getText(pdfDoc));
|
|
|
|
|
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
|
|
|
|
|
stripper.setEndPage(Integer.MAX_VALUE); // set to default
|
|
|
|
|
// we start the pdf parsing in a separate thread to ensure that it can be terminated
|
|
|
|
@ -149,14 +151,14 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
@Override
|
|
|
|
|
public void run() {
|
|
|
|
|
Thread.currentThread().setName("pdfParser.getText:" + location);
|
|
|
|
|
try {
|
|
|
|
|
writer.append(stripper.getText(pdfDocC));
|
|
|
|
|
try {
|
|
|
|
|
writer.append(stripper.getText(pdfDocC));
|
|
|
|
|
} catch (final Throwable e) {}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
t.start();
|
|
|
|
|
t.join(3000);
|
|
|
|
|
if (t.isAlive()) t.interrupt();
|
|
|
|
|
if (t.isAlive()) t.interrupt();
|
|
|
|
|
pdfDoc.close();
|
|
|
|
|
contentBytes = writer.getBytes(); // get final text before closing writer
|
|
|
|
|
} catch (final Throwable e) {
|
|
|
|
@ -176,7 +178,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
if (docTitle == null) {
|
|
|
|
|
docTitle = docSubject;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// clear resources in pdfbox. they say that is resolved but it's not. see:
|
|
|
|
|
// https://issues.apache.org/jira/browse/PDFBOX-313
|
|
|
|
|
// https://issues.apache.org/jira/browse/PDFBOX-351
|
|
|
|
|