diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 812674b7a..c61536f69 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -107,8 +107,10 @@ public class pdfParser extends AbstractParser implements Parser { throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location); } final AccessPermission perm = pdfDoc.getCurrentAccessPermission(); - if (perm == null || !perm.canExtractContent()) + if (perm == null || !perm.canExtractContent()) { + try {pdfDoc.close();} catch (final IOException ee) {} throw new Parser.Failure("Document is encrypted and cannot be decrypted", location); + } } // extracting some metadata @@ -131,16 +133,16 @@ public class pdfParser extends AbstractParser implements Parser { if (docTitle == null || docTitle.isEmpty()) { docTitle = MultiProtocolURL.unescape(location.getFileName()); } - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; + final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); + byte[] contentBytes = new byte[0]; try { // create a writer for output - final PDFTextStripper stripper = new PDFTextStripper(); + final PDFTextStripper stripper = new PDFTextStripper(); stripper.setEndPage(3); // get first 3 pages (always) writer.append(stripper.getText(pdfDoc)); contentBytes = writer.getBytes(); // remember text in case of interrupting thread - + stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) stripper.setEndPage(Integer.MAX_VALUE); // set to default // we start the pdf parsing in a separate thread to ensure that it can be terminated @@ -149,14 +151,14 @@ public class pdfParser extends AbstractParser implements Parser { @Override public void run() { Thread.currentThread().setName("pdfParser.getText:" + location); - try { - writer.append(stripper.getText(pdfDocC)); + try { + writer.append(stripper.getText(pdfDocC)); } catch (final Throwable e) {} } }; t.start(); t.join(3000); - if (t.isAlive()) t.interrupt(); + if (t.isAlive()) t.interrupt(); pdfDoc.close(); contentBytes = writer.getBytes(); // get final text before closing writer } catch (final Throwable e) { @@ -176,7 +178,7 @@ public class pdfParser extends AbstractParser implements Parser { if (docTitle == null) { docTitle = docSubject; } - + // clear resources in pdfbox. they say that is resolved but it's not. see: // https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-351