diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 617d0c367..23f4e647b 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -32,14 +32,17 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.HashSet; import java.util.Set; +import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; +import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.apache.pdfbox.util.PDFTextStripper; @@ -88,46 +91,57 @@ public class pdfParser extends AbstractParser implements Idiom { PDDocument theDocument = null; Writer writer = null; File writerFile = null; - try { - // reducing thread priority - Thread.currentThread().setPriority(Thread.MIN_PRIORITY); - - // deactivating the logging for jMimeMagic -// Logger theLogger = Logger.getLogger("org.pdfbox"); -// theLogger.setLevel(Level.INFO); - - String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null; - - // check for interruption - checkInterruption(); - - // creating a pdf parser - final PDFParser parser = new PDFParser(source); + + String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null; + + // check for interruption + checkInterruption(); + + // creating a pdf parser + final PDFParser parser; + final PDFTextStripper stripper; + try { + Thread.currentThread().setPriority(Thread.MIN_PRIORITY); + parser = new PDFParser(source); parser.parse(); - - // check for interruption checkInterruption(); - - // creating a text stripper - final PDFTextStripper stripper = new PDFTextStripper(); + stripper = new PDFTextStripper(); theDocument = parser.getPDDocument(); - - if (theDocument.isEncrypted()) { + } catch (IOException e) { + Log.logException(e); + throw new ParserException(e.getMessage(), location); + } finally { + Thread.currentThread().setPriority(Thread.NORM_PRIORITY); + } + + if (theDocument.isEncrypted()) { + try { theDocument.openProtection(new StandardDecryptionMaterial("")); - final AccessPermission perm = theDocument.getCurrentAccessPermission(); - if (perm == null || !perm.canExtractContent()) - throw new ParserException("Document is encrypted", location); + } catch (BadSecurityHandlerException e) { + Log.logException(e); + throw new ParserException("Document is encrypted (1): " + e.getMessage(), location); + } catch (IOException e) { + Log.logException(e); + throw new ParserException("Document is encrypted (2): " + e.getMessage(), location); + } catch (CryptographyException e) { + Log.logException(e); + throw new ParserException("Document is encrypted (3): " + e.getMessage(), location); } - - // extracting some metadata - final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); - if (theDocInfo != null) { - docTitle = theDocInfo.getTitle(); - docSubject = theDocInfo.getSubject(); - docAuthor = theDocInfo.getAuthor(); - docKeywordStr = theDocInfo.getKeywords(); - } - + final AccessPermission perm = theDocument.getCurrentAccessPermission(); + if (perm == null || !perm.canExtractContent()) + throw new ParserException("Document is encrypted and cannot decrypted", location); + } + + // extracting some metadata + final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); + if (theDocInfo != null) { + docTitle = theDocInfo.getTitle(); + docSubject = theDocInfo.getSubject(); + docAuthor = theDocInfo.getAuthor(); + docKeywordStr = theDocInfo.getKeywords(); + } + + try { // creating a writer for output if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { writerFile = File.createTempFile("pdfParser",".prt"); @@ -138,67 +152,64 @@ public class pdfParser extends AbstractParser implements Idiom { try { stripper.writeText(theDocument, writer ); // may throw a NPE } catch (Exception e) { + Log.logException(e); Log.logWarning("pdfParser", e.getMessage()); } theDocument.close(); theDocument = null; writer.close(); - - - String[] docKeywords = null; - if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); - - Document theDoc = null; - - if (writer instanceof CharBuffer) { - final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8"); - theDoc = new Document( - location, - mimeType, - "UTF-8", - null, - docKeywords, - (docTitle == null) ? docSubject : docTitle, - docAuthor, - null, - null, - contentBytes, - null, - null); - } else { - theDoc = new Document( - location, - mimeType, - "UTF-8", - null, - docKeywords, - (docTitle == null) ? docSubject : docTitle, - docAuthor, - null, - null, - writerFile, - null, - null); - } - - return theDoc; - } - catch (final Exception e) { - if (e instanceof InterruptedException) throw (InterruptedException) e; - if (e instanceof ParserException) throw (ParserException) e; - + } catch (IOException e) { + Log.logException(e); // close the writer - if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */} + if (writer != null) try { writer.close(); } catch (final Exception ex) {} // delete the file if (writerFile != null) FileUtils.deletedelete(writerFile); + throw new ParserException(e.getMessage(), location); + } - Log.logException(e); - throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location); - } finally { - if (theDocument != null) try { theDocument.close(); } catch (final Exception e) {/* ignore this */} - if (writer != null) try { writer.close(); } catch (final Exception e) {/* ignore this */} - Thread.currentThread().setPriority(Thread.NORM_PRIORITY); + String[] docKeywords = null; + if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); + + Document theDoc = null; + + if (writer instanceof CharBuffer) { + byte[] contentBytes; + try { + contentBytes = ((CharBuffer) writer).toString().getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + Log.logException(e); + throw new ParserException(e.getMessage(), location); + } + theDoc = new Document( + location, + mimeType, + "UTF-8", + null, + docKeywords, + (docTitle == null) ? docSubject : docTitle, + docAuthor, + null, + null, + contentBytes, + null, + null); + } else { + theDoc = new Document( + location, + mimeType, + "UTF-8", + null, + docKeywords, + (docTitle == null) ? docSubject : docTitle, + docAuthor, + null, + null, + writerFile, + null, + null); } + + return theDoc; } @Override