more details for exception catching when parsing pdfs

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6461 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 605e896d6c
parent 18b21eaffe
commit 605e896d6c
1 changed files with 97 additions and 86 deletions
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -32,14 +32,17 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.util.HashSet;
 import java.util.Set;

+import org.apache.pdfbox.exceptions.CryptographyException;
 import org.apache.pdfbox.pdfparser.PDFParser;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
+import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
 import org.apache.pdfbox.util.PDFTextStripper;

@ -88,46 +91,57 @@ public class pdfParser extends AbstractParser implements Idiom {
        PDDocument theDocument = null;
        Writer writer = null;
        File writerFile = null;
-        try {       
-            // reducing thread priority
-            Thread.currentThread().setPriority(Thread.MIN_PRIORITY);                        
-            
-            // deactivating the logging for jMimeMagic
-//            Logger theLogger = Logger.getLogger("org.pdfbox");
-//            theLogger.setLevel(Level.INFO);            
-            
-            String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
-            
-            // check for interruption
-            checkInterruption();
-            
-            // creating a pdf parser
-            final PDFParser parser = new PDFParser(source);
+        
+        String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
+        
+        // check for interruption
+        checkInterruption();
+        
+        // creating a pdf parser
+        final PDFParser parser;
+        final PDFTextStripper stripper;
+        try {
+            Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
+            parser = new PDFParser(source);
            parser.parse();
-                        
-            // check for interruption
            checkInterruption();
-            
-            // creating a text stripper
-            final PDFTextStripper stripper = new PDFTextStripper();
+            stripper = new PDFTextStripper();
            theDocument = parser.getPDDocument();
-            
-            if (theDocument.isEncrypted()) {
+        } catch (IOException e) {
+            Log.logException(e);
+            throw new ParserException(e.getMessage(), location);
+        } finally {
+            Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
+        }
+        
+        if (theDocument.isEncrypted()) {
+            try {
                theDocument.openProtection(new StandardDecryptionMaterial(""));
-                final AccessPermission perm = theDocument.getCurrentAccessPermission();
-                if (perm == null || !perm.canExtractContent())
-                    throw new ParserException("Document is encrypted", location);
+            } catch (BadSecurityHandlerException e) {
+                Log.logException(e);
+                throw new ParserException("Document is encrypted (1): " + e.getMessage(), location);
+            } catch (IOException e) {
+                Log.logException(e);
+                throw new ParserException("Document is encrypted (2): " + e.getMessage(), location);
+            } catch (CryptographyException e) {
+                Log.logException(e);
+                throw new ParserException("Document is encrypted (3): " + e.getMessage(), location);
            }
-            
-            // extracting some metadata
-            final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();            
-            if (theDocInfo != null) {
-                docTitle = theDocInfo.getTitle();
-                docSubject = theDocInfo.getSubject();
-                docAuthor = theDocInfo.getAuthor();
-                docKeywordStr = theDocInfo.getKeywords();
-            }            
-            
+            final AccessPermission perm = theDocument.getCurrentAccessPermission();
+            if (perm == null || !perm.canExtractContent())
+                throw new ParserException("Document is encrypted and cannot decrypted", location);
+        }
+        
+        // extracting some metadata
+        final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();            
+        if (theDocInfo != null) {
+            docTitle = theDocInfo.getTitle();
+            docSubject = theDocInfo.getSubject();
+            docAuthor = theDocInfo.getAuthor();
+            docKeywordStr = theDocInfo.getKeywords();
+        }            
+        
+        try {
            // creating a writer for output
            if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
                writerFile = File.createTempFile("pdfParser",".prt");
@ -138,67 +152,64 @@ public class pdfParser extends AbstractParser implements Idiom {
            try {
                stripper.writeText(theDocument, writer ); // may throw a NPE
            } catch (Exception e) {
+                Log.logException(e);
                Log.logWarning("pdfParser", e.getMessage());
            }
            theDocument.close(); theDocument = null;            
            writer.close();
-
-            
-            String[] docKeywords = null;
-            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
-            
-            Document theDoc = null;
-            
-            if (writer instanceof CharBuffer) {
-                final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8");
-                theDoc = new Document(
-                        location,
-                        mimeType,
-                        "UTF-8",
-                        null,
-                        docKeywords,
-                        (docTitle == null) ? docSubject : docTitle,
-                        docAuthor,
-                        null,
-                        null,
-                        contentBytes,
-                        null,
-                        null);
-            } else {
-                theDoc = new Document(
-                        location,
-                        mimeType,
-                        "UTF-8",
-                        null,
-                        docKeywords,
-                        (docTitle == null) ? docSubject : docTitle,
-                        docAuthor,
-                        null,
-                        null,
-                        writerFile,
-                        null,
-                        null);                
-            }
-            
-            return theDoc;
-        }
-        catch (final Exception e) {       
-            if (e instanceof InterruptedException) throw (InterruptedException) e;
-            if (e instanceof ParserException) throw (ParserException) e;
-            
+        } catch (IOException e) {
+            Log.logException(e);
            // close the writer
-            if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */}
+            if (writer != null) try { writer.close(); } catch (final Exception ex) {}
            
            // delete the file
            if (writerFile != null) FileUtils.deletedelete(writerFile);
+            throw new ParserException(e.getMessage(), location);
+        }
            
-            Log.logException(e);
-            throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location); 
-        } finally {
-            if (theDocument != null) try { theDocument.close(); } catch (final Exception e) {/* ignore this */}
-            if (writer != null)      try { writer.close(); }      catch (final Exception e) {/* ignore this */}
-            Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
+        String[] docKeywords = null;
+        if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
+        
+        Document theDoc = null;
+        
+        if (writer instanceof CharBuffer) {
+            byte[] contentBytes;
+            try {
+                contentBytes = ((CharBuffer) writer).toString().getBytes("UTF-8");
+            } catch (UnsupportedEncodingException e) {
+                Log.logException(e);
+                throw new ParserException(e.getMessage(), location);
+            }
+            theDoc = new Document(
+                    location,
+                    mimeType,
+                    "UTF-8",
+                    null,
+                    docKeywords,
+                    (docTitle == null) ? docSubject : docTitle,
+                    docAuthor,
+                    null,
+                    null,
+                    contentBytes,
+                    null,
+                    null);
+        } else {
+            theDoc = new Document(
+                    location,
+                    mimeType,
+                    "UTF-8",
+                    null,
+                    docKeywords,
+                    (docTitle == null) ? docSubject : docTitle,
+                    docAuthor,
+                    null,
+                    null,
+                    writerFile,
+                    null,
+                    null);                
        }
+        
+        return theDoc;
    }
    
    @Override