More detailed exception handling when parsing PDFs

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6461 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 18b21eaffe
commit 605e896d6c

@ -32,14 +32,17 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashSet;
import java.util.Set;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFTextStripper;
@ -88,13 +91,6 @@ public class pdfParser extends AbstractParser implements Idiom {
PDDocument theDocument = null;
Writer writer = null;
File writerFile = null;
try {
// reducing thread priority
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
// deactivating the logging for pdfbox
// Logger theLogger = Logger.getLogger("org.pdfbox");
// theLogger.setLevel(Level.INFO);
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
@ -102,21 +98,38 @@ public class pdfParser extends AbstractParser implements Idiom {
checkInterruption();
// creating a pdf parser
final PDFParser parser = new PDFParser(source);
final PDFParser parser;
final PDFTextStripper stripper;
try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
parser = new PDFParser(source);
parser.parse();
// check for interruption
checkInterruption();
// creating a text stripper
final PDFTextStripper stripper = new PDFTextStripper();
stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
} catch (IOException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
} finally {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
if (theDocument.isEncrypted()) {
try {
theDocument.openProtection(new StandardDecryptionMaterial(""));
} catch (BadSecurityHandlerException e) {
Log.logException(e);
throw new ParserException("Document is encrypted (1): " + e.getMessage(), location);
} catch (IOException e) {
Log.logException(e);
throw new ParserException("Document is encrypted (2): " + e.getMessage(), location);
} catch (CryptographyException e) {
Log.logException(e);
throw new ParserException("Document is encrypted (3): " + e.getMessage(), location);
}
final AccessPermission perm = theDocument.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new ParserException("Document is encrypted", location);
throw new ParserException("Document is encrypted and cannot decrypted", location);
}
// extracting some metadata
@ -128,6 +141,7 @@ public class pdfParser extends AbstractParser implements Idiom {
docKeywordStr = theDocInfo.getKeywords();
}
try {
// creating a writer for output
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".prt");
@ -138,11 +152,20 @@ public class pdfParser extends AbstractParser implements Idiom {
try {
stripper.writeText(theDocument, writer ); // may throw a NPE
} catch (Exception e) {
Log.logException(e);
Log.logWarning("pdfParser", e.getMessage());
}
theDocument.close(); theDocument = null;
writer.close();
} catch (IOException e) {
Log.logException(e);
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
// delete the file
if (writerFile != null) FileUtils.deletedelete(writerFile);
throw new ParserException(e.getMessage(), location);
}
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
@ -150,7 +173,13 @@ public class pdfParser extends AbstractParser implements Idiom {
Document theDoc = null;
if (writer instanceof CharBuffer) {
final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8");
byte[] contentBytes;
try {
contentBytes = ((CharBuffer) writer).toString().getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
}
theDoc = new Document(
location,
mimeType,
@ -182,24 +211,6 @@ public class pdfParser extends AbstractParser implements Idiom {
return theDoc;
}
catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */}
// delete the file
if (writerFile != null) FileUtils.deletedelete(writerFile);
Log.logException(e);
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally {
if (theDocument != null) try { theDocument.close(); } catch (final Exception e) {/* ignore this */}
if (writer != null) try { writer.close(); } catch (final Exception e) {/* ignore this */}
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
}
@Override
public void reset() {

Loading…
Cancel
Save