more details for exception catching when parsing pdfs

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6461 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 18b21eaffe
commit 605e896d6c

@ -32,14 +32,17 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashSet;
import java.util.Set;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.util.PDFTextStripper;
@ -88,46 +91,57 @@ public class pdfParser extends AbstractParser implements Idiom {
PDDocument theDocument = null;
Writer writer = null;
File writerFile = null;
try {
// reducing thread priority
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
// deactivating the logging for jMimeMagic
// Logger theLogger = Logger.getLogger("org.pdfbox");
// theLogger.setLevel(Level.INFO);
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
// check for interruption
checkInterruption();
// creating a pdf parser
final PDFParser parser = new PDFParser(source);
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
// check for interruption
checkInterruption();
// creating a pdf parser
final PDFParser parser;
final PDFTextStripper stripper;
try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
parser = new PDFParser(source);
parser.parse();
// check for interruption
checkInterruption();
// creating a text stripper
final PDFTextStripper stripper = new PDFTextStripper();
stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
if (theDocument.isEncrypted()) {
} catch (IOException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
} finally {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
if (theDocument.isEncrypted()) {
try {
theDocument.openProtection(new StandardDecryptionMaterial(""));
final AccessPermission perm = theDocument.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new ParserException("Document is encrypted", location);
} catch (BadSecurityHandlerException e) {
Log.logException(e);
throw new ParserException("Document is encrypted (1): " + e.getMessage(), location);
} catch (IOException e) {
Log.logException(e);
throw new ParserException("Document is encrypted (2): " + e.getMessage(), location);
} catch (CryptographyException e) {
Log.logException(e);
throw new ParserException("Document is encrypted (3): " + e.getMessage(), location);
}
// extracting some metadata
final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
if (theDocInfo != null) {
docTitle = theDocInfo.getTitle();
docSubject = theDocInfo.getSubject();
docAuthor = theDocInfo.getAuthor();
docKeywordStr = theDocInfo.getKeywords();
}
final AccessPermission perm = theDocument.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new ParserException("Document is encrypted and cannot decrypted", location);
}
// extracting some metadata
final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
if (theDocInfo != null) {
docTitle = theDocInfo.getTitle();
docSubject = theDocInfo.getSubject();
docAuthor = theDocInfo.getAuthor();
docKeywordStr = theDocInfo.getKeywords();
}
try {
// creating a writer for output
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".prt");
@ -138,67 +152,64 @@ public class pdfParser extends AbstractParser implements Idiom {
try {
stripper.writeText(theDocument, writer ); // may throw a NPE
} catch (Exception e) {
Log.logException(e);
Log.logWarning("pdfParser", e.getMessage());
}
theDocument.close(); theDocument = null;
writer.close();
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
Document theDoc = null;
if (writer instanceof CharBuffer) {
final byte[] contentBytes = ((CharBuffer)writer).toString().getBytes("UTF-8");
theDoc = new Document(
location,
mimeType,
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docAuthor,
null,
null,
contentBytes,
null,
null);
} else {
theDoc = new Document(
location,
mimeType,
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docAuthor,
null,
null,
writerFile,
null,
null);
}
return theDoc;
}
catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
} catch (IOException e) {
Log.logException(e);
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */}
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
// delete the file
if (writerFile != null) FileUtils.deletedelete(writerFile);
throw new ParserException(e.getMessage(), location);
}
Log.logException(e);
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally {
if (theDocument != null) try { theDocument.close(); } catch (final Exception e) {/* ignore this */}
if (writer != null) try { writer.close(); } catch (final Exception e) {/* ignore this */}
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
Document theDoc = null;
if (writer instanceof CharBuffer) {
byte[] contentBytes;
try {
contentBytes = ((CharBuffer) writer).toString().getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
}
theDoc = new Document(
location,
mimeType,
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docAuthor,
null,
null,
contentBytes,
null,
null);
} else {
theDoc = new Document(
location,
mimeType,
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docAuthor,
null,
null,
writerFile,
null,
null);
}
return theDoc;
}
@Override

Loading…
Cancel
Save