Another update to the PDF parser: declare locals closer to their first use and drop the inner try/catch around stripper.writeText()

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6463 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 54c54fb144
commit 08f1cbb125

@ -88,24 +88,13 @@ public class pdfParser extends AbstractParser implements Idiom {
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null; // create a pdf parser
Writer writer = null; final PDDocument theDocument;
File writerFile = null;
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
// check for interruption
checkInterruption();
// creating a pdf parser
final PDFParser parser; final PDFParser parser;
final PDFTextStripper stripper;
try { try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY); Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
parser = new PDFParser(source); parser = new PDFParser(source);
parser.parse(); parser.parse();
checkInterruption();
stripper = new PDFTextStripper();
theDocument = parser.getPDDocument(); theDocument = parser.getPDDocument();
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);
@ -114,6 +103,8 @@ public class pdfParser extends AbstractParser implements Idiom {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY); Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
} }
checkInterruption();
if (theDocument.isEncrypted()) { if (theDocument.isEncrypted()) {
try { try {
theDocument.openProtection(new StandardDecryptionMaterial("")); theDocument.openProtection(new StandardDecryptionMaterial(""));
@ -134,6 +125,7 @@ public class pdfParser extends AbstractParser implements Idiom {
// extracting some metadata // extracting some metadata
final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
if (theDocInfo != null) { if (theDocInfo != null) {
docTitle = theDocInfo.getTitle(); docTitle = theDocInfo.getTitle();
docSubject = theDocInfo.getSubject(); docSubject = theDocInfo.getSubject();
@ -141,6 +133,8 @@ public class pdfParser extends AbstractParser implements Idiom {
docKeywordStr = theDocInfo.getKeywords(); docKeywordStr = theDocInfo.getKeywords();
} }
Writer writer = null;
File writerFile = null;
try { try {
// creating a writer for output // creating a writer for output
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
@ -149,13 +143,9 @@ public class pdfParser extends AbstractParser implements Idiom {
} else { } else {
writer = new CharBuffer(); writer = new CharBuffer();
} }
try { final PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(theDocument, writer ); // may throw a NPE stripper.writeText(theDocument, writer); // may throw a NPE
} catch (Exception e) { theDocument.close();
Log.logException(e);
Log.logWarning("pdfParser", e.getMessage());
}
theDocument.close(); theDocument = null;
writer.close(); writer.close();
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);

Loading…
Cancel
Save