|
|
@ -88,24 +88,13 @@ public class pdfParser extends AbstractParser implements Idiom {
|
|
|
|
|
|
|
|
|
|
|
|
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
|
|
|
public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
|
|
|
|
|
|
|
|
|
|
|
|
PDDocument theDocument = null;
|
|
|
|
// create a pdf parser
|
|
|
|
Writer writer = null;
|
|
|
|
final PDDocument theDocument;
|
|
|
|
File writerFile = null;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// check for interruption
|
|
|
|
|
|
|
|
checkInterruption();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// creating a pdf parser
|
|
|
|
|
|
|
|
final PDFParser parser;
|
|
|
|
final PDFParser parser;
|
|
|
|
final PDFTextStripper stripper;
|
|
|
|
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
|
|
|
|
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
|
|
|
|
parser = new PDFParser(source);
|
|
|
|
parser = new PDFParser(source);
|
|
|
|
parser.parse();
|
|
|
|
parser.parse();
|
|
|
|
checkInterruption();
|
|
|
|
|
|
|
|
stripper = new PDFTextStripper();
|
|
|
|
|
|
|
|
theDocument = parser.getPDDocument();
|
|
|
|
theDocument = parser.getPDDocument();
|
|
|
|
} catch (IOException e) {
|
|
|
|
} catch (IOException e) {
|
|
|
|
Log.logException(e);
|
|
|
|
Log.logException(e);
|
|
|
@ -114,6 +103,8 @@ public class pdfParser extends AbstractParser implements Idiom {
|
|
|
|
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
|
|
|
|
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkInterruption();
|
|
|
|
|
|
|
|
|
|
|
|
if (theDocument.isEncrypted()) {
|
|
|
|
if (theDocument.isEncrypted()) {
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
theDocument.openProtection(new StandardDecryptionMaterial(""));
|
|
|
|
theDocument.openProtection(new StandardDecryptionMaterial(""));
|
|
|
@ -134,6 +125,7 @@ public class pdfParser extends AbstractParser implements Idiom {
|
|
|
|
|
|
|
|
|
|
|
|
// extracting some metadata
|
|
|
|
// extracting some metadata
|
|
|
|
final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
|
|
|
|
final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
|
|
|
|
|
|
|
|
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
|
|
|
|
if (theDocInfo != null) {
|
|
|
|
if (theDocInfo != null) {
|
|
|
|
docTitle = theDocInfo.getTitle();
|
|
|
|
docTitle = theDocInfo.getTitle();
|
|
|
|
docSubject = theDocInfo.getSubject();
|
|
|
|
docSubject = theDocInfo.getSubject();
|
|
|
@ -141,6 +133,8 @@ public class pdfParser extends AbstractParser implements Idiom {
|
|
|
|
docKeywordStr = theDocInfo.getKeywords();
|
|
|
|
docKeywordStr = theDocInfo.getKeywords();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Writer writer = null;
|
|
|
|
|
|
|
|
File writerFile = null;
|
|
|
|
try {
|
|
|
|
try {
|
|
|
|
// creating a writer for output
|
|
|
|
// creating a writer for output
|
|
|
|
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
|
|
|
|
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
|
|
|
@ -149,13 +143,9 @@ public class pdfParser extends AbstractParser implements Idiom {
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
writer = new CharBuffer();
|
|
|
|
writer = new CharBuffer();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
try {
|
|
|
|
final PDFTextStripper stripper = new PDFTextStripper();
|
|
|
|
stripper.writeText(theDocument, writer ); // may throw a NPE
|
|
|
|
stripper.writeText(theDocument, writer); // may throw a NPE
|
|
|
|
} catch (Exception e) {
|
|
|
|
theDocument.close();
|
|
|
|
Log.logException(e);
|
|
|
|
|
|
|
|
Log.logWarning("pdfParser", e.getMessage());
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
theDocument.close(); theDocument = null;
|
|
|
|
|
|
|
|
writer.close();
|
|
|
|
writer.close();
|
|
|
|
} catch (IOException e) {
|
|
|
|
} catch (IOException e) {
|
|
|
|
Log.logException(e);
|
|
|
|
Log.logException(e);
|
|
|
|