|
|
|
@ -45,6 +45,7 @@ package de.anomic.plasma.parser.pdf;
|
|
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.FileOutputStream;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
import java.io.OutputStreamWriter;
|
|
|
|
|
import java.io.Writer;
|
|
|
|
@ -53,6 +54,7 @@ import java.util.Hashtable;
|
|
|
|
|
import org.pdfbox.pdfparser.PDFParser;
|
|
|
|
|
import org.pdfbox.pdmodel.PDDocument;
|
|
|
|
|
import org.pdfbox.pdmodel.PDDocumentInformation;
|
|
|
|
|
import org.pdfbox.pdmodel.encryption.AccessPermission;
|
|
|
|
|
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
|
|
|
|
|
import org.pdfbox.util.PDFTextStripper;
|
|
|
|
|
|
|
|
|
@ -62,6 +64,7 @@ import de.anomic.plasma.parser.AbstractParser;
|
|
|
|
|
import de.anomic.plasma.parser.Parser;
|
|
|
|
|
import de.anomic.plasma.parser.ParserException;
|
|
|
|
|
import de.anomic.server.serverCharBuffer;
|
|
|
|
|
import de.anomic.server.serverFileUtils;
|
|
|
|
|
import de.anomic.yacy.yacyURL;
|
|
|
|
|
|
|
|
|
|
public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
@ -78,7 +81,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
* @see Parser#getLibxDependences()
|
|
|
|
|
*/
|
|
|
|
|
private static final String[] LIBX_DEPENDENCIES = new String[] {
|
|
|
|
|
"PDFBox-0.7.3.jar", "FontBox-0.1.0-dev.jar", "bcprov-jdk14-132.jar"
|
|
|
|
|
"PDFBox-0.7.3.jar", "FontBox-0.1.0-dev.jar", "bcprov-jdk14-139.jar"
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
public pdfParser() {
|
|
|
|
@ -121,7 +124,8 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
|
|
|
|
|
if (theDocument.isEncrypted()) {
|
|
|
|
|
theDocument.openProtection(new StandardDecryptionMaterial(""));
|
|
|
|
|
if (!theDocument.getCurrentAccessPermission().canExtractContent())
|
|
|
|
|
final AccessPermission perm = theDocument.getCurrentAccessPermission();
|
|
|
|
|
if (perm == null || !perm.canExtractContent())
|
|
|
|
|
throw new ParserException("Document is encrypted",location,ErrorURL.DENIED_DOCUMENT_ENCRYPTED);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -193,6 +197,7 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
// delete the file
|
|
|
|
|
if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */}
|
|
|
|
|
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
|
|
|
|
|
} finally {
|
|
|
|
|
if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */}
|
|
|
|
@ -205,5 +210,58 @@ public class pdfParser extends AbstractParser implements Parser {
|
|
|
|
|
// Nothing to do here at the moment
|
|
|
|
|
super.reset();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* test
|
|
|
|
|
* @param args
|
|
|
|
|
*/
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
|
if(args.length > 0 && args[0].length() > 0) {
|
|
|
|
|
// file
|
|
|
|
|
final File pdfFile = new File(args[0]);
|
|
|
|
|
if(pdfFile.canRead()) {
|
|
|
|
|
|
|
|
|
|
System.out.println(pdfFile.getAbsolutePath());
|
|
|
|
|
final long startTime = System.currentTimeMillis();
|
|
|
|
|
|
|
|
|
|
// parse
|
|
|
|
|
final AbstractParser parser = new pdfParser();
|
|
|
|
|
plasmaParserDocument document = null;
|
|
|
|
|
try {
|
|
|
|
|
document = parser.parse(null, "application/pdf", null, pdfFile);
|
|
|
|
|
|
|
|
|
|
} catch (ParserException e) {
|
|
|
|
|
System.err.println("Cannot parse file "+ pdfFile.getAbsolutePath());
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
} catch (InterruptedException e) {
|
|
|
|
|
System.err.println("Interrupted while parsing!");
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
} catch (NoClassDefFoundError e) {
|
|
|
|
|
System.err.println("class not found: " + e.getMessage());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// statistics
|
|
|
|
|
System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");
|
|
|
|
|
|
|
|
|
|
// output
|
|
|
|
|
if(document == null) {
|
|
|
|
|
System.out.println("\t!!!Parsing without result!!!");
|
|
|
|
|
} else {
|
|
|
|
|
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
|
|
|
|
|
try {
|
|
|
|
|
// write file
|
|
|
|
|
serverFileUtils.copy(document.getText(), new File("parsedPdf.txt"));
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
System.err.println("error saving parsed document");
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
System.err.println("Cannot read file "+ pdfFile.getAbsolutePath());
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
System.out.println("Please give a filename as first argument.");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|