pdfParser: updated lib, fixed ClassNotFoundException: CMSError

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4776 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
danielr 17 years ago
parent 0d3808bd9e
commit ae03a54d23

@ -36,7 +36,8 @@
<classpathentry kind="lib" path="libx/commons-logging.jar"/> <classpathentry kind="lib" path="libx/commons-logging.jar"/>
<classpathentry kind="lib" path="libx/PDFBox-0.7.3.jar"/> <classpathentry kind="lib" path="libx/PDFBox-0.7.3.jar"/>
<classpathentry kind="lib" path="libx/FontBox-0.1.0-dev.jar"/> <classpathentry kind="lib" path="libx/FontBox-0.1.0-dev.jar"/>
<classpathentry kind="lib" path="libx/bcprov-jdk14-132.jar"/> <classpathentry kind="lib" path="libx/bcprov-jdk14-139.jar"/>
<classpathentry kind="lib" path="libx/bcmail-jdk14-139.jar"/>
<classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/> <classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/>
<classpathentry kind="lib" path="lib/commons-httpclient-3.1.jar"/> <classpathentry kind="lib" path="lib/commons-httpclient-3.1.jar"/>
<classpathentry kind="output" path="gen"/> <classpathentry kind="output" path="gen"/>

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -18,7 +18,8 @@
<pathelement location="${libx}/PDFBox-0.7.3.jar" /> <pathelement location="${libx}/PDFBox-0.7.3.jar" />
<!-- additional fontparsing lib, part of PDFBox --> <!-- additional fontparsing lib, part of PDFBox -->
<pathelement location="${libx}/FontBox-0.1.0-dev.jar" /> <pathelement location="${libx}/FontBox-0.1.0-dev.jar" />
<pathelement location="${libx}/bcprov-jdk14-132.jar" /> <pathelement location="${libx}/bcprov-jdk14-139.jar" />
<pathelement location="${libx}/bcmail-jdk14-139.jar" />
</classpath> </classpath>
</javac> </javac>
</target> </target>

@ -45,6 +45,7 @@ package de.anomic.plasma.parser.pdf;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStreamWriter; import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
@ -53,6 +54,7 @@ import java.util.Hashtable;
import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation; import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.pdmodel.encryption.AccessPermission;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFTextStripper; import org.pdfbox.util.PDFTextStripper;
@ -62,6 +64,7 @@ import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverCharBuffer; import de.anomic.server.serverCharBuffer;
import de.anomic.server.serverFileUtils;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
public class pdfParser extends AbstractParser implements Parser { public class pdfParser extends AbstractParser implements Parser {
@ -78,7 +81,7 @@ public class pdfParser extends AbstractParser implements Parser {
* @see Parser#getLibxDependences() * @see Parser#getLibxDependences()
*/ */
private static final String[] LIBX_DEPENDENCIES = new String[] { private static final String[] LIBX_DEPENDENCIES = new String[] {
"PDFBox-0.7.3.jar", "FontBox-0.1.0-dev.jar", "bcprov-jdk14-132.jar" "PDFBox-0.7.3.jar", "FontBox-0.1.0-dev.jar", "bcprov-jdk14-139.jar"
}; };
public pdfParser() { public pdfParser() {
@ -121,7 +124,8 @@ public class pdfParser extends AbstractParser implements Parser {
if (theDocument.isEncrypted()) { if (theDocument.isEncrypted()) {
theDocument.openProtection(new StandardDecryptionMaterial("")); theDocument.openProtection(new StandardDecryptionMaterial(""));
if (!theDocument.getCurrentAccessPermission().canExtractContent()) final AccessPermission perm = theDocument.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new ParserException("Document is encrypted",location,ErrorURL.DENIED_DOCUMENT_ENCRYPTED); throw new ParserException("Document is encrypted",location,ErrorURL.DENIED_DOCUMENT_ENCRYPTED);
} }
@ -193,6 +197,7 @@ public class pdfParser extends AbstractParser implements Parser {
// delete the file // delete the file
if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */} if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */}
e.printStackTrace();
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location); throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally { } finally {
if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */} if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */}
@ -205,5 +210,58 @@ public class pdfParser extends AbstractParser implements Parser {
// Nothing todo here at the moment // Nothing todo here at the moment
super.reset(); super.reset();
} }
/**
 * Command-line test driver: parses a single PDF file and reports the outcome.
 * <p>
 * Prints the absolute path of the file, runs {@link #parse} on it with the
 * mime type <code>application/pdf</code>, prints the elapsed time and, if a
 * document was produced, its text length and number of anchors. The document
 * text is then handed to <code>serverFileUtils.copy</code> with the target
 * <code>parsedPdf.txt</code> (a relative path).
 * <p>
 * Failures are reported on <code>System.err</code>; the method never throws
 * a parser exception to its caller.
 *
 * @param args command-line arguments; <code>args[0]</code> must be the path
 *             of a readable PDF file
 */
public static void main(String[] args) {
    if (args.length == 0 || args[0].length() == 0) {
        System.out.println("Please give a filename as first argument.");
        return;
    }

    final File pdfFile = new File(args[0]);
    // File.canRead() is also true for readable directories, so additionally
    // require a regular file before handing the path to the parser
    if (!pdfFile.isFile() || !pdfFile.canRead()) {
        System.err.println("Cannot read file "+ pdfFile.getAbsolutePath());
        return;
    }

    System.out.println(pdfFile.getAbsolutePath());
    final long startTime = System.currentTimeMillis();

    // parse
    final AbstractParser parser = new pdfParser();
    plasmaParserDocument document = null;
    try {
        // first and third argument are deliberately left null in this test driver
        document = parser.parse(null, "application/pdf", null, pdfFile);
    } catch (ParserException e) {
        System.err.println("Cannot parse file "+ pdfFile.getAbsolutePath());
        e.printStackTrace();
    } catch (InterruptedException e) {
        System.err.println("Interrupted while parsing!");
        e.printStackTrace();
        // catching InterruptedException clears the interrupt flag; restore it
        // so that code running later on this thread can still observe it
        Thread.currentThread().interrupt();
    } catch (NoClassDefFoundError e) {
        // a missing library jar shows up here; report it instead of crashing
        System.err.println("class not found: " + e.getMessage());
    }

    // statistics
    System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");

    // output
    if (document == null) {
        System.out.println("\t!!!Parsing without result!!!");
        return;
    }

    System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
    try {
        // write file
        serverFileUtils.copy(document.getText(), new File("parsedPdf.txt"));
    } catch (IOException e) {
        System.err.println("error saving parsed document");
        e.printStackTrace();
    }
}
} }

Loading…
Cancel
Save