Update PDF parser libraries (pdfbox, fontbox, jempbox) from 1.7.0 to 1.7.1

pull/1/head
Michael Peter Christen 12 years ago
parent 4a9182ae16
commit 95712fdc8b

@ -46,9 +46,6 @@
<classpathentry kind="lib" path="lib/servlet-api-2.5-20081211.jar"/>
<classpathentry kind="lib" path="lib/jetty-6.1.26-patched-JETTY-1340.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-6.1.26-patched-JETTY-1340.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.7.0.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.7.0.jar"/>
<classpathentry kind="lib" path="lib/jempbox-1.7.0.jar"/>
<classpathentry kind="lib" path="lib/jaudiotagger-2.0.4-20111207.115108-15.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.7.jar"/>
<classpathentry kind="lib" path="lib/jcl-over-slf4j-1.6.4.jar"/>
@ -59,17 +56,20 @@
<classpathentry kind="lib" path="lib/apache-solr-solrj-4.0.0.jar" sourcepath="/Users/admin/Development/sourcecode/apache-solr-4.0.0-src/src/java"/>
<classpathentry kind="lib" path="lib/lucene-analyzers-common-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-analyzers-phonetic-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-core-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-core-4.0.0.jar" sourcepath="/Users/admin/Development/sourcecode/apache-solr-4.0.0-src/src/java"/>
<classpathentry kind="lib" path="lib/lucene-grouping-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-highlighter-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-memory-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-misc-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-misc-4.0.0.jar" sourcepath="/Users/admin/Development/sourcecode/apache-solr-4.0.0-src/src/java"/>
<classpathentry kind="lib" path="lib/lucene-queries-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-queryparser-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-spatial-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/lucene-suggest-4.0.0.jar"/>
<classpathentry kind="lib" path="lib/zookeeper-3.3.6.jar"/>
<classpathentry kind="lib" path="lib/spatial4j-0.3.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.7.1.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.7.1.jar"/>
<classpathentry kind="lib" path="lib/jempbox-1.7.1.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="lib" path="lib/icu4j-core.jar"/>
<classpathentry kind="lib" path="lib/htmllexer.jar"/>

@ -53,7 +53,7 @@
<string>$JAVAROOT/lib/commons-jxpath-1.3.jar</string>
<string>$JAVAROOT/lib/commons-lang-2.6.jar</string>
<string>$JAVAROOT/lib/commons-logging-1.1.1.jar</string>
<string>$JAVAROOT/lib/fontbox-1.7.0.jar</string>
<string>$JAVAROOT/lib/fontbox-1.7.1.jar</string>
<string>$JAVAROOT/lib/geronimo-stax-api_1.0_spec-1.0.1.jar</string>
<string>$JAVAROOT/lib/guava-r05.jar</string>
<string>$JAVAROOT/lib/htmllexer.jar</string>
@ -66,7 +66,7 @@
<string>$JAVAROOT/lib/jaudiotagger-2.0.4-20111207.115108-15.jar</string>
<string>$JAVAROOT/lib/jcifs-1.3.15.jar</string>
<string>$JAVAROOT/lib/jcl-over-slf4j-1.6.4.jar</string>
<string>$JAVAROOT/lib/jempbox-1.7.0.jar</string>
<string>$JAVAROOT/lib/jempbox-1.7.1.jar</string>
<string>$JAVAROOT/lib/jena-2.6.4.jar</string>
<string>$JAVAROOT/lib/jetty-6.1.26-patched-JETTY-1340.jar</string>
<string>$JAVAROOT/lib/jetty-util-6.1.26-patched-JETTY-1340.jar</string>
@ -87,7 +87,7 @@
<string>$JAVAROOT/lib/lucene-suggest-4.0.0.jar</string>
<string>$JAVAROOT/lib/metadata-extractor-2.4.0-beta-1.jar</string>
<string>$JAVAROOT/lib/mysql-connector-java-5.1.12-bin.jar</string>
<string>$JAVAROOT/lib/pdfbox-1.7.0.jar</string>
<string>$JAVAROOT/lib/pdfbox-1.7.1.jar</string>
<string>$JAVAROOT/lib/poi-3.6-20091214.jar</string>
<string>$JAVAROOT/lib/poi-scratchpad-3.6-20091214.jar</string>
<string>$JAVAROOT/lib/sax-2.0.1.jar</string>

@ -172,7 +172,7 @@
<pathelement location="${lib}/commons-jxpath-1.3.jar" />
<pathelement location="${lib}/commons-lang-2.6.jar" />
<pathelement location="${lib}/commons-logging-1.1.1.jar" />
<pathelement location="${lib}/fontbox-1.7.0.jar" />
<pathelement location="${lib}/fontbox-1.7.1.jar" />
<pathelement location="${lib}/geronimo-stax-api_1.0_spec-1.0.1.jar" />
<pathelement location="${lib}/guava-r05.jar" />
<pathelement location="${lib}/htmllexer.jar" />
@ -185,7 +185,7 @@
<pathelement location="${lib}/jaudiotagger-2.0.4-20111207.115108-15.jar" />
<pathelement location="${lib}/jcifs-1.3.15.jar" />
<pathelement location="${lib}/jcl-over-slf4j-1.6.4.jar" />
<pathelement location="${lib}/jempbox-1.7.0" />
<pathelement location="${lib}/jempbox-1.7.1.jar" />
<pathelement location="${lib}/jena-2.6.4.jar" />
<pathelement location="${lib}/jetty-6.1.26-patched-JETTY-1340.jar" />
<pathelement location="${lib}/jetty-util-6.1.26-patched-JETTY-1340.jar" />
@ -207,7 +207,7 @@
<pathelement location="${lib}/lucene-suggest-4.0.0.jar" />
<pathelement location="${lib}/metadata-extractor-2.4.0-beta-1.jar" />
<pathelement location="${lib}/mysql-connector-java-5.1.12-bin.jar" />
<pathelement location="${lib}/pdfbox-1.7.0.jar" />
<pathelement location="${lib}/pdfbox-1.7.1.jar" />
<pathelement location="${lib}/poi-3.6-20091214.jar" />
<pathelement location="${lib}/poi-scratchpad-3.6-20091214.jar" />
<pathelement location="${lib}/sax-2.0.1.jar" />

@ -33,16 +33,6 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
@ -53,6 +43,16 @@ import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
public class pdfParser extends AbstractParser implements Parser {

Loading…
Cancel
Save