diff --git a/.classpath b/.classpath
index 0f0590b67..523593c72 100644
--- a/.classpath
+++ b/.classpath
@@ -1,39 +1,39 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/build.xml b/build.xml
index 19463c4e7..c76440fc5 100644
--- a/build.xml
+++ b/build.xml
@@ -45,6 +45,7 @@
+
@@ -178,7 +179,6 @@
-
@@ -541,7 +541,7 @@
-
@@ -558,12 +558,12 @@
-
+
-
+
diff --git a/lib/odf_utils_05_11_29.jar b/lib/odf_utils_05_11_29.jar
deleted file mode 100644
index afa36068a..000000000
Binary files a/lib/odf_utils_05_11_29.jar and /dev/null differ
diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java
index 9bf1329d2..24b656d76 100644
--- a/source/de/anomic/document/parser/odtParser.java
+++ b/source/de/anomic/document/parser/odtParser.java
@@ -33,22 +33,22 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
-import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
-import com.catcode.odf.ODFMetaFileAnalyzer;
-import com.catcode.odf.OpenDocumentMetadata;
-import com.catcode.odf.OpenDocumentTextInputStream;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
+import de.anomic.document.parser.xml.ODContentHandler;
+import de.anomic.document.parser.xml.ODMetaHandler;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
@@ -126,6 +126,7 @@ public class odtParser extends AbstractParser implements Idiom {
// opening the file as zip file
final ZipFile zipFile= new ZipFile(dest);
final Enumeration<? extends ZipEntry> zipEnum = zipFile.entries();
+ final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
// looping through all containing files
while (zipEnum.hasMoreElements()) {
@@ -150,18 +151,19 @@ public class odtParser extends AbstractParser implements Idiom {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
- final OpenDocumentTextInputStream odStream = new OpenDocumentTextInputStream(zipFileEntryStream);
- FileUtils.copy(odStream, writer, Charset.forName("UTF-8"));
+ final SAXParser saxParser = saxParserFactory.newSAXParser();
+ saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
- odStream.close();
+ zipFileEntryStream.close();
writer.close();
} else if (entryName.equals("meta.xml")) {
// meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
- final ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer();
- final OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream);
+ final SAXParser saxParser = saxParserFactory.newSAXParser();
+ final ODMetaHandler metaData = new ODMetaHandler();
+ saxParser.parse(zipFileEntryStream, metaData);
docDescription = metaData.getDescription();
docKeywordStr = metaData.getKeyword();
docShortTitle = metaData.getTitle();
@@ -260,7 +262,7 @@ public class odtParser extends AbstractParser implements Idiom {
// Nothing todo here at the moment
super.reset();
}
-
+
public static void main(final String[] args) {
try {
if (args.length != 1) return;
diff --git a/source/de/anomic/document/parser/xml/ODContentHandler.java b/source/de/anomic/document/parser/xml/ODContentHandler.java
new file mode 100644
index 000000000..fed53a392
--- /dev/null
+++ b/source/de/anomic/document/parser/xml/ODContentHandler.java
@@ -0,0 +1,65 @@
+// ODContentHandler.java
+// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 16.07.2007 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2009-07-09 22:13:11 +0200 (Do, 09. Jul 2009) $
+// $LastChangedRevision: 6186 $
+// $LastChangedBy: low012 $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.document.parser.xml;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler for the content.xml entry of an OpenDocument file: copies all character
+ * data to a Writer and appends a newline after every text:p and table:table-row element.
+ * A failing Writer aborts the parse with a SAXException instead of being ignored.
+ * @author f1ori
+ */
+public class ODContentHandler extends DefaultHandler {
+    private final Writer out;
+    /** @param out writer that receives the extracted text */
+    public ODContentHandler(Writer out) {
+        this.out = out;
+    }
+    /** Forwards character data unchanged to the writer. */
+    @Override
+    public void characters(final char ch[], final int start, final int length) throws org.xml.sax.SAXException {
+        try {
+            out.write(ch, start, length);
+        } catch (IOException e) {
+            throw new org.xml.sax.SAXException("could not write document text", e);
+        }
+    }
+    /** Ends paragraphs and table rows (matched by qualified name) with a newline. */
+    @Override
+    public void endElement(final String uri, final String name, final String tag) throws org.xml.sax.SAXException {
+        if (!"text:p".equals(tag) && !"table:table-row".equals(tag)) { return; }
+        try {
+            out.append("\n");
+        } catch (IOException e) {
+            throw new org.xml.sax.SAXException("could not write line break", e);
+        }
+    }
+}
\ No newline at end of file
diff --git a/source/de/anomic/document/parser/xml/ODMetaHandler.java b/source/de/anomic/document/parser/xml/ODMetaHandler.java
new file mode 100644
index 000000000..b6486ec11
--- /dev/null
+++ b/source/de/anomic/document/parser/xml/ODMetaHandler.java
@@ -0,0 +1,67 @@
+package de.anomic.document.parser.xml;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that picks the dc:* and meta:keyword values out of the XML
+ * stream it is fed (the meta.xml entry of an OpenDocument file).
+ * Elements are recognized by their qualified name, prefix included.
+ */
+public class ODMetaHandler extends DefaultHandler {
+    // character data seen since the most recent start tag
+    private StringBuilder text = new StringBuilder();
+
+    // captured values; each one stays null until its element has been closed
+    private String creator = null;
+    private String language = null;
+    private String keyword = null;
+    private String subject = null;
+    private String title = null;
+    private String description = null;
+
+    public ODMetaHandler() {
+    }
+
+    /** Appends incoming character data to the pending text. */
+    @Override
+    public void characters(final char ch[], final int start, final int length) {
+        text.append(ch, start, length);
+    }
+
+    /** Drops the pending text whenever a new element is opened. */
+    @Override
+    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
+        text.setLength(0);
+    }
+
+    /** Stores the pending text if the closed element is one of the known metadata tags. */
+    @Override
+    public void endElement(final String uri, final String name, final String tag) {
+        final String value = text.toString();
+        if ("dc:title".equals(tag)) {
+            title = value;
+        } else if ("dc:description".equals(tag)) {
+            description = value;
+        } else if ("dc:subject".equals(tag)) {
+            subject = value;
+        } else if ("meta:keyword".equals(tag)) {
+            keyword = value;
+        } else if ("dc:creator".equals(tag)) {
+            creator = value;
+        } else if ("dc:language".equals(tag)) {
+            language = value;
+        }
+    }
+
+    // Accessors: each returns the text of the most recently closed matching
+    // element, or null when no such element has been closed yet.
+    public String getCreator() { return creator; }
+    public String getLanguage() { return language; }
+    public String getKeyword() { return keyword; }
+    public String getSubject() { return subject; }
+    public String getTitle() { return title; }
+    public String getDescription() { return description; }
+}
+
diff --git a/test/de/anomic/document/ParserTest.java b/test/de/anomic/document/ParserTest.java
index 8d17e875a..f90879f08 100644
--- a/test/de/anomic/document/ParserTest.java
+++ b/test/de/anomic/document/ParserTest.java
@@ -2,12 +2,14 @@ package de.anomic.document;
import static org.junit.Assert.*;
import org.junit.Test;
+import static org.junit.matchers.JUnitMatchers.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.Reader;
import java.io.InputStreamReader;
+import de.anomic.document.Document;
import de.anomic.document.Parser;
import de.anomic.yacy.yacyURL;
@@ -17,11 +19,12 @@ public class ParserTest {
de.anomic.document.ParserException, java.net.MalformedURLException,
java.io.UnsupportedEncodingException, java.io.IOException {
String[][] testFiles = new String[][] {
- new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text"},
- new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat"},
- new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation"},
- new String[]{"umlaute_linux.pdf", "application/pdf"},
- new String[]{"umlaute_windows.doc", "application/msword"},
+ // meaning: filename in test/parsertest, mimetype, title, creator, description,
+ new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
+ new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""},
+ new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},
+ new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""},
+ new String[]{"umlaute_windows.doc", "application/msword", "", "", ""},
};
@@ -38,10 +41,12 @@ public class ParserTest {
while( (c = content.read()) != -1 )
str.append((char)c);
- System.out.println("Parsed: " + str);
-
- assertTrue(str.indexOf("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen") != -1);
+ System.out.println("Parsed " + filename + ": " + str);
+ assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
+ assertThat(doc.dc_title(), containsString(testFiles[i][2]));
+ assertThat(doc.dc_creator(), containsString(testFiles[i][3]));
+ assertThat(doc.dc_description(), containsString(testFiles[i][4]));
}
}
}
diff --git a/test/parsertest/umlaute_linux.odt b/test/parsertest/umlaute_linux.odt
index 23d978cb1..1d70fdcc9 100755
Binary files a/test/parsertest/umlaute_linux.odt and b/test/parsertest/umlaute_linux.odt differ