diff --git a/.classpath b/.classpath index 0f0590b67..523593c72 100644 --- a/.classpath +++ b/.classpath @@ -1,39 +1,39 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/build.xml b/build.xml index 19463c4e7..c76440fc5 100644 --- a/build.xml +++ b/build.xml @@ -45,6 +45,7 @@ + @@ -178,7 +179,6 @@ - @@ -541,7 +541,7 @@ - @@ -558,12 +558,12 @@ - + - + diff --git a/lib/odf_utils_05_11_29.jar b/lib/odf_utils_05_11_29.jar deleted file mode 100644 index afa36068a..000000000 Binary files a/lib/odf_utils_05_11_29.jar and /dev/null differ diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index 9bf1329d2..24b656d76 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -33,22 +33,22 @@ import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; -import java.nio.charset.Charset; import java.util.Enumeration; import java.util.HashSet; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; -import com.catcode.odf.ODFMetaFileAnalyzer; -import com.catcode.odf.OpenDocumentMetadata; -import com.catcode.odf.OpenDocumentTextInputStream; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; import de.anomic.document.Document; +import de.anomic.document.parser.xml.ODContentHandler; +import de.anomic.document.parser.xml.ODMetaHandler; import de.anomic.http.httpClient; import de.anomic.http.httpHeader; import de.anomic.http.httpRequestHeader; @@ -126,6 +126,7 @@ public class odtParser extends AbstractParser implements Idiom { // opening the file as zip file final ZipFile 
zipFile= new ZipFile(dest); final Enumeration zipEnum = zipFile.entries(); + final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); // looping through all containing files while (zipEnum.hasMoreElements()) { @@ -150,18 +151,19 @@ public class odtParser extends AbstractParser implements Idiom { // extract data final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); - final OpenDocumentTextInputStream odStream = new OpenDocumentTextInputStream(zipFileEntryStream); - FileUtils.copy(odStream, writer, Charset.forName("UTF-8")); + final SAXParser saxParser = saxParserFactory.newSAXParser(); + saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); // close readers and writers - odStream.close(); + zipFileEntryStream.close(); writer.close(); } else if (entryName.equals("meta.xml")) { // meta.xml contains metadata about the document final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); - final ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer(); - final OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream); + final SAXParser saxParser = saxParserFactory.newSAXParser(); + final ODMetaHandler metaData = new ODMetaHandler(); + saxParser.parse(zipFileEntryStream, metaData); docDescription = metaData.getDescription(); docKeywordStr = metaData.getKeyword(); docShortTitle = metaData.getTitle(); @@ -260,7 +262,7 @@ public class odtParser extends AbstractParser implements Idiom { // Nothing todo here at the moment super.reset(); } - + public static void main(final String[] args) { try { if (args.length != 1) return; diff --git a/source/de/anomic/document/parser/xml/ODContentHandler.java b/source/de/anomic/document/parser/xml/ODContentHandler.java new file mode 100644 index 000000000..fed53a392 --- /dev/null +++ b/source/de/anomic/document/parser/xml/ODContentHandler.java @@ -0,0 +1,65 @@ +// ODContentHandler.java +// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
/**
 * SAX handler for the <code>content.xml</code> entry of an OpenDocument file.
 *
 * <p>All character data is forwarded unchanged to the supplied {@link Writer};
 * a line break is emitted after every closed paragraph (<code>text:p</code>)
 * and table row (<code>table:table-row</code>) so that the extracted plain
 * text keeps a readable structure.</p>
 *
 * <p>The handler neither flushes nor closes the writer; that stays the
 * responsibility of the caller.</p>
 *
 * @author f1ori
 */
public class ODContentHandler extends DefaultHandler {

    /** Sink that receives the extracted plain text. */
    private final Writer out;

    /**
     * Creates a handler that writes all extracted text to the given writer.
     *
     * @param out destination of the extracted text
     */
    public ODContentHandler(Writer out) {
        this.out = out;
    }

    /**
     * Copies the reported character data to the writer.
     *
     * @param ch     buffer holding the characters
     * @param start  offset of the first character in <code>ch</code>
     * @param length number of characters to copy
     * @throws org.xml.sax.SAXException if the writer fails; the original
     *         {@link IOException} is attached so the parse is aborted instead
     *         of silently producing truncated text
     */
    @Override
    public void characters(final char ch[], final int start, final int length) throws org.xml.sax.SAXException {
        try {
            out.write(ch, start, length);
        } catch (IOException e) {
            throw new org.xml.sax.SAXException("cannot write extracted text", e);
        }
    }

    /**
     * Emits a newline after paragraphs and table rows; every other closing
     * tag is ignored.
     *
     * @param uri  namespace URI of the element (unused)
     * @param name local name of the element (unused)
     * @param tag  qualified name of the element, e.g. <code>text:p</code>
     * @throws org.xml.sax.SAXException if the writer fails
     */
    @Override
    public void endElement(final String uri, final String name, final String tag) throws org.xml.sax.SAXException {
        if ("text:p".equals(tag) || "table:table-row".equals(tag)) {
            // add newlines after paragraphs and table rows
            try {
                out.append("\n");
            } catch (IOException e) {
                throw new org.xml.sax.SAXException("cannot write line break", e);
            }
        }
    }
}
/**
 * SAX handler for the <code>meta.xml</code> entry of an OpenDocument file.
 *
 * <p>It records the text content of the Dublin Core / OpenDocument metadata
 * elements <code>dc:creator</code>, <code>dc:language</code>,
 * <code>meta:keyword</code>, <code>dc:subject</code>, <code>dc:title</code>
 * and <code>dc:description</code>. Elements are matched by their qualified
 * name. Every getter returns <code>null</code> if the corresponding element
 * was never seen.</p>
 */
public class ODMetaHandler extends DefaultHandler {

    /** Collects the character data of the element currently being read. */
    private final StringBuilder buffer = new StringBuilder();

    private String docCreator = null;
    private String docLanguage = null;
    private String docKeyword = null;
    private String docSubject = null;
    private String docTitle = null;
    private String docDescription = null;

    public ODMetaHandler() {
    }

    /**
     * Appends the reported characters to the text of the current element.
     * The parser may deliver one text node in several chunks, hence the
     * accumulation.
     */
    @Override
    public void characters(final char ch[], final int start, final int length) {
        buffer.append(ch, start, length);
    }

    /**
     * Discards text collected so far; every element starts with an empty buffer.
     */
    @Override
    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
        buffer.setLength(0);
    }

    /**
     * Stores the collected text if the closed element is one of the known
     * metadata elements.
     *
     * @param uri  namespace URI of the element (unused)
     * @param name local name of the element (unused)
     * @param tag  qualified name of the element, e.g. <code>dc:title</code>
     */
    @Override
    public void endElement(final String uri, final String name, final String tag) {
        final String text = buffer.toString();
        if ("dc:creator".equals(tag)) {
            this.docCreator = text;
        } else if ("dc:language".equals(tag)) {
            this.docLanguage = text;
        } else if ("meta:keyword".equals(tag)) {
            addKeyword(text);
        } else if ("dc:subject".equals(tag)) {
            this.docSubject = text;
        } else if ("dc:title".equals(tag)) {
            this.docTitle = text;
        } else if ("dc:description".equals(tag)) {
            this.docDescription = text;
        }
    }

    /**
     * Adds one keyword to the keyword string. OpenDocument files carry one
     * <code>meta:keyword</code> element per keyword, so the values are
     * accumulated (comma-separated) instead of letting the last element
     * overwrite all earlier ones.
     */
    private void addKeyword(final String keyword) {
        if (this.docKeyword == null || this.docKeyword.length() == 0) {
            this.docKeyword = keyword;
        } else if (keyword.length() > 0) {
            this.docKeyword = this.docKeyword + "," + keyword;
        }
    }

    /** @return content of <code>dc:creator</code>, or <code>null</code> if absent */
    public String getCreator() {
        return docCreator;
    }

    /** @return content of <code>dc:language</code>, or <code>null</code> if absent */
    public String getLanguage() {
        return docLanguage;
    }

    /**
     * @return all <code>meta:keyword</code> values joined by commas,
     *         or <code>null</code> if the document has no keywords
     */
    public String getKeyword() {
        return docKeyword;
    }

    /** @return content of <code>dc:subject</code>, or <code>null</code> if absent */
    public String getSubject() {
        return docSubject;
    }

    /** @return content of <code>dc:title</code>, or <code>null</code> if absent */
    public String getTitle() {
        return docTitle;
    }

    /** @return content of <code>dc:description</code>, or <code>null</code> if absent */
    public String getDescription() {
        return docDescription;
    }
}
!= -1); + System.out.println("Parsed " + filename + ": " + str); + assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen")); + assertThat(doc.dc_title(), containsString(testFiles[i][2])); + assertThat(doc.dc_creator(), containsString(testFiles[i][3])); + assertThat(doc.dc_description(), containsString(testFiles[i][4])); } } } diff --git a/test/parsertest/umlaute_linux.odt b/test/parsertest/umlaute_linux.odt index 23d978cb1..1d70fdcc9 100755 Binary files a/test/parsertest/umlaute_linux.odt and b/test/parsertest/umlaute_linux.odt differ