diff --git a/.classpath b/.classpath
index 523593c72..e0d179186 100644
--- a/.classpath
+++ b/.classpath
@@ -1,6 +1,7 @@
+
@@ -34,6 +35,6 @@
-
+
diff --git a/source/de/anomic/document/Parser.java b/source/de/anomic/document/Parser.java
index fc06420c5..2cc45216d 100644
--- a/source/de/anomic/document/Parser.java
+++ b/source/de/anomic/document/Parser.java
@@ -44,6 +44,7 @@ import de.anomic.document.parser.docParser;
import de.anomic.document.parser.gzipParser;
import de.anomic.document.parser.htmlParser;
import de.anomic.document.parser.odtParser;
+import de.anomic.document.parser.ooxmlParser;
import de.anomic.document.parser.pdfParser;
import de.anomic.document.parser.pptParser;
import de.anomic.document.parser.psParser;
@@ -81,6 +82,7 @@ public final class Parser {
initParser(new gzipParser());
initParser(new htmlParser());
initParser(new odtParser());
+ initParser(new ooxmlParser());
initParser(new pdfParser());
initParser(new pptParser());
initParser(new psParser());
diff --git a/source/de/anomic/document/parser/docParser.java b/source/de/anomic/document/parser/docParser.java
index 423b3197f..a91a541b0 100644
--- a/source/de/anomic/document/parser/docParser.java
+++ b/source/de/anomic/document/parser/docParser.java
@@ -50,7 +50,6 @@ public class docParser extends AbstractParser implements Idiom {
public static final Set SUPPORTED_EXTENSIONS = new HashSet();
static {
SUPPORTED_EXTENSIONS.add("doc");
- SUPPORTED_EXTENSIONS.add("docx");
SUPPORTED_MIME_TYPES.add("application/msword");
SUPPORTED_MIME_TYPES.add("application/doc");
SUPPORTED_MIME_TYPES.add("appl/text");
diff --git a/source/de/anomic/document/parser/ooxmlParser.java b/source/de/anomic/document/parser/ooxmlParser.java
new file mode 100644
index 000000000..af71e4565
--- /dev/null
+++ b/source/de/anomic/document/parser/ooxmlParser.java
@@ -0,0 +1,275 @@
+//odtParser.java
+//------------------------
+//part of YaCy
+//(C) by Michael Peter Christen; mc@yacy.net
+//first published on http://www.anomic.de
+//Frankfurt, Germany, 2005
+//
+//this file is contributed by Martin Thelian
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+//This program is free software; you can redistribute it and/or modify
+//it under the terms of the GNU General Public License as published by
+//the Free Software Foundation; either version 2 of the License, or
+//(at your option) any later version.
+//
+//This program is distributed in the hope that it will be useful,
+//but WITHOUT ANY WARRANTY; without even the implied warranty of
+//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+//GNU General Public License for more details.
+//
+//You should have received a copy of the GNU General Public License
+//along with this program; if not, write to the Free Software
+//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.document.parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import de.anomic.crawler.retrieval.HTTPLoader;
+import de.anomic.document.AbstractParser;
+import de.anomic.document.Idiom;
+import de.anomic.document.ParserException;
+import de.anomic.document.Document;
+
+import de.anomic.document.parser.xml.ODContentHandler;
+import de.anomic.document.parser.xml.ODMetaHandler;
+import de.anomic.http.client.Client;
+import de.anomic.http.metadata.HeaderFramework;
+import de.anomic.http.metadata.RequestHeader;
+import de.anomic.kelondro.util.FileUtils;
+import de.anomic.server.serverCharBuffer;
+import de.anomic.yacy.yacyURL;
+
+public class ooxmlParser extends AbstractParser implements Idiom {
+
+ /**
+ * a list of mime types that are supported by this parser class
+ * @see #getSupportedMimeTypes()
+ */
+ public static final Set SUPPORTED_MIME_TYPES = new HashSet();
+ public static final Set SUPPORTED_EXTENSIONS = new HashSet();
+ static {
+ SUPPORTED_EXTENSIONS.add("docx");
+ SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ SUPPORTED_EXTENSIONS.add("dotx");
+ SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.wordprocessingml.template");
+ SUPPORTED_EXTENSIONS.add("potx");
+ SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.template");
+ SUPPORTED_EXTENSIONS.add("ppsx");
+ SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.slideshow");
+ SUPPORTED_EXTENSIONS.add("pptx");
+ SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
+ SUPPORTED_EXTENSIONS.add("xlsx");
+ SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ SUPPORTED_EXTENSIONS.add("xltx");
+ SUPPORTED_MIME_TYPES.add("application/vnd.openxmlformats-officedocument.spreadsheetml.template");
+ }
+
+ public ooxmlParser() {
+ super("Open Office XML Document Parser");
+ }
+
+ public Set supportedMimeTypes() {
+ return SUPPORTED_MIME_TYPES;
+ }
+
+ public Set supportedExtensions() {
+ return SUPPORTED_EXTENSIONS;
+ }
+
+ @Override
+ public Document parse(final yacyURL location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException {
+
+ Writer writer = null;
+ File writerFile = null;
+ try {
+ String docDescription = null;
+ String docKeywordStr = null;
+ String docShortTitle = null;
+ String docLongTitle = null;
+ String docAuthor = null;
+ String docLanguage = null;
+
+ // opening the file as zip file
+ final ZipFile zipFile= new ZipFile(dest);
+ final Enumeration extends ZipEntry> zipEnum = zipFile.entries();
+ final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
+
+ // looping through all containing files
+ while (zipEnum.hasMoreElements()) {
+ // check for interruption
+ checkInterruption();
+
+ // getting the next zip file entry
+ final ZipEntry zipEntry= zipEnum.nextElement();
+ final String entryName = zipEntry.getName();
+
+ // content.xml contains the document content in xml format
+ if (entryName.equals("word/document.xml")
+ || entryName.startsWith("ppt/slides/slide")
+ || entryName.startsWith("xl/worksheets/sheet")) {
+ final long contentSize = zipEntry.getSize();
+
+ // creating a writer for output
+ if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
+ writerFile = File.createTempFile("ooxmlParser",".prt");
+ writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
+ } else {
+ writer = new serverCharBuffer();
+ }
+
+ // extract data
+ final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
+ final SAXParser saxParser = saxParserFactory.newSAXParser();
+ saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
+
+ // close readers and writers
+ zipFileEntryStream.close();
+ writer.close();
+
+ } else if (entryName.equals("docProps/core.xml")) {
+ // meta.xml contains metadata about the document
+ final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
+ final SAXParser saxParser = saxParserFactory.newSAXParser();
+ final ODMetaHandler metaData = new ODMetaHandler();
+ saxParser.parse(zipFileEntryStream, metaData);
+ docDescription = metaData.getDescription();
+ docKeywordStr = metaData.getKeyword();
+ docShortTitle = metaData.getTitle();
+ docLongTitle = metaData.getSubject();
+ docAuthor = metaData.getCreator();
+ docLanguage = metaData.getLanguage();
+ }
+ }
+
+ // make the languages set
+ Set languages = new HashSet(1);
+ if (docLanguage != null && docLanguage.length() == 0)
+ languages.add(docLanguage);
+
+ // if there is no title availabe we generate one
+ if (docLongTitle == null || docLongTitle.length() == 0) {
+ if (docShortTitle != null) {
+ docLongTitle = docShortTitle;
+ }
+ }
+
+ // split the keywords
+ String[] docKeywords = null;
+ if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
+
+ // create the parser document
+ Document theDoc = null;
+ if (writer instanceof serverCharBuffer) {
+ final byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
+ theDoc = new Document(
+ location,
+ mimeType,
+ "UTF-8",
+ languages,
+ docKeywords,
+ docLongTitle,
+ docAuthor,
+ null,
+ docDescription,
+ contentBytes,
+ null,
+ null);
+ } else {
+ theDoc = new Document(
+ location,
+ mimeType,
+ "UTF-8",
+ languages,
+ docKeywords,
+ docLongTitle,
+ docAuthor,
+ null,
+ docDescription,
+ writerFile,
+ null,
+ null);
+ }
+ return theDoc;
+ } catch (final Exception e) {
+ if (e instanceof InterruptedException) throw (InterruptedException) e;
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ // close the writer
+ if (writer != null) try { writer.close(); } catch (final Exception ex) {/* ignore this */}
+
+ // delete the file
+ if (writerFile != null) FileUtils.deletedelete(writerFile);
+ e.printStackTrace();
+ throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location);
+ }
+ }
+
+ public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
+ File dest = null;
+ try {
+ // creating a tempfile
+ dest = File.createTempFile("OpenDocument", ".odt");
+ dest.deleteOnExit();
+
+ // copying the stream into a file
+ FileUtils.copy(source, dest);
+
+ // parsing the content
+ return parse(location, mimeType, charset, dest);
+ } catch (final Exception e) {
+ if (e instanceof InterruptedException) throw (InterruptedException) e;
+ if (e instanceof ParserException) throw (ParserException) e;
+
+ throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location);
+ } finally {
+ if (dest != null) FileUtils.deletedelete(dest);
+ }
+ }
+
+ @Override
+ public void reset() {
+ // Nothing todo here at the moment
+ super.reset();
+ }
+
+ public static void main(final String[] args) {
+ try {
+ if (args.length != 1) return;
+
+ // getting the content URL
+ final yacyURL contentUrl = new yacyURL(args[0], null);
+
+ // creating a new parser
+ final odtParser testParser = new odtParser();
+
+ // downloading the document content
+ final RequestHeader reqHeader = new RequestHeader();
+ reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
+ final byte[] content = Client.wget(contentUrl.toString(), reqHeader, 10000);
+ final ByteArrayInputStream input = new ByteArrayInputStream(content);
+
+ // parsing the document
+ testParser.parse(contentUrl, "application/vnd.oasis.opendocument.text", null, input);
+ } catch (final Exception e) {
+ e.printStackTrace();
+ }
+ }
+}
diff --git a/source/de/anomic/document/parser/xml/ODContentHandler.java b/source/de/anomic/document/parser/xml/ODContentHandler.java
index fed53a392..23b80421d 100644
--- a/source/de/anomic/document/parser/xml/ODContentHandler.java
+++ b/source/de/anomic/document/parser/xml/ODContentHandler.java
@@ -53,7 +53,7 @@ public class ODContentHandler extends DefaultHandler {
}
@Override
public void endElement(final String uri, final String name, final String tag) {
- if ("text:p".equals(tag) || "table:table-row".equals(tag)) {
+ if ("text:p".equals(tag) || "table:table-row".equals(tag) || "w:p".equals(tag)) {
// add newlines after paragraphs
try {
out.append("\n");
diff --git a/test/de/anomic/document/ParserTest.java b/test/de/anomic/document/ParserTest.java
index f90879f08..51ebb5927 100644
--- a/test/de/anomic/document/ParserTest.java
+++ b/test/de/anomic/document/ParserTest.java
@@ -20,6 +20,7 @@ public class ParserTest {
java.io.UnsupportedEncodingException, java.io.IOException {
String[][] testFiles = new String[][] {
// meaning: filename in test/parsertest, mimetype, title, creator, description,
+ new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""},
new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},