* add new odf parser based on sax-xml-parser

* remove odf_utils-jar
* test metadata in ParserTest


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6231 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 16 years ago
parent de4f0a006f
commit 67da20647f

@ -1,39 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/util"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
<classpathentry kind="lib" path="lib/xerces.jar"/>
<classpathentry kind="lib" path="lib/bzip2.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.7-bin.jar"/>
<classpathentry kind="lib" path="lib/bcmail-jdk14-139.jar"/>
<classpathentry kind="lib" path="lib/bcprov-jdk14-139.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.3.jar"/>
<classpathentry kind="lib" path="lib/FontBox-0.1.0-dev.jar"/>
<classpathentry kind="lib" path="lib/J7Zip-modified.jar"/>
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
<classpathentry kind="lib" path="lib/jsch-0.1.21.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.9.jar"/>
<classpathentry kind="lib" path="lib/PDFBox-0.7.3.jar"/>
<classpathentry kind="lib" path="lib/poi-3.2-FINAL-20081019.jar"/>
<classpathentry kind="lib" path="lib/poi-scratchpad-3.2-FINAL-20081019.jar"/>
<classpathentry kind="lib" path="lib/webcat-0.1-swf.jar"/>
<classpathentry kind="lib" path="lib/odf_utils_05_11_29.jar"/>
<classpathentry kind="lib" path="lib/activation.jar"/>
<classpathentry kind="lib" path="lib/commons-jxpath-1.3.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/util"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
<classpathentry kind="lib" path="lib/xerces.jar"/>
<classpathentry kind="lib" path="lib/bzip2.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.7-bin.jar"/>
<classpathentry kind="lib" path="lib/bcmail-jdk14-139.jar"/>
<classpathentry kind="lib" path="lib/bcprov-jdk14-139.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.3.jar"/>
<classpathentry kind="lib" path="lib/FontBox-0.1.0-dev.jar"/>
<classpathentry kind="lib" path="lib/J7Zip-modified.jar"/>
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.7.jar"/>
<classpathentry kind="lib" path="lib/jsch-0.1.21.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.9.jar"/>
<classpathentry kind="lib" path="lib/PDFBox-0.7.3.jar"/>
<classpathentry kind="lib" path="lib/poi-3.2-FINAL-20081019.jar"/>
<classpathentry kind="lib" path="lib/poi-scratchpad-3.2-FINAL-20081019.jar"/>
<classpathentry kind="lib" path="lib/webcat-0.1-swf.jar"/>
<classpathentry kind="lib" path="lib/activation.jar"/>
<classpathentry kind="lib" path="lib/commons-jxpath-1.3.jar"/>
<classpathentry kind="lib" path="libt/junit.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -45,6 +45,7 @@
<property name="libbuild" location="libbuild"/>
<property name="build" location="classes"/>
<property name="htroot" location="htroot"/>
<property name="test" location="test"/>
<property name="langstats" location="langstats"/>
<property name="locales" location="locales"/>
<property name="skins" location="skins"/>
@ -178,7 +179,6 @@
<pathelement location="${lib}/jsch-0.1.21.jar" />
<pathelement location="${lib}/log4j-1.2.9.jar" />
<pathelement location="${lib}/mysql-connector-java-5.1.7-bin.jar" />
<pathelement location="${lib}/odf_utils_05_11_29.jar" />
<pathelement location="${lib}/poi-3.2-FINAL-20081019.jar" />
<pathelement location="${lib}/poi-scratchpad-3.2-FINAL-20081019.jar" />
<pathelement location="${lib}/servlet-api.jar" />
@ -541,7 +541,7 @@
<!-- run unittests-->
<target name="compileTest" depends="compile" description="run unittests">
<javac srcdir="test/" destdir="test/"
<javac srcdir="${test}" destdir="${test}"
debug="true" debuglevel="lines,vars,source"
source="${javacSource}" target="${javacTarget}">
<classpath>
@ -558,12 +558,12 @@
<junit printsummary="yes" haltonfailure="no">
<formatter type="plain"/>
<batchtest>
<fileset dir="test/">
<fileset dir="${test}">
<include name="**/*Test*.java"/>
</fileset>
</batchtest>
<classpath>
<pathelement location="test/"/>
<pathelement location="${test}"/>
<pathelement location="${build}"/>
<pathelement location="${htroot}"/>
<pathelement location="${libt}/junit.jar" />

Binary file not shown.

@ -33,22 +33,22 @@ import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import com.catcode.odf.ODFMetaFileAnalyzer;
import com.catcode.odf.OpenDocumentMetadata;
import com.catcode.odf.OpenDocumentTextInputStream;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom;
import de.anomic.document.ParserException;
import de.anomic.document.Document;
import de.anomic.document.parser.xml.ODContentHandler;
import de.anomic.document.parser.xml.ODMetaHandler;
import de.anomic.http.httpClient;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRequestHeader;
@ -126,6 +126,7 @@ public class odtParser extends AbstractParser implements Idiom {
// opening the file as zip file
final ZipFile zipFile= new ZipFile(dest);
final Enumeration<? extends ZipEntry> zipEnum = zipFile.entries();
final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
// looping through all containing files
while (zipEnum.hasMoreElements()) {
@ -150,18 +151,19 @@ public class odtParser extends AbstractParser implements Idiom {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
final OpenDocumentTextInputStream odStream = new OpenDocumentTextInputStream(zipFileEntryStream);
FileUtils.copy(odStream, writer, Charset.forName("UTF-8"));
final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
odStream.close();
zipFileEntryStream.close();
writer.close();
} else if (entryName.equals("meta.xml")) {
// meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
final ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer();
final OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream);
final SAXParser saxParser = saxParserFactory.newSAXParser();
final ODMetaHandler metaData = new ODMetaHandler();
saxParser.parse(zipFileEntryStream, metaData);
docDescription = metaData.getDescription();
docKeywordStr = metaData.getKeyword();
docShortTitle = metaData.getTitle();
@ -260,7 +262,7 @@ public class odtParser extends AbstractParser implements Idiom {
// Nothing todo here at the moment
super.reset();
}
public static void main(final String[] args) {
try {
if (args.length != 1) return;

@ -0,0 +1,65 @@
// ODContentHandler.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 16.07.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-07-09 22:13:11 +0200 (Do, 09. Jul 2009) $
// $LastChangedRevision: 6186 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.document.parser.xml;
import java.io.IOException;
import java.io.Writer;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler for the content.xml file of an OpenDocument file.
 * <p>
 * Every run of character data reported by the parser is forwarded unchanged
 * to a {@link Writer}. A newline is appended each time a <code>text:p</code>
 * or <code>table:table-row</code> element is closed, so that paragraphs and
 * table rows end up on separate lines of the extracted text.
 * <p>
 * The handler never flushes or closes the writer; that stays with the caller.
 *
 * @author f1ori
 */
public class ODContentHandler extends DefaultHandler {

    /** Sink that receives the extracted plain text. */
    private final Writer out;

    /**
     * Creates a handler that writes all extracted text to the given writer.
     *
     * @param out the writer receiving the text
     */
    public ODContentHandler(Writer out) {
        this.out = out;
    }

    /**
     * Forwards the reported characters to the writer.
     *
     * @param ch     the character array handed in by the parser
     * @param start  index of the first character to write
     * @param length number of characters to write
     * @throws SAXException if the writer fails; the original IOException is
     *                      attached as cause so parsing stops instead of
     *                      silently producing truncated text
     */
    @Override
    public void characters(final char ch[], final int start, final int length) throws SAXException {
        try {
            out.write(ch, start, length);
        } catch (IOException e) {
            throw new SAXException("could not write text of OpenDocument content", e);
        }
    }

    /**
     * Appends a newline after paragraphs and table rows; all other closing
     * tags are ignored.
     *
     * @param uri  the namespace URI (unused)
     * @param name the local name (unused)
     * @param tag  the qualified element name that is compared
     * @throws SAXException if the writer fails; the original IOException is
     *                      attached as cause
     */
    @Override
    public void endElement(final String uri, final String name, final String tag) throws SAXException {
        if ("text:p".equals(tag) || "table:table-row".equals(tag)) {
            // add newlines after paragraphs and table rows
            try {
                out.append("\n");
            } catch (IOException e) {
                throw new SAXException("could not write line break of OpenDocument content", e);
            }
        }
    }
}

@ -0,0 +1,67 @@
package de.anomic.document.parser.xml;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler for the meta.xml file of an OpenDocument file.
 * <p>
 * The text of the elements <code>dc:creator</code>, <code>dc:language</code>,
 * <code>meta:keyword</code>, <code>dc:subject</code>, <code>dc:title</code>
 * and <code>dc:description</code> is collected and made available through
 * the getters once parsing has finished. A getter returns <code>null</code>
 * if the corresponding element was never closed during parsing.
 */
public class ODMetaHandler extends DefaultHandler {

    /** Separator placed between the values of repeated meta:keyword elements. */
    private static final String KEYWORD_SEPARATOR = ", ";

    /** Character data collected since the most recent opening tag. */
    private final StringBuilder buffer = new StringBuilder();

    private String docCreator = null;
    private String docLanguage = null;
    private String docKeyword = null;
    private String docSubject = null;
    private String docTitle = null;
    private String docDescription = null;

    public ODMetaHandler() {
    }

    /**
     * Collects character data; the parser may deliver the text of one element
     * in several chunks, so the chunks are appended to the buffer.
     */
    @Override
    public void characters(final char ch[], final int start, final int length) {
        buffer.append(ch, start, length);
    }

    /**
     * Discards everything collected so far, so the buffer only holds the text
     * that follows the most recent opening tag.
     */
    @Override
    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
        buffer.setLength(0);
    }

    /**
     * Stores the buffered text if the closed element is one of the supported
     * metadata elements; the qualified name <code>tag</code> is compared.
     * <p>
     * A meta.xml may contain several <code>meta:keyword</code> elements (one
     * per keyword), so keywords are accumulated instead of overwritten. All
     * other fields keep the value of the last matching element.
     */
    @Override
    public void endElement(final String uri, final String name, final String tag) {
        if ("dc:creator".equals(tag)) {
            this.docCreator = buffer.toString();
        } else if ("dc:language".equals(tag)) {
            this.docLanguage = buffer.toString();
        } else if ("meta:keyword".equals(tag)) {
            final String keyword = buffer.toString();
            if (this.docKeyword == null || this.docKeyword.length() == 0) {
                // first keyword: same result as for a single-keyword document
                this.docKeyword = keyword;
            } else if (keyword.length() > 0) {
                this.docKeyword = this.docKeyword + KEYWORD_SEPARATOR + keyword;
            }
        } else if ("dc:subject".equals(tag)) {
            this.docSubject = buffer.toString();
        } else if ("dc:title".equals(tag)) {
            this.docTitle = buffer.toString();
        } else if ("dc:description".equals(tag)) {
            this.docDescription = buffer.toString();
        }
    }

    /** @return text of the dc:creator element, or null if none was seen */
    public String getCreator() {
        return docCreator;
    }

    /** @return text of the dc:language element, or null if none was seen */
    public String getLanguage() {
        return docLanguage;
    }

    /**
     * @return the text of all meta:keyword elements in document order, joined
     *         with ", "; null if none was seen
     */
    public String getKeyword() {
        return docKeyword;
    }

    /** @return text of the dc:subject element, or null if none was seen */
    public String getSubject() {
        return docSubject;
    }

    /** @return text of the dc:title element, or null if none was seen */
    public String getTitle() {
        return docTitle;
    }

    /** @return text of the dc:description element, or null if none was seen */
    public String getDescription() {
        return docDescription;
    }
}

@ -2,12 +2,14 @@ package de.anomic.document;
import static org.junit.Assert.*;
import org.junit.Test;
import static org.junit.matchers.JUnitMatchers.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.Reader;
import java.io.InputStreamReader;
import de.anomic.document.Document;
import de.anomic.document.Parser;
import de.anomic.yacy.yacyURL;
@ -17,11 +19,12 @@ public class ParserTest {
de.anomic.document.ParserException, java.net.MalformedURLException,
java.io.UnsupportedEncodingException, java.io.IOException {
String[][] testFiles = new String[][] {
new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text"},
new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat"},
new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation"},
new String[]{"umlaute_linux.pdf", "application/pdf"},
new String[]{"umlaute_windows.doc", "application/msword"},
// columns: file name in test/parsertest, MIME type, expected title, expected creator, expected description
new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""},
new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},
new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""},
new String[]{"umlaute_windows.doc", "application/msword", "", "", ""},
};
@ -38,10 +41,12 @@ public class ParserTest {
while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed: " + str);
assertTrue(str.indexOf("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen") != -1);
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFiles[i][2]));
assertThat(doc.dc_creator(), containsString(testFiles[i][3]));
assertThat(doc.dc_description(), containsString(testFiles[i][4]));
}
}
}

Binary file not shown.
Loading…
Cancel
Save