- plasmaParserDocument can process subdocuments now (other archive-parsers may want to use this method)
- added 7zip parser - added 'text/sgml' to realtime parseable mimetypes (sometimes returned by the mime type parser) - added new cached output stream class, very suitable for parsers because of limited memory git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3740 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
b1680ab71f
commit
0a64047081
@ -1,42 +1,43 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry kind="src" path="source"/>
|
||||
<classpathentry excluding="htdocsdefault/|locale/|proxymsg/|yacy/|env/|env/" kind="src" output="htroot" path="htroot"/>
|
||||
<classpathentry kind="src" path="htroot/htdocsdefault"/>
|
||||
<classpathentry kind="src" path="htroot/yacy"/>
|
||||
<classpathentry kind="src" path="htroot/env"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry kind="lib" path="lib/commons-collections.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-pool.jar"/>
|
||||
<classpathentry kind="lib" path="lib/tar.jar"/>
|
||||
<classpathentry kind="lib" path="libx/axis-ant.jar"/>
|
||||
<classpathentry kind="lib" path="libx/axis.jar"/>
|
||||
<classpathentry kind="lib" path="libx/bzip2.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-discovery.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-logging.jar"/>
|
||||
<classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jaxrpc.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jdom.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jmimemagic-0.1.0.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jrpm-head.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jrpm-SNAPSHOT.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jsch-0.1.21.jar"/>
|
||||
<classpathentry kind="lib" path="libx/log4j-1.2.9.jar"/>
|
||||
<classpathentry kind="lib" path="libx/odf_utils_05_11_29.jar"/>
|
||||
<classpathentry kind="lib" path="libx/PDFBox-0.7.2.jar"/>
|
||||
<classpathentry kind="lib" path="libx/saaj.jar"/>
|
||||
<classpathentry kind="lib" path="libx/sbbi-upnplib-1.0.3.jar"/>
|
||||
<classpathentry kind="lib" path="libx/tm-extractors-0.4.jar"/>
|
||||
<classpathentry kind="lib" path="libx/wsdl4j.jar"/>
|
||||
<classpathentry kind="lib" path="libx/xerces.jar"/>
|
||||
<classpathentry kind="lib" path="libx/poi-3.0-alpha2-20060616.jar"/>
|
||||
<classpathentry kind="lib" path="libx/poi-scratchpad-3.0-alpha2-20060616.jar"/>
|
||||
<classpathentry kind="lib" path="libx/inetlib.jar"/>
|
||||
<classpathentry kind="lib" path="libx/gnumail.jar"/>
|
||||
<classpathentry kind="lib" path="libx/activation.jar"/>
|
||||
<classpathentry kind="lib" path="libx/webcat-0.1-swf.jar"/>
|
||||
<classpathentry kind="output" path="gen"/>
|
||||
</classpath>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry kind="src" path="source"/>
|
||||
<classpathentry excluding="htdocsdefault/|locale/|proxymsg/|yacy/|env/|env/" kind="src" output="htroot" path="htroot"/>
|
||||
<classpathentry kind="src" path="htroot/htdocsdefault"/>
|
||||
<classpathentry kind="src" path="htroot/yacy"/>
|
||||
<classpathentry kind="src" path="htroot/env"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry kind="lib" path="lib/commons-collections.jar"/>
|
||||
<classpathentry kind="lib" path="lib/commons-pool.jar"/>
|
||||
<classpathentry kind="lib" path="lib/tar.jar"/>
|
||||
<classpathentry kind="lib" path="libx/axis-ant.jar"/>
|
||||
<classpathentry kind="lib" path="libx/axis.jar"/>
|
||||
<classpathentry kind="lib" path="libx/bzip2.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-discovery.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
|
||||
<classpathentry kind="lib" path="libx/commons-logging.jar"/>
|
||||
<classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jaxrpc.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jdom.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jmimemagic-0.1.0.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jrpm-head.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jrpm-SNAPSHOT.jar"/>
|
||||
<classpathentry kind="lib" path="libx/jsch-0.1.21.jar"/>
|
||||
<classpathentry kind="lib" path="libx/log4j-1.2.9.jar"/>
|
||||
<classpathentry kind="lib" path="libx/odf_utils_05_11_29.jar"/>
|
||||
<classpathentry kind="lib" path="libx/PDFBox-0.7.2.jar"/>
|
||||
<classpathentry kind="lib" path="libx/saaj.jar"/>
|
||||
<classpathentry kind="lib" path="libx/sbbi-upnplib-1.0.3.jar"/>
|
||||
<classpathentry kind="lib" path="libx/tm-extractors-0.4.jar"/>
|
||||
<classpathentry kind="lib" path="libx/wsdl4j.jar"/>
|
||||
<classpathentry kind="lib" path="libx/xerces.jar"/>
|
||||
<classpathentry kind="lib" path="libx/poi-3.0-alpha2-20060616.jar"/>
|
||||
<classpathentry kind="lib" path="libx/poi-scratchpad-3.0-alpha2-20060616.jar"/>
|
||||
<classpathentry kind="lib" path="libx/inetlib.jar"/>
|
||||
<classpathentry kind="lib" path="libx/gnumail.jar"/>
|
||||
<classpathentry kind="lib" path="libx/activation.jar"/>
|
||||
<classpathentry kind="lib" path="libx/webcat-0.1-swf.jar"/>
|
||||
<classpathentry kind="lib" path="libx/J7Zip-modified.jar"/>
|
||||
<classpathentry kind="output" path="gen"/>
|
||||
</classpath>
|
||||
|
@ -1,17 +1,17 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<projectDescription>
|
||||
<name>trunk</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.jdt.core.javabuilder</name>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.jdt.core.javanature</nature>
|
||||
</natures>
|
||||
</projectDescription>
|
||||
<name>yacy</name>
|
||||
<comment></comment>
|
||||
<projects>
|
||||
</projects>
|
||||
<buildSpec>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.jdt.core.javabuilder</name>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.eclipse.jdt.core.javanature</nature>
|
||||
</natures>
|
||||
</projectDescription>
|
||||
|
Binary file not shown.
@ -0,0 +1,86 @@
|
||||
// ByteArrayIInStream.java
|
||||
// -------------------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// This file ist contributed by Franz Brausze
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this softare or this documentation. The usage of this software
|
||||
// is on your own risk. The installation and usage (starting/running) of this
|
||||
// software may allow other people or application to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follows this copyright notice here, but changes must not be
|
||||
// done inside the copyright notive above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
package de.anomic.plasma.parser.sevenzip;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import SevenZip.IInStream;
|
||||
|
||||
public class ByteArrayIInStream extends IInStream {
|
||||
|
||||
private class SeekableByteArrayInputStream extends ByteArrayInputStream {
|
||||
public SeekableByteArrayInputStream(byte[] buf) { super(buf); }
|
||||
public SeekableByteArrayInputStream(byte[] buf, int off, int len) { super(buf, off, len); }
|
||||
|
||||
public int getPosition() { return super.pos; }
|
||||
public void seekRelative(int offset) { seekAbsolute(super.pos + offset); }
|
||||
public void seekAbsolute(int offset) {
|
||||
if (offset > super.count)
|
||||
throw new IndexOutOfBoundsException(Integer.toString(offset));
|
||||
super.pos = offset;
|
||||
}
|
||||
}
|
||||
|
||||
private final SeekableByteArrayInputStream sbais;
|
||||
|
||||
public ByteArrayIInStream(byte[] buffer) {
|
||||
this.sbais = new SeekableByteArrayInputStream(buffer);
|
||||
}
|
||||
|
||||
public long Seek(long offset, int origin) {
|
||||
switch (origin) {
|
||||
case STREAM_SEEK_SET: this.sbais.seekAbsolute((int)offset); break;
|
||||
case STREAM_SEEK_CUR: this.sbais.seekRelative((int)offset); break;
|
||||
}
|
||||
return this.sbais.getPosition();
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
return this.sbais.read();
|
||||
}
|
||||
|
||||
public int read(byte[] b, int off, int len) throws IOException {
|
||||
return this.sbais.read(b, off, len);
|
||||
}
|
||||
}
|
@ -0,0 +1,176 @@
|
||||
// SZParserExtractCallback.java
|
||||
// -------------------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// This file ist contributed by Franz Brausze
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this softare or this documentation. The usage of this software
|
||||
// is on your own risk. The installation and usage (starting/running) of this
|
||||
// software may allow other people or application to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follows this copyright notice here, but changes must not be
|
||||
// done inside the copyright notive above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
package de.anomic.plasma.parser.sevenzip;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.plasma.parser.AbstractParser;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
import de.anomic.server.serverCachedFileOutputStream;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
|
||||
import SevenZip.ArchiveExtractCallback;
|
||||
import SevenZip.Archive.IInArchive;
|
||||
import SevenZip.Archive.SevenZipEntry;
|
||||
|
||||
// wrapper class to redirect output of standard ArchiveExtractCallback to serverLog
|
||||
// and parse the extracted content
|
||||
public class SZParserExtractCallback extends ArchiveExtractCallback {
|
||||
|
||||
private final serverLog log;
|
||||
private final long maxRamSize;
|
||||
private serverCachedFileOutputStream cfos = null;
|
||||
private final plasmaParser parser;
|
||||
private final plasmaParserDocument doc;
|
||||
private final String prefix;
|
||||
|
||||
public SZParserExtractCallback(serverLog logger, IInArchive handler,
|
||||
long maxRamSize, plasmaParserDocument doc, String prefix) {
|
||||
super.Init(handler);
|
||||
this.log = logger;
|
||||
this.maxRamSize = maxRamSize;
|
||||
this.parser = new plasmaParser();
|
||||
this.doc = doc;
|
||||
this.prefix = prefix;
|
||||
}
|
||||
|
||||
public void PrepareOperation(int arg0) {
|
||||
this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
|
||||
switch (arg0) {
|
||||
case IInArchive.NExtract_NAskMode_kExtract:
|
||||
this.log.logFine("Extracting " + this.filePath);
|
||||
break;
|
||||
case IInArchive.NExtract_NAskMode_kTest:
|
||||
this.log.logFine("Testing " + this.filePath);
|
||||
break;
|
||||
case IInArchive.NExtract_NAskMode_kSkip:
|
||||
this.log.logFine("Skipping " + this.filePath);
|
||||
break;
|
||||
};
|
||||
}
|
||||
|
||||
public void SetOperationResult(int arg0) throws IOException {
|
||||
if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
|
||||
this.NumErrors++;
|
||||
switch(arg0) {
|
||||
case IInArchive.NExtract_NOperationResult_kUnSupportedMethod:
|
||||
throw new IOException("Unsupported Method");
|
||||
case IInArchive.NExtract_NOperationResult_kCRCError:
|
||||
throw new IOException("CRC Failed");
|
||||
case IInArchive.NExtract_NOperationResult_kDataError:
|
||||
throw new IOException("Data Error");
|
||||
default:
|
||||
// throw new IOException("Unknown Error");
|
||||
}
|
||||
} else try {
|
||||
AbstractParser.checkInterruption();
|
||||
|
||||
if (this.cfos != null) {
|
||||
// parse the file
|
||||
plasmaParserDocument theDoc;
|
||||
// workaround for relative links in file, normally '#' shall be used behind the location, see
|
||||
// below for reversion of the effects
|
||||
URL url = new URL(doc.getLocation(), this.prefix + "/" + super.filePath);
|
||||
String mime = plasmaParser.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
|
||||
if (this.cfos.isFallback()) {
|
||||
theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentFile());
|
||||
} else {
|
||||
theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentBAOS());
|
||||
}
|
||||
|
||||
// revert the above workaround
|
||||
Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f);
|
||||
Iterator it = theDoc.getAnchors().entrySet().iterator();
|
||||
Map.Entry entry;
|
||||
String base = doc.getLocation().toNormalform();
|
||||
while (it.hasNext()) {
|
||||
entry = (Map.Entry)it.next();
|
||||
if (((String)entry.getKey()).startsWith(base + "/")) {
|
||||
String ref = "#" + ((String)entry.getKey()).substring(base.length() + 1);
|
||||
this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref);
|
||||
nanchors.put(base + ref, entry.getValue());
|
||||
} else {
|
||||
nanchors.put(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
theDoc.getAnchors().clear();
|
||||
theDoc.getAnchors().putAll(nanchors);
|
||||
this.doc.addSubDocument(theDoc);
|
||||
}
|
||||
} catch (ParserException e) {
|
||||
IOException ex = new IOException("error parsing extracted content of " + super.filePath + ": " + e.getMessage());
|
||||
ex.initCause(e);
|
||||
throw ex;
|
||||
} catch (InterruptedException e) {
|
||||
IOException ex = new IOException("interrupted");
|
||||
ex.initCause(e);
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
public OutputStream GetStream(int index, int askExtractMode) throws IOException {
|
||||
SevenZipEntry item = super.archiveHandler.getEntry(index);
|
||||
super.filePath = item.getName();
|
||||
try {
|
||||
AbstractParser.checkInterruption();
|
||||
} catch (InterruptedException e) {
|
||||
IOException ex = new IOException("interrupted");
|
||||
ex.initCause(e);
|
||||
throw ex;
|
||||
}
|
||||
this.cfos = (item.isDirectory()) ? null
|
||||
: new serverCachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
|
||||
return this.cfos;
|
||||
}
|
||||
|
||||
public String getCurrentFilePath() {
|
||||
return super.filePath;
|
||||
}
|
||||
}
|
@ -0,0 +1,58 @@
|
||||
<?xml version="1.0"?>
|
||||
<project name="YACY - sevenzipParser" default="dist">
|
||||
<description>
|
||||
Some classes to parse 7zip files
|
||||
</description>
|
||||
|
||||
<property name="parserShortName" value="sevenzip"/>
|
||||
<property name="parserVersion" value="0.1"/>
|
||||
|
||||
<property name="parserLongName" value="yacyContentParser_${parserShortName}"/>
|
||||
<property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/>
|
||||
|
||||
<target name="compile">
|
||||
<javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}"
|
||||
destdir="${build}" source="${javacSource}" target="${javacTarget}"
|
||||
debug="true" debuglevel="lines,vars,source">
|
||||
<classpath>
|
||||
<pathelement location="${build}" />
|
||||
|
||||
<pathelement location="${libx}/J7Zip-modified.jar" />
|
||||
</classpath>
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
|
||||
<target name="zip" depends="compile">
|
||||
<tar destfile="${parserArchive}" compression="gzip">
|
||||
<tarfileset dir="${libx}"
|
||||
includes="J7Zip-modified.jar"
|
||||
prefix="${releaseFileParentDir}/libx/"
|
||||
dirmode="755" mode="644"/>
|
||||
<tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}"
|
||||
prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}"
|
||||
dirmode="755" mode="644"/>
|
||||
<tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}"
|
||||
prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}"
|
||||
dirmode="755" mode="644"/>
|
||||
</tar>
|
||||
</target>
|
||||
|
||||
<target name="copy" depends="compile">
|
||||
<copy todir="${release}/libx/">
|
||||
<fileset dir="${libx}" includes="J7Zip-modified.jar"/>
|
||||
</copy>
|
||||
<copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
|
||||
<fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
|
||||
</copy>
|
||||
<copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}">
|
||||
<fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
|
||||
<target name="dist" depends="compile,zip" description="Compile and zip the parser"/>
|
||||
|
||||
|
||||
</project>
|
||||
|
@ -0,0 +1,146 @@
|
||||
// sevenzipParser.java
|
||||
// -------------------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// This file ist contributed by Franz Brausze
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this softare or this documentation. The usage of this software
|
||||
// is on your own risk. The installation and usage (starting/running) of this
|
||||
// software may allow other people or application to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follows this copyright notice here, but changes must not be
|
||||
// done inside the copyright notive above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
package de.anomic.plasma.parser.sevenzip;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Hashtable;
|
||||
|
||||
import SevenZip.IInStream;
|
||||
import SevenZip.MyRandomAccessFile;
|
||||
import SevenZip.Archive.SevenZip.Handler;
|
||||
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.plasma.parser.AbstractParser;
|
||||
import de.anomic.plasma.parser.Parser;
|
||||
import de.anomic.plasma.parser.ParserException;
|
||||
import de.anomic.server.serverCachedFileOutputStream;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
|
||||
public class sevenzipParser extends AbstractParser implements Parser {
|
||||
|
||||
/**
|
||||
* a list of mime types that are supported by this parser class
|
||||
* @see #getSupportedMimeTypes()
|
||||
*/
|
||||
public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();
|
||||
static {
|
||||
SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z");
|
||||
}
|
||||
|
||||
/**
|
||||
* a list of library names that are needed by this parser
|
||||
* @see Parser#getLibxDependences()
|
||||
*/
|
||||
private static final String[] LIBX_DEPENDENCIES = new String[] { "J7Zip-modified.jar" };
|
||||
|
||||
public sevenzipParser() throws IOException {
|
||||
super(LIBX_DEPENDENCIES);
|
||||
super.parserName = "7zip Archive Parser";
|
||||
}
|
||||
|
||||
public plasmaParserDocument parse(URL location, String mimeType, String charset,
|
||||
IInStream source, long maxRamSize) throws ParserException, InterruptedException {
|
||||
plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
|
||||
Handler archive;
|
||||
super.theLogger.logFine("opening 7zip archive...");
|
||||
try {
|
||||
archive = new Handler(source);
|
||||
} catch (IOException e) {
|
||||
throw new ParserException("error opening 7zip archive", location, e);
|
||||
}
|
||||
checkInterruption();
|
||||
SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
|
||||
maxRamSize, doc, location.getFile());
|
||||
super.theLogger.logFine("processing archive contents...");
|
||||
try {
|
||||
archive.Extract(null, -1, 0, aec);
|
||||
return doc;
|
||||
} catch (IOException e) {
|
||||
if (e.getCause() instanceof InterruptedException)
|
||||
throw (InterruptedException)e.getCause();
|
||||
if (e.getCause() instanceof ParserException)
|
||||
throw (ParserException)e.getCause();
|
||||
throw new ParserException(
|
||||
"error processing 7zip archive at internal file: " + aec.getCurrentFilePath(),
|
||||
location, e);
|
||||
} finally {
|
||||
try { archive.close(); } catch (IOException e) { }
|
||||
}
|
||||
}
|
||||
|
||||
public plasmaParserDocument parse(URL location, String mimeType, String charset,
|
||||
byte[] source) throws ParserException, InterruptedException {
|
||||
return parse(location, mimeType, charset, new ByteArrayIInStream(source), Parser.MAX_KEEP_IN_MEMORY_SIZE - source.length);
|
||||
}
|
||||
|
||||
public plasmaParserDocument parse(URL location, String mimeType, String charset,
|
||||
File sourceFile) throws ParserException, InterruptedException {
|
||||
try {
|
||||
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Parser.MAX_KEEP_IN_MEMORY_SIZE);
|
||||
} catch (IOException e) {
|
||||
throw new ParserException("error processing 7zip archive", location, e);
|
||||
}
|
||||
}
|
||||
|
||||
public plasmaParserDocument parse(URL location, String mimeType, String charset,
|
||||
InputStream source) throws ParserException, InterruptedException {
|
||||
try {
|
||||
serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
|
||||
serverFileUtils.copy(source, cfos);
|
||||
if (cfos.isFallback()) {
|
||||
return parse(location, mimeType, charset, cfos.getContentFile());
|
||||
} else {
|
||||
return parse(location, mimeType, charset, cfos.getContentBAOS());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new ParserException("error processing 7zip archive", location, e);
|
||||
}
|
||||
}
|
||||
|
||||
public Hashtable getSupportedMimeTypes() {
|
||||
return SUPPORTED_MIME_TYPES;
|
||||
}
|
||||
}
|
@ -0,0 +1,175 @@
|
||||
// FileFallbackByteArrayOutputStream.java
|
||||
// -------------------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// This file ist contributed by Franz Brausze
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
//
|
||||
// Using this software in any meaning (reading, learning, copying, compiling,
|
||||
// running) means that you agree that the Author(s) is (are) not responsible
|
||||
// for cost, loss of data or any harm that may be caused directly or indirectly
|
||||
// by usage of this softare or this documentation. The usage of this software
|
||||
// is on your own risk. The installation and usage (starting/running) of this
|
||||
// software may allow other people or application to access your computer and
|
||||
// any attached devices and is highly dependent on the configuration of the
|
||||
// software which must be done by the user of the software; the author(s) is
|
||||
// (are) also not responsible for proper configuration and usage of the
|
||||
// software, even if provoked by documentation provided together with
|
||||
// the software.
|
||||
//
|
||||
// Any changes to this file according to the GPL as documented in the file
|
||||
// gpl.txt aside this file in the shipment you received can be done to the
|
||||
// lines that follows this copyright notice here, but changes must not be
|
||||
// done inside the copyright notive above. A re-distribution must contain
|
||||
// the intact and unchanged copyright notice.
|
||||
// Contributions and changes to the program code must be marked as such.
|
||||
|
||||
package de.anomic.server;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
|
||||
/**
 * An output stream which keeps its content in memory as long as it does not
 * exceed a given size and transparently continues in a file ("falls back")
 * as soon as it does.
 * <p>
 * If no fallback file is given, a temporary file is created at the moment of
 * the fallback. The stream never deletes the file; whoever fetches it via
 * {@link #getContentFile()} is responsible for removing it.
 */
public class serverCachedFileOutputStream extends ByteArrayOutputStream {

    /** file used after the fallback; may be <code>null</code> until the fallback happens */
    protected File fallbackFile;
    /** maximum number of bytes kept in memory */
    protected long fallbackSize;
    /** whether access to the fallback file is buffered */
    protected boolean buffered;

    /** number of bytes written so far, regardless of where they are stored */
    protected long size = 0;
    protected boolean isFallback = false;
    /** stream to the fallback file, <code>null</code> as long as the content is held in memory */
    protected OutputStream fallback = null;

    /**
     * Creates a stream with buffered file access that uses a temporary file for the fallback.
     *
     * @param fallbackSize maximum number of bytes kept in memory
     * @throws IOException if an immediate fallback to the file fails
     */
    public serverCachedFileOutputStream(long fallbackSize) throws IOException {
        this(fallbackSize, null, true, 32);
    }

    /**
     * @param fallbackSize maximum number of bytes kept in memory
     * @param fallback     file to fall back to, <code>null</code> to use a temporary file
     * @param buffered     whether access to the fallback file shall be buffered
     * @throws IOException if an immediate fallback to the file fails
     */
    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered)
            throws IOException {
        this(fallbackSize, fallback, buffered, 32);
    }

    /**
     * @param fallbackSize maximum number of bytes kept in memory
     * @param fallback     file to fall back to, <code>null</code> to use a temporary file
     * @param buffered     whether access to the fallback file shall be buffered
     * @param size         the expected size of the content; if it exceeds
     *                     <code>fallbackSize</code> the stream falls back immediately
     * @throws IOException if an immediate fallback to the file fails
     */
    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
            long size) throws IOException {
        this.fallbackSize = fallbackSize;
        // a temporary file is only created once it is really needed, see fallback()
        this.fallbackFile = fallback;
        this.buffered = buffered;
        if (size > this.fallbackSize) fallback();
    }

    /**
     * Creates a stream which already contains the given data.
     *
     * @param fallbackSize maximum number of bytes kept in memory
     * @param fallback     file to fall back to, <code>null</code> to use a temporary file
     * @param buffered     whether access to the fallback file shall be buffered
     * @param data         the initial content; the array is used directly as
     *                     memory buffer as long as the stream has not fallen back
     * @throws IOException if an immediate fallback to the file fails
     */
    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
            byte[] data) throws IOException {
        this(fallbackSize, fallback, buffered, 0);
        this.size = data.length;
        if (this.isFallback) {
            // the memory buffer is not used anymore, the data has to go to the file
            this.fallback.write(data, 0, data.length);
        } else {
            super.buf = data;
            super.count = data.length;
            if (this.size > this.fallbackSize) fallback();
        }
    }

    /**
     * Decides where the next bytes have to be written to and performs the
     * fallback if necessary.
     *
     * @param size the total size of the content including the bytes about to be written
     * @return <code>true</code> if the bytes have to be written to the fallback
     *         stream, <code>false</code> if they still fit into memory
     * @throws RuntimeException if falling back to the file fails
     */
    protected boolean checkFallback(long size) {
        // once fallen back there is no way back: a fallback may have been
        // triggered by the expected size before the written size reached the limit
        if (this.isFallback) return true;
        if (size <= this.fallbackSize) return false;
        try {
            fallback();
            return true;
        } catch (IOException e) {
            throw new RuntimeException("error falling back to file", e);
        }
    }

    /**
     * Moves the content collected in memory to the fallback file and directs
     * all further output to it. Does nothing if the stream already has fallen
     * back.
     *
     * @throws IOException if the file cannot be created or written
     */
    public void fallback() throws IOException {
        if (this.isFallback) return;
        if (this.fallbackFile == null) {
            this.fallbackFile = File.createTempFile(
                    serverCachedFileOutputStream.class.getName(),
                    Long.toString(System.currentTimeMillis()));
        } else if (this.fallbackFile.isDirectory()) {
            throw new IOException("cannot write on a directory");
        }
        final OutputStream os = new FileOutputStream(this.fallbackFile);
        final OutputStream out = (this.buffered) ? new BufferedOutputStream(os) : os;
        try {
            // only the first 'count' bytes of the buffer are content, the rest is spare capacity
            out.write(super.buf, 0, super.count);
        } catch (IOException e) {
            try { out.close(); } catch (IOException ignored) { /* the first error is the relevant one */ }
            throw e;
        }
        // switch state only after the file has taken over the content successfully
        this.fallback = out;
        this.isFallback = true;
        super.buf = new byte[0];
        super.count = 0;
    }

    /** @return whether the content is stored in the fallback file */
    public boolean isFallback() {
        return this.isFallback;
    }

    public void write(int b) {
        if (checkFallback(++this.size)) try {
            this.fallback.write(b);
        } catch (IOException e) {
            throw new RuntimeException("error writing to fallback", e);
        } else {
            super.write(b);
        }
    }

    public void write(byte[] b, int off, int len) {
        if (checkFallback(this.size += len)) try {
            this.fallback.write(b, off, len);
        } catch (IOException e) {
            throw new RuntimeException("error writing to fallback", e);
        } else {
            super.write(b, off, len);
        }
    }

    public void close() throws IOException {
        if (this.fallback != null)
            this.fallback.close();
        super.close();
    }

    /**
     * Closes this stream and provides its content for reading.
     *
     * @return a stream over exactly the bytes written
     * @throws IOException if the fallback file cannot be closed or opened
     */
    public InputStream getContent() throws IOException {
        close();
        if (this.isFallback) {
            InputStream is = new FileInputStream(this.fallbackFile);
            return (this.buffered) ? new BufferedInputStream(is) : is;
        } else {
            return new ByteArrayInputStream(this.buf, 0, this.count);
        }
    }

    /**
     * @return the bytes written, the array has exactly the length of the
     *         content; it may be the internal buffer itself, so it must not
     *         be modified while the stream is still in use
     * @throws RuntimeException if the stream has already fallen back to the file
     */
    public byte[] getContentBAOS() {
        if (this.isFallback)
            throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file");
        // the buffer usually is larger than its content, spare the copy only if it fits exactly
        return (super.count == super.buf.length) ? super.buf : toByteArray();
    }

    /**
     * @return the file holding the content; everything written so far has
     *         been flushed to it
     * @throws RuntimeException if the stream has not fallen back yet or
     *         flushing fails
     */
    public File getContentFile() {
        if (!this.isFallback)
            throw new RuntimeException("haven't fallen back yet, fallback file has no content");
        try {
            this.fallback.flush();
        } catch (IOException e) {
            throw new RuntimeException("error writing to fallback", e);
        }
        return this.fallbackFile;
    }

    /** @return the number of bytes written to this stream */
    public long getLength() {
        return this.size;
    }
}
|
Loading…
Reference in new issue