- plasmaParserDocument can process subdocuments now (other archive-parsers may want to use this method)

- added 7zip parser - added 'text/sgml' to realtime parseable mimetypes (sometimes returned by the mime type parser) - added new cached output stream class, very suitable for parsers because of limited memory git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3740 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · 0a64047081
parent b1680ab71f
commit 0a64047081
13 changed files with 843 additions and 106 deletions
--- a/.classpath
+++ b/.classpath
@ -1,42 +1,43 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<classpath>
-	<classpathentry kind="src" path="source"/>
-	<classpathentry excluding="htdocsdefault/|locale/|proxymsg/|yacy/|env/|env/" kind="src" output="htroot" path="htroot"/>
-	<classpathentry kind="src" path="htroot/htdocsdefault"/>
-	<classpathentry kind="src" path="htroot/yacy"/>
-	<classpathentry kind="src" path="htroot/env"/>
-	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-	<classpathentry kind="lib" path="lib/commons-collections.jar"/>
-	<classpathentry kind="lib" path="lib/commons-pool.jar"/>
-	<classpathentry kind="lib" path="lib/tar.jar"/>
-	<classpathentry kind="lib" path="libx/axis-ant.jar"/>
-	<classpathentry kind="lib" path="libx/axis.jar"/>
-	<classpathentry kind="lib" path="libx/bzip2.jar"/>
-	<classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/>
-	<classpathentry kind="lib" path="libx/commons-discovery.jar"/>
-	<classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
-	<classpathentry kind="lib" path="libx/commons-logging.jar"/>
-	<classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
-	<classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
-	<classpathentry kind="lib" path="libx/jaxrpc.jar"/>
-	<classpathentry kind="lib" path="libx/jdom.jar"/>
-	<classpathentry kind="lib" path="libx/jmimemagic-0.1.0.jar"/>
-	<classpathentry kind="lib" path="libx/jrpm-head.jar"/>
-	<classpathentry kind="lib" path="libx/jrpm-SNAPSHOT.jar"/>
-	<classpathentry kind="lib" path="libx/jsch-0.1.21.jar"/>
-	<classpathentry kind="lib" path="libx/log4j-1.2.9.jar"/>
-	<classpathentry kind="lib" path="libx/odf_utils_05_11_29.jar"/>
-	<classpathentry kind="lib" path="libx/PDFBox-0.7.2.jar"/>
-	<classpathentry kind="lib" path="libx/saaj.jar"/>
-	<classpathentry kind="lib" path="libx/sbbi-upnplib-1.0.3.jar"/>
-	<classpathentry kind="lib" path="libx/tm-extractors-0.4.jar"/>
-	<classpathentry kind="lib" path="libx/wsdl4j.jar"/>
-	<classpathentry kind="lib" path="libx/xerces.jar"/>
-	<classpathentry kind="lib" path="libx/poi-3.0-alpha2-20060616.jar"/>
-	<classpathentry kind="lib" path="libx/poi-scratchpad-3.0-alpha2-20060616.jar"/>
-	<classpathentry kind="lib" path="libx/inetlib.jar"/>
-	<classpathentry kind="lib" path="libx/gnumail.jar"/>
-	<classpathentry kind="lib" path="libx/activation.jar"/>
-	<classpathentry kind="lib" path="libx/webcat-0.1-swf.jar"/>
-	<classpathentry kind="output" path="gen"/>
-</classpath>
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="source"/>
+	<!-- htroot is compiled in place; sub-trees listed in 'excluding' are handled by their own entries or are not Java sources -->
+	<classpathentry excluding="htdocsdefault/|locale/|proxymsg/|yacy/|env/" kind="src" output="htroot" path="htroot"/>
+	<classpathentry kind="src" path="htroot/htdocsdefault"/>
+	<classpathentry kind="src" path="htroot/yacy"/>
+	<classpathentry kind="src" path="htroot/env"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="lib" path="lib/commons-collections.jar"/>
+	<classpathentry kind="lib" path="lib/commons-pool.jar"/>
+	<classpathentry kind="lib" path="lib/tar.jar"/>
+	<classpathentry kind="lib" path="libx/axis-ant.jar"/>
+	<classpathentry kind="lib" path="libx/axis.jar"/>
+	<classpathentry kind="lib" path="libx/bzip2.jar"/>
+	<classpathentry kind="lib" path="libx/commons-codec-1.3.jar"/>
+	<classpathentry kind="lib" path="libx/commons-discovery.jar"/>
+	<classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
+	<classpathentry kind="lib" path="libx/commons-logging.jar"/>
+	<classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
+	<classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
+	<classpathentry kind="lib" path="libx/jaxrpc.jar"/>
+	<classpathentry kind="lib" path="libx/jdom.jar"/>
+	<classpathentry kind="lib" path="libx/jmimemagic-0.1.0.jar"/>
+	<classpathentry kind="lib" path="libx/jrpm-head.jar"/>
+	<classpathentry kind="lib" path="libx/jrpm-SNAPSHOT.jar"/>
+	<classpathentry kind="lib" path="libx/jsch-0.1.21.jar"/>
+	<classpathentry kind="lib" path="libx/log4j-1.2.9.jar"/>
+	<classpathentry kind="lib" path="libx/odf_utils_05_11_29.jar"/>
+	<classpathentry kind="lib" path="libx/PDFBox-0.7.2.jar"/>
+	<classpathentry kind="lib" path="libx/saaj.jar"/>
+	<classpathentry kind="lib" path="libx/sbbi-upnplib-1.0.3.jar"/>
+	<classpathentry kind="lib" path="libx/tm-extractors-0.4.jar"/>
+	<classpathentry kind="lib" path="libx/wsdl4j.jar"/>
+	<classpathentry kind="lib" path="libx/xerces.jar"/>
+	<classpathentry kind="lib" path="libx/poi-3.0-alpha2-20060616.jar"/>
+	<classpathentry kind="lib" path="libx/poi-scratchpad-3.0-alpha2-20060616.jar"/>
+	<classpathentry kind="lib" path="libx/inetlib.jar"/>
+	<classpathentry kind="lib" path="libx/gnumail.jar"/>
+	<classpathentry kind="lib" path="libx/activation.jar"/>
+	<classpathentry kind="lib" path="libx/webcat-0.1-swf.jar"/>
+	<classpathentry kind="lib" path="libx/J7Zip-modified.jar"/>
+	<classpathentry kind="output" path="gen"/>
+</classpath>
--- a/.project
+++ b/.project
@ -1,17 +1,17 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <projectDescription>
-<name>trunk</name>
-<comment></comment>
-<projects>
-</projects>
-<buildSpec>
-<buildCommand>
-<name>org.eclipse.jdt.core.javabuilder</name>
-<arguments>
-</arguments>
-</buildCommand>
-</buildSpec>
-<natures>
-<nature>org.eclipse.jdt.core.javanature</nature>
-</natures>
-</projectDescription> 
+	<name>yacy</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>
--- a/httpd.mime
+++ b/httpd.mime
@ -3,6 +3,7 @@
 ###

 # Extension = MIME type
+7z      = application/x-7z-compressed
 ai      = application/postscript
 aiff    = audio/x-aiff
 au      = audio/basic
--- a/libx/J7Zip-modified.jar
+++ b/libx/J7Zip-modified.jar
--- a/source/de/anomic/plasma/parser/AbstractParser.java
+++ b/source/de/anomic/plasma/parser/AbstractParser.java
@ -53,6 +53,7 @@ import java.io.IOException;
 import java.io.InputStream;

 import de.anomic.net.URL;
+import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.server.serverThread;
 import de.anomic.server.logging.serverLog;
@ -138,6 +139,39 @@ public abstract class AbstractParser implements Parser{
        return tempFile;
    }
    
+    /**
+     * Recursively parses every file below <code>dir</code> and attaches each
+     * successfully parsed file to <code>doc</code> as a sub-document.
+     * Files which cannot be parsed (a {@link ParserException} is raised for them)
+     * are logged and skipped, they do not abort the traversal.
+     * 
+     * @param location the URL the generated sub-document URLs are resolved against
+     *        (the log output treats it as the location of the archive)
+     * @param prefix path segment put in front of the generated sub-document URLs
+     * @param dir the directory to traverse
+     * @param doc the document receiving the parsed files as sub-documents
+     * @return the number of files which have been parsed and added to <code>doc</code>
+     * @throws ParserException if <code>dir</code> is not a directory
+     * @throws InterruptedException if the interruption check fails
+     * @throws IOException if the content of a directory cannot be listed or another
+     *         I/O related error occurs while processing a file
+     */
+    public int parseDir(URL location, String prefix, File dir, plasmaParserDocument doc)
+            throws ParserException, InterruptedException, IOException {
+        if (!dir.isDirectory())
+            throw new ParserException("tried to parse ordinary file " + dir + " as directory", location);
+        
+        // File.list() returns null (not an empty array) if the directory cannot be read
+        String[] files = dir.list();
+        if (files == null)
+            throw new IOException("unable to list contents of directory " + dir);
+        
+        int result = 0;
+        for (int i=0; i<files.length; i++) {
+            checkInterruption();
+            File file = new File(dir, files[i]);
+            this.theLogger.logFine("parsing file " + location + "#" + file + " in archive...");
+            if (file.isDirectory()) {
+                result += parseDir(location, prefix, file, doc);
+            } else try {
+                URL url = new URL(location, "/" + prefix + "/"
+                        // XXX: workaround for relative paths within document
+                        + file.getPath().substring(file.getPath().indexOf(File.separatorChar) + 1)
+                        + "/" + file.getName());
+                // the extension is the part behind the LAST dot ("a.tar.gz" -> "gz"),
+                // this is the same rule the 7zip extract callback applies
+                String fileExt = files[i].substring(files[i].lastIndexOf('.') + 1);
+                plasmaParserDocument subdoc = new plasmaParser().parseSource(
+                        url,
+                        plasmaParser.getMimeTypeByFileExt(fileExt),
+                        null, file);
+                // TODO: change anchors back to use '#' after archive name
+                doc.addSubDocument(subdoc);
+                subdoc.close();
+                result++;
+            } catch (ParserException e) {
+                this.theLogger.logInfo("unable to parse file " + file + " in " + location + ", skipping");
+            }
+        }
+        return result;
+    }
+    
 	/**
 	 * Parsing a document available as byte array.
     * @param location the origin of the document 
--- a/source/de/anomic/plasma/parser/sevenzip/ByteArrayIInStream.java
+++ b/source/de/anomic/plasma/parser/sevenzip/ByteArrayIInStream.java
@ -0,0 +1,86 @@
+// ByteArrayIInStream.java 
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004
+// 
+// This file ist contributed by Franz Brausze
+// 
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// 
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+// 
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.plasma.parser.sevenzip;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+import SevenZip.IInStream;
+
+/**
+ * An {@link IInStream} which reads from a byte array held in memory and
+ * supports seeking to absolute positions and relative to the current position.
+ */
+public class ByteArrayIInStream extends IInStream {
+    
+    /**
+     * A {@link ByteArrayInputStream} whose read position can be queried and set.
+     * The helper does not need access to the enclosing instance, hence it is static.
+     */
+    private static class SeekableByteArrayInputStream extends ByteArrayInputStream {
+        public SeekableByteArrayInputStream(byte[] buf) { super(buf); }
+        public SeekableByteArrayInputStream(byte[] buf, int off, int len) { super(buf, off, len); }
+        
+        /** @return the index of the next byte to be read */
+        public int getPosition() { return super.pos; }
+        
+        /** Moves the read position by <code>offset</code> bytes, see {@link #seekAbsolute(long)}. */
+        public void seekRelative(long offset) { seekAbsolute((long)super.pos + offset); }
+        
+        /**
+         * Sets the read position.
+         * @param offset the new position, must lie within <code>0..count</code>
+         * @throws IndexOutOfBoundsException if the position is negative or behind the end
+         *         of the data; the check is done on the long value, so offsets which do
+         *         not fit into an int are rejected instead of being truncated
+         */
+        public void seekAbsolute(long offset) {
+            if (offset < 0 || offset > super.count)
+                throw new IndexOutOfBoundsException(Long.toString(offset));
+            super.pos = (int)offset;
+        }
+    }
+    
+    private final SeekableByteArrayInputStream sbais;
+    
+    /**
+     * @param buffer the data this stream reads from, the array is not copied
+     */
+    public ByteArrayIInStream(byte[] buffer) {
+        this.sbais = new SeekableByteArrayInputStream(buffer);
+    }
+    
+    /**
+     * Moves the read position of this stream.
+     * @param offset the offset to seek to, interpreted according to <code>origin</code>
+     * @param origin either <code>STREAM_SEEK_SET</code> (absolute position) or
+     *        <code>STREAM_SEEK_CUR</code> (relative to the current position);
+     *        any other value leaves the position unchanged
+     * @return the read position after the seek operation
+     * @throws IndexOutOfBoundsException if the resulting position lies outside of the data
+     */
+    public long Seek(long offset, int origin) {
+        switch (origin) {
+            case STREAM_SEEK_SET: this.sbais.seekAbsolute(offset); break;
+            case STREAM_SEEK_CUR: this.sbais.seekRelative(offset); break;
+        }
+        return this.sbais.getPosition();
+    }
+    
+    public int read() throws IOException {
+        return this.sbais.read();
+    }
+    
+    public int read(byte[] b, int off, int len) throws IOException {
+        return this.sbais.read(b, off, len);
+    }
+}
--- a/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java
+++ b/source/de/anomic/plasma/parser/sevenzip/SZParserExtractCallback.java
@ -0,0 +1,176 @@
+// SZParserExtractCallback.java 
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004
+// 
+// This file ist contributed by Franz Brausze
+// 
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// 
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+// 
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.plasma.parser.sevenzip;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+import de.anomic.net.URL;
+import de.anomic.plasma.plasmaParser;
+import de.anomic.plasma.plasmaParserDocument;
+import de.anomic.plasma.parser.AbstractParser;
+import de.anomic.plasma.parser.ParserException;
+import de.anomic.server.serverCachedFileOutputStream;
+import de.anomic.server.logging.serverLog;
+
+import SevenZip.ArchiveExtractCallback;
+import SevenZip.Archive.IInArchive;
+import SevenZip.Archive.SevenZipEntry;
+
+/**
+ * Wrapper class to redirect the output of the standard ArchiveExtractCallback to
+ * a serverLog and to parse the extracted content.
+ * Every extracted non-directory entry is written into a
+ * serverCachedFileOutputStream, parsed after the extraction of the entry has been
+ * reported as successful and added as sub-document to the document passed to the
+ * constructor.
+ */
+public class SZParserExtractCallback extends ArchiveExtractCallback {
+    
+    private final serverLog log;
+    // passed on to every serverCachedFileOutputStream created in GetStream()
+    private final long maxRamSize;
+    // receives the content of the entry currently extracted, null for directories
+    private serverCachedFileOutputStream cfos = null;
+    private final plasmaParser parser;
+    // the document all parsed entries are added to as sub-documents
+    private final plasmaParserDocument doc;
+    // put in front of the entry's path when the URL of a sub-document is built
+    private final String prefix;
+    
+    /**
+     * @param logger the log all messages of this callback are written to
+     * @param handler the archive handler, passed to Init() of the super class
+     * @param maxRamSize passed to the streams buffering the extracted content
+     *        (presumably the maximum size kept in memory - TODO confirm against
+     *        serverCachedFileOutputStream)
+     * @param doc the document receiving the parsed entries as sub-documents
+     * @param prefix path put in front of the entry names in the sub-document URLs
+     */
+    public SZParserExtractCallback(serverLog logger, IInArchive handler,
+            long maxRamSize, plasmaParserDocument doc, String prefix) {
+        super.Init(handler);
+        this.log = logger;
+        this.maxRamSize = maxRamSize;
+        this.parser = new plasmaParser();
+        this.doc = doc;
+        this.prefix = prefix;
+    }
+    
+    /**
+     * Remembers whether the current operation is an extraction and logs the
+     * operation together with the path of the current entry.
+     * @param arg0 the ask mode, one of the IInArchive.NExtract_NAskMode_* constants
+     */
+    public void PrepareOperation(int arg0) {
+        this.extractMode = (arg0 == IInArchive.NExtract_NAskMode_kExtract);
+        switch (arg0) {
+            case IInArchive.NExtract_NAskMode_kExtract:
+                this.log.logFine("Extracting " + this.filePath);
+                break;
+            case IInArchive.NExtract_NAskMode_kTest:
+                this.log.logFine("Testing " + this.filePath);
+                break;
+            case IInArchive.NExtract_NAskMode_kSkip:
+                this.log.logFine("Skipping " + this.filePath);
+                break;
+        };
+    }
+
+    /**
+     * Handles the result of the operation on the current entry. If the operation
+     * succeeded and a stream has been created for the entry in GetStream(), the
+     * buffered content is parsed and added as sub-document.
+     * @param arg0 the result, one of the IInArchive.NExtract_NOperationResult_* constants
+     * @throws IOException if the archive reported an unsupported method, a CRC or a
+     *         data error; also used to wrap a ParserException or InterruptedException
+     *         (available as cause) raised while parsing the extracted content
+     */
+    public void SetOperationResult(int arg0) throws IOException {
+        if (arg0 != IInArchive.NExtract_NOperationResult_kOK) {
+            this.NumErrors++;
+            switch(arg0) {
+                case IInArchive.NExtract_NOperationResult_kUnSupportedMethod:
+                    throw new IOException("Unsupported Method");
+                case IInArchive.NExtract_NOperationResult_kCRCError:
+                    throw new IOException("CRC Failed");
+                case IInArchive.NExtract_NOperationResult_kDataError:
+                    throw new IOException("Data Error");
+                default:
+                    // any other error code is only counted in NumErrors, no exception is thrown
+                    // throw new IOException("Unknown Error");
+            }
+        } else try {
+            AbstractParser.checkInterruption();
+            
+            // cfos is null if the current entry is a directory, see GetStream()
+            if (this.cfos != null) {
+                // parse the file
+                plasmaParserDocument theDoc;
+                // workaround for relative links in file, normally '#' shall be used behind the location, see
+                // below for reversion of the effects
+                URL url = new URL(doc.getLocation(), this.prefix + "/" + super.filePath);
+                // the mime type is derived from the part of the entry's path behind the last dot
+                String mime = plasmaParser.getMimeTypeByFileExt(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
+                // NOTE(review): isFallback() presumably tells that the content has been
+                // written to a file instead of being kept in memory - confirm
+                if (this.cfos.isFallback()) {
+                    theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentFile());
+                } else {
+                    theDoc = this.parser.parseSource(url, mime, null, this.cfos.getContentBAOS());
+                }
+                
+                // revert the above workaround
+                // anchors starting with "<archive location>/" are rewritten to "<archive location>#",
+                // all other anchors are copied unchanged
+                Map nanchors = new HashMap(theDoc.getAnchors().size(), 1f);
+                Iterator it = theDoc.getAnchors().entrySet().iterator();
+                Map.Entry entry;
+                String base = doc.getLocation().toNormalform();
+                while (it.hasNext()) {
+                    entry = (Map.Entry)it.next();
+                    if (((String)entry.getKey()).startsWith(base + "/")) {
+                        String ref = "#" + ((String)entry.getKey()).substring(base.length() + 1);
+                        this.log.logFinest("changing " + entry.getKey() + " to use reference " + ref);
+                        nanchors.put(base + ref, entry.getValue());
+                    } else {
+                        nanchors.put(entry.getKey(), entry.getValue());
+                    }
+                }
+                // the anchor map of the sub-document is replaced in place by the rewritten entries
+                theDoc.getAnchors().clear();
+                theDoc.getAnchors().putAll(nanchors);
+                this.doc.addSubDocument(theDoc);
+                // NOTE(review): cfos is not closed or released in this method - verify
+                // that the stream and a possibly created file are cleaned up elsewhere
+            }
+        } catch (ParserException e) {
+            IOException ex = new IOException("error parsing extracted content of " + super.filePath + ": " + e.getMessage());
+            ex.initCause(e);
+            throw ex;
+        } catch (InterruptedException e) {
+            IOException ex = new IOException("interrupted");
+            ex.initCause(e);
+            throw ex;
+        }
+    }
+    
+    /**
+     * Provides the stream the content of the archive entry is written to and
+     * remembers the path of the entry.
+     * @param index the index of the entry within the archive
+     * @param askExtractMode not evaluated by this implementation
+     * @return a new stream buffering the content of the entry or
+     *         <code>null</code> if the entry is a directory
+     * @throws IOException with an InterruptedException as cause if the
+     *         interruption check fails
+     */
+    public OutputStream GetStream(int index, int askExtractMode) throws IOException {
+        SevenZipEntry item = super.archiveHandler.getEntry(index);
+        super.filePath = item.getName();
+        try {
+            AbstractParser.checkInterruption();
+        } catch (InterruptedException e) {
+            IOException ex = new IOException("interrupted");
+            ex.initCause(e);
+            throw ex;
+        }
+        this.cfos = (item.isDirectory()) ? null
+                : new serverCachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
+        return this.cfos;
+    }
+    
+    /**
+     * @return the path of the archive entry set by the last call of GetStream()
+     */
+    public String getCurrentFilePath() {
+        return super.filePath;
+    }
+}
--- a/source/de/anomic/plasma/parser/sevenzip/build.xml
+++ b/source/de/anomic/plasma/parser/sevenzip/build.xml
@ -0,0 +1,58 @@
+<?xml version="1.0"?>
+<!--
+    Ant build file of the 7zip content parser.
+    The properties src, build, libx, release, releaseFileParentDir, javacSource
+    and javacTarget are not defined in this file, they have to be provided by
+    the caller of this build file.
+-->
+<project name="YACY - sevenzipParser" default="dist">
+    <description>
+            Some classes to parse 7zip files
+    </description>
+
+    <!-- name and version of the parser, used for the source path and the archive name -->
+    <property name="parserShortName" value="sevenzip"/>
+    <property name="parserVersion" value="0.1"/>
+
+    <property name="parserLongName" value="yacyContentParser_${parserShortName}"/>
+    <property name="parserArchive" location="${release}/${parserLongName}_${parserVersion}.tgz"/>
+
+    <!-- compiles the parser classes against the already built classes and the J7Zip library -->
+    <target name="compile">
+        <javac srcdir="${src}/de/anomic/plasma/parser/${parserShortName}"
+               destdir="${build}" source="${javacSource}" target="${javacTarget}"
+               debug="true" debuglevel="lines,vars,source">
+            <classpath>
+                <pathelement location="${build}"/>
+
+                <pathelement location="${libx}/J7Zip-modified.jar"/>
+            </classpath>
+        </javac>
+    </target>
+
+    <!-- packs the library, the sources and the classes of the parser into a gzipped tar archive -->
+    <target name="zip" depends="compile">
+        <tar destfile="${parserArchive}" compression="gzip">
+            <tarfileset dir="${libx}"
+                        includes="J7Zip-modified.jar"
+                        prefix="${releaseFileParentDir}/libx/"
+                        dirmode="755" mode="644"/>
+            <tarfileset dir="${src}/de/anomic/plasma/parser/${parserShortName}"
+                        prefix="${releaseFileParentDir}/source/de/anomic/plasma/parser/${parserShortName}"
+                        dirmode="755" mode="644"/>
+            <tarfileset dir="${build}/de/anomic/plasma/parser/${parserShortName}"
+                        prefix="${releaseFileParentDir}/classes/de/anomic/plasma/parser/${parserShortName}"
+                        dirmode="755" mode="644"/>
+        </tar>
+    </target>
+
+    <!-- copies the library, the sources and the classes of the parser into the release directory -->
+    <target name="copy" depends="compile">
+        <copy todir="${release}/libx/">
+            <fileset dir="${libx}" includes="J7Zip-modified.jar"/>
+        </copy>
+        <copy todir="${release}/source/de/anomic/plasma/parser/${parserShortName}">
+            <fileset dir="${src}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
+        </copy>
+        <copy todir="${release}/classes/de/anomic/plasma/parser/${parserShortName}">
+            <fileset dir="${build}/de/anomic/plasma/parser/${parserShortName}" includes="**/*"/>
+        </copy>
+    </target>
+
+    <target name="dist" depends="compile,zip" description="Compile and zip the parser"/>
+
+</project>
+
--- a/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
+++ b/source/de/anomic/plasma/parser/sevenzip/sevenzipParser.java
@ -0,0 +1,146 @@
+// sevenzipParser.java 
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004
+// 
+// This file ist contributed by Franz Brausze
+// 
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// 
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+// 
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.plasma.parser.sevenzip;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Hashtable;
+
+import SevenZip.IInStream;
+import SevenZip.MyRandomAccessFile;
+import SevenZip.Archive.SevenZip.Handler;
+
+import de.anomic.net.URL;
+import de.anomic.plasma.plasmaParserDocument;
+import de.anomic.plasma.parser.AbstractParser;
+import de.anomic.plasma.parser.Parser;
+import de.anomic.plasma.parser.ParserException;
+import de.anomic.server.serverCachedFileOutputStream;
+import de.anomic.server.serverFileUtils;
+
+/**
+ * Parser for 7zip archives. The archive is opened using the J7Zip library, every
+ * contained file is extracted and parsed by a {@link SZParserExtractCallback}
+ * and added as sub-document to the document returned for the archive.
+ */
+public class sevenzipParser extends AbstractParser implements Parser {
+    
+    /**
+     * a list of mime types that are supported by this parser class
+     * @see #getSupportedMimeTypes()
+     */    
+    public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();    
+    static { 
+        SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z"); 
+    } 
+    
+    /**
+     * a list of library names that are needed by this parser
+     * @see Parser#getLibxDependences()
+     */
+    private static final String[] LIBX_DEPENDENCIES = new String[] { "J7Zip-modified.jar" };
+    
+    public sevenzipParser() throws IOException {
+        super(LIBX_DEPENDENCIES);
+        super.parserName = "7zip Archive Parser";
+    }
+    
+    /**
+     * Parses a 7zip archive available as seekable stream.
+     * @param location the origin of the archive
+     * @param mimeType the mime type of the archive
+     * @param charset the charset passed on to the created document
+     * @param source the stream the archive is read from
+     * @param maxRamSize passed on to the extract callback which uses it for the
+     *        streams buffering the extracted files
+     * @return the document containing the parsed files of the archive as sub-documents
+     * @throws ParserException if the archive cannot be opened or processed
+     * @throws InterruptedException if the parsing has been interrupted
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            IInStream source, long maxRamSize) throws ParserException, InterruptedException {
+        plasmaParserDocument doc = new plasmaParserDocument(location, mimeType, charset);
+        Handler archive;
+        super.theLogger.logFine("opening 7zip archive...");
+        try {
+            archive = new Handler(source);
+        } catch (IOException e) {
+            throw new ParserException("error opening 7zip archive", location, e);
+        }
+        checkInterruption();
+        SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
+                maxRamSize, doc, location.getFile());
+        super.theLogger.logFine("processing archive contents...");
+        try {
+            archive.Extract(null, -1, 0, aec);
+            return doc;   
+        } catch (IOException e) {
+            // the callback wraps its own exceptions into IOExceptions, unwrap them again
+            if (e.getCause() instanceof InterruptedException)
+                throw (InterruptedException)e.getCause();
+            if (e.getCause() instanceof ParserException)
+                throw (ParserException)e.getCause();
+            throw new ParserException(
+                    "error processing 7zip archive at internal file: " + aec.getCurrentFilePath(),
+                    location, e);
+        } finally {
+            try {
+                archive.close();
+            } catch (IOException e) {
+                // closing is done on a best-effort basis and must not hide the result or
+                // the exception of the extraction, but the failure shall leave a trace
+                super.theLogger.logFine("error closing 7zip archive " + location + ": " + e.getMessage());
+            }
+        }
+    }
+    
+    /**
+     * Parses a 7zip archive available as byte array. The size of the array is
+     * subtracted from the amount of memory the extracted files may use.
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            byte[] source) throws ParserException, InterruptedException {
+        return parse(location, mimeType, charset, new ByteArrayIInStream(source), Parser.MAX_KEEP_IN_MEMORY_SIZE - source.length);
+    }
+    
+    /**
+     * Parses a 7zip archive available as file, the file is opened read-only.
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            File sourceFile) throws ParserException, InterruptedException {
+        try {
+            return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Parser.MAX_KEEP_IN_MEMORY_SIZE);
+        } catch (IOException e) {
+            throw new ParserException("error processing 7zip archive", location, e);
+        }
+    }
+    
+    /**
+     * Parses a 7zip archive available as (non-seekable) stream. The stream is
+     * copied into a serverCachedFileOutputStream first, afterwards the buffered
+     * content is parsed either as file or as byte array.
+     */
+    public plasmaParserDocument parse(URL location, String mimeType, String charset,
+            InputStream source) throws ParserException, InterruptedException {
+        try {
+            serverCachedFileOutputStream cfos = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
+            serverFileUtils.copy(source, cfos);
+            if (cfos.isFallback()) {
+                return parse(location, mimeType, charset, cfos.getContentFile());
+            } else {
+                return parse(location, mimeType, charset, cfos.getContentBAOS());
+            }
+        } catch (IOException e) {
+            throw new ParserException("error processing 7zip archive", location, e);
+        }
+    }
+    
+    public Hashtable getSupportedMimeTypes() {
+        return SUPPORTED_MIME_TYPES;
+    }
+}
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@ -941,7 +941,7 @@ public final class plasmaParser {
            plasmaParser theParser = new plasmaParser();
            
            // configuring the realtime parsable mimeTypes
-            plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
+            plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml");
            
            // configure all other supported mimeTypes
            plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@ -46,28 +46,35 @@ import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
+
+import de.anomic.server.serverCachedFileOutputStream;
 import de.anomic.server.serverFileUtils;

+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
 import java.util.Map;
 import java.util.TreeSet;

 import de.anomic.htmlFilter.htmlFilterImageEntry;
 import de.anomic.net.URL;
+import de.anomic.plasma.parser.Parser;

 public class plasmaParserDocument {
    
    private URL location;       // the source url
    private String mimeType;    // mimeType as taken from http header
    private String charset;     // the charset of the document
-    private String[] keywords;  // most resources provide a keyword field
-    private String title;       // a document title, taken from title or h1 tag; shall appear as headline of search result
-    private String author;      // author or copyright
-    private String[] sections;  // if present: more titles/headlines appearing in the document
-    private String abstrct;     // an abstract, if present: short content description
+    private List keywords;  // most resources provide a keyword field
+    private StringBuffer title;       // a document title, taken from title or h1 tag; shall appear as headline of search result
+    private StringBuffer author;      // author or copyright
+    private List sections;  // if present: more titles/headlines appearing in the document
+    private StringBuffer abstrct;     // an abstract, if present: short content description
    private Object text;  // the clear text, all that is visible
    private Map anchors;        // all links embedded as clickeable entities (anchor tags)
    private TreeSet images;     // all visible pictures in document
@ -77,54 +84,63 @@ public class plasmaParserDocument {
    private Map hyperlinks, audiolinks, videolinks, applinks;
    private Map emaillinks;
    private boolean resorted;
-    private InputStream textStream; 
-                    
-    public plasmaParserDocument(URL location, String mimeType, String charset,
+    private InputStream textStream;
+    
+    /**
+     * Creates a parser document; the public constructors delegate to this one.
+     * Metadata passed as null is replaced by empty defaults. Keywords and
+     * sections are copied into growable lists because addSubDocument() appends
+     * to them, and Arrays.asList() on its own yields a fixed-size list.
+     *
+     * @param text the document text; the public constructors pass a byte[], a
+     *             File or a serverCachedFileOutputStream. If null, an empty
+     *             serverCachedFileOutputStream is created.
+     */
+    protected plasmaParserDocument(URL location, String mimeType, String charset,
                     String[] keywords, String title, String author,
                     String[] sections, String abstrct,
-                    byte[] text, Map anchors, TreeSet images) {
+                    Object text, Map anchors, TreeSet images) {
         this.location = location;
-        this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
+        this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
         this.charset = charset;
-        this.keywords = (keywords==null) ? new String[0] : keywords;
-        this.title = (title==null)?"":title;
-        this.author = (author==null)?"":author;
-        this.sections = (sections==null)?new String[0]:sections;
-        this.abstrct = (abstrct==null)?"":abstrct;
-        this.text = (text==null)?new byte[0]:text;
-        this.anchors = (anchors==null)?new HashMap(0):anchors;
-        this.images = (images==null)?new TreeSet():images;
+        // copy into a LinkedList: the list returned by Arrays.asList() rejects add/addAll
+        this.keywords = (keywords == null) ? new LinkedList() : new LinkedList(Arrays.asList(keywords));
+        this.title = (title == null) ? new StringBuffer() : new StringBuffer(title);
+        this.author = (author == null) ? new StringBuffer() : new StringBuffer(author);
+        this.sections = (sections == null) ? new LinkedList() : new LinkedList(Arrays.asList(sections));
+        this.abstrct = (abstrct == null) ? new StringBuffer() : new StringBuffer(abstrct);
+        this.anchors = (anchors == null) ? new HashMap(0) : anchors;
+        this.images = (images == null) ? new TreeSet() : images;
         this.hyperlinks = null;
         this.audiolinks = null;
         this.videolinks = null;
         this.applinks = null;
         this.emaillinks = null;
         this.resorted = false;
+
+        if (text == null) try {
+            this.text = new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
+        } catch (IOException e) {
+            e.printStackTrace();
+            // fall back to an empty byte[]: unlike a StringBuffer, that is a
+            // representation the text accessors of this class know how to read
+            this.text = new byte[0];
+        } else {
+            this.text = text;
+        }
+    }
+    
+    /**
+     * Creates a document that carries only location, mime type and charset.
+     * Every other argument is handed to the protected constructor as null.
+     */
+    public plasmaParserDocument(URL location, String mimeType, String charset) {
+        this(location, mimeType, charset, null, null, null, null, null, (Object)null, null, null);
+    }
+    
+    /**
+     * Creates a document whose text is given as a byte array.
+     * Delegates to the protected constructor, passing the array as Object.
+     */
+    public plasmaParserDocument(URL location, String mimeType, String charset,
+                    String[] keywords, String title, String author,
+                    String[] sections, String abstrct,
+                    byte[] text, Map anchors, TreeSet images) {
+        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
     }
    
+    /**
+     * Creates a document whose text is stored in a file.
+     * Delegates to the protected constructor, passing the file as Object.
+     */
     public plasmaParserDocument(URL location, String mimeType, String charset,
             String[] keywords, String title, String author,
             String[] sections, String abstrct,
             File text, Map anchors, TreeSet images) {
-        this.location = location;
-        this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
-        this.charset = charset;
-        this.keywords = (keywords==null) ? new String[0] : keywords;
-        this.title = (title==null)?"":title;
-        this.author = (author==null)?"":author;
-        this.sections = (sections==null)?new String[0]:sections;
-        this.abstrct = (abstrct==null)?"":abstrct;
-        this.text = text;
-        if (text != null) text.deleteOnExit();
-        this.anchors = (anchors==null)?new HashMap(0):anchors;
-        this.images = (images==null)?new TreeSet():images;
-        this.hyperlinks = null;
-        this.audiolinks = null;
-        this.videolinks = null;
-        this.applinks = null;
-        this.emaillinks = null;
-        this.resorted = false;
-    }    
+        // NOTE(review): the constructor body removed above called text.deleteOnExit();
+        // this delegating version does not - confirm the file is cleaned up elsewhere
+        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+    }
+    
+    /**
+     * Creates a document whose text has been written to a
+     * serverCachedFileOutputStream.
+     * Delegates to the protected constructor, passing the stream as Object.
+     */
+    public plasmaParserDocument(URL location, String mimeType, String charset,
+            String[] keywords, String title, String author,
+            String[] sections, String abstrct,
+            serverCachedFileOutputStream text, Map anchors, TreeSet images) {
+        this(location, mimeType, charset, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
+    }

    public URL getLocation() {
        return this.location;
@ -142,19 +158,23 @@ public class plasmaParserDocument {
    }
    
     public String getTitle() {
-        return title;
+        // the title is held in a StringBuffer; return its current content as a String
+        return title.toString();
     }
    
    public String[] getSectionTitles() {
-        if (sections != null) return sections; else return new String[]{getTitle()};
+        if (sections != null) {
+            return (String[])sections.toArray(new String[this.sections.size()]);
+        } else {
+            return new String[] { getTitle() };
+        }
    }

    public String getAbstract() {
-        if (abstrct != null) return abstrct; else return getTitle();
+        if (abstrct != null) return abstrct.toString(); else return getTitle();
    }
    
    public String getAuthor() {
-        if (author != null) return author; else return "";
+        if (author != null) return author.toString(); else return new String();
    }
    
    public InputStream getText() {
@ -165,6 +185,8 @@ public class plasmaParserDocument {
                this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
            } else if (this.text instanceof byte[]) {
                this.textStream =  new ByteArrayInputStream((byte[])this.text);
+            } else if (this.text instanceof serverCachedFileOutputStream) {
+                return ((serverCachedFileOutputStream)this.text).getContent();
            }
            return this.textStream;
        } catch (Exception e) {
@ -177,8 +199,18 @@ public class plasmaParserDocument {
        try {
            if (this.text == null) return new byte[0];

-            if (this.text instanceof File) return serverFileUtils.read((File)this.text);
-            else if (this.text instanceof byte[]) return (byte[])this.text;
+            if (this.text instanceof File) {
+                return serverFileUtils.read((File)this.text);
+            } else if (this.text instanceof byte[]) {
+                return (byte[])this.text;
+            } else if (this.text instanceof serverCachedFileOutputStream) {
+                serverCachedFileOutputStream ffbaos = (serverCachedFileOutputStream)this.text;
+                if (ffbaos.isFallback()) {
+                    return serverFileUtils.read(ffbaos.getContent());
+                } else {
+                    return ffbaos.getContentBAOS();
+                }
+            }
        } catch (Exception e) {
            e.printStackTrace();
        }
@ -189,6 +221,9 @@ public class plasmaParserDocument {
        if (this.text == null) return 0;
        if (this.text instanceof File) return ((File)this.text).length();
        else if (this.text instanceof byte[]) return ((byte[])this.text).length;
+        else if (this.text instanceof serverCachedFileOutputStream) {
+            return ((serverCachedFileOutputStream)this.text).getLength();
+        }
        
        return -1; 
    }
@ -204,19 +239,23 @@ public class plasmaParserDocument {
        // sort out doubles and empty words
        TreeSet hs = new TreeSet();
        String s;
-        for (int i = 0; i < this.keywords.length; i++) {
-            if (this.keywords[i] == null) continue;
-            s = this.keywords[i].trim();
+        for (int i = 0; i < this.keywords.size(); i++) {
+            if (this.keywords.get(i) == null) continue;
+            s = ((String)this.keywords.get(i)).trim();
            if (s.length() > 0) hs.add(s.toLowerCase());
        }
        if (hs.size() == 0) return "";
        // generate a new list
-        StringBuffer sb = new StringBuffer(this.keywords.length * 6);
+        StringBuffer sb = new StringBuffer(this.keywords.size() * 6);
        Iterator i = hs.iterator();
        while (i.hasNext()) sb.append((String) i.next()).append(separator);
        return sb.substring(0, sb.length() - 1);
    }
    
+    /**
+     * Returns the keyword list of this document.
+     * This is the internal list instance, not a copy.
+     */
+    public List getKeywords() {
+        return this.keywords;
+    }
+    
    public Map getAnchors() {
        // returns all links embedded as anchors (clickeable entities)
        // this is a url(String)/text(String) map
@ -335,6 +374,27 @@ public class plasmaParserDocument {
        this.resorted = true;
    }
    
+    /**
+     * Merges the content of a sub-document into this document: section
+     * titles, keywords, anchors and images are added, title and abstract are
+     * appended on a new line, and the sub-document's text is appended to this
+     * document's text.
+     *
+     * If this document's text is not yet held in a serverCachedFileOutputStream,
+     * it is first copied into a new one so that further text can be appended.
+     *
+     * @param doc the sub-document to merge into this one
+     * @throws IOException if the stream cannot be created or text cannot be copied
+     */
+    public void addSubDocument(plasmaParserDocument doc) throws IOException {
+        this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
+
+        if (this.title.length() > 0) this.title.append('\n');
+        this.title.append(doc.getTitle());
+
+        this.keywords.addAll(doc.getKeywords());
+
+        if (this.abstrct.length() > 0) this.abstrct.append('\n');
+        this.abstrct.append(doc.getAbstract());
+
+        if (!(this.text instanceof serverCachedFileOutputStream)) {
+            // fetch the current text BEFORE replacing this.text: getText() reads
+            // from this.text, so asking afterwards would return the content of
+            // the new, still empty stream and the existing text would be lost
+            final InputStream ownText = getText();
+            final serverCachedFileOutputStream merged =
+                    new serverCachedFileOutputStream(Parser.MAX_KEEP_IN_MEMORY_SIZE);
+            // NOTE(review): getText() appears able to return null - confirm; guarded here
+            if (ownText != null) serverFileUtils.copy(ownText, merged);
+            this.text = merged;
+        }
+        final InputStream subText = doc.getText();
+        if (subText != null) serverFileUtils.copy(subText, (serverCachedFileOutputStream)this.text);
+
+        anchors.putAll(doc.getAnchors());
+        images.addAll(doc.getImages());
+    }
+    
    public void close() {
        // try close the output stream
        if (this.textStream != null) {
--- a/source/de/anomic/server/serverCachedFileOutputStream.java
+++ b/source/de/anomic/server/serverCachedFileOutputStream.java
@ -0,0 +1,175 @@
+// FileFallbackByteArrayOutputStream.java 
+// -------------------------------------
+// part of YACY
+// (C) by Michael Peter Christen; mc@anomic.de
+// first published on http://www.anomic.de
+// Frankfurt, Germany, 2004
+// 
+// This file ist contributed by Franz Brausze
+// 
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+// 
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+// 
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+package de.anomic.server;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+public class serverCachedFileOutputStream extends ByteArrayOutputStream {
+    
+    protected File fallbackFile;
+    protected long fallbackSize;
+    protected boolean buffered;
+    
+    protected long size = 0;
+    protected boolean isFallback = false;
+    protected OutputStream fallback = null;
+    
+    public serverCachedFileOutputStream(long fallbackSize) throws IOException {
+        this(fallbackSize, null, true, 32);
+    }
+    
+    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered)
+            throws IOException {
+        this(fallbackSize, fallback, buffered, 32);
+    }
+    
+    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
+            long size) throws IOException {
+        this.fallbackSize = fallbackSize;
+        this.fallbackFile = (fallback == null) ? File.createTempFile(
+                serverCachedFileOutputStream.class.getName(),
+                Long.toString(System.currentTimeMillis())) : fallback;
+        this.buffered = buffered;
+        checkFallback(size);
+    }
+    
+    public serverCachedFileOutputStream(long fallbackSize, File fallback, boolean buffered,
+            byte[] data) throws IOException {
+        this(fallbackSize, fallback, buffered, 0);
+        super.buf = data;
+        super.count = data.length;
+        checkFallback(this.size = data.length);
+    }
+    
+    protected boolean checkFallback(long size) {
+        if (size > this.fallbackSize) try {
+            fallback();
+            return true;
+        } catch (IOException e) {
+            throw new RuntimeException("error falling back to file", e);
+        } else {
+            return false;
+        }
+    }
+    
+    public void fallback() throws IOException {
+        if (this.isFallback) return;
+        this.isFallback = true;
+        if (!this.fallbackFile.exists()) {
+            this.fallbackFile.createNewFile();
+        } else if (this.fallbackFile.isDirectory()) {
+            throw new IOException("cannot write on a directory");
+        }
+        OutputStream os = new FileOutputStream(this.fallbackFile);
+        this.fallback = (this.buffered) ? new BufferedOutputStream(os) : os;
+        serverFileUtils.copy(new ByteArrayInputStream(super.buf), this.fallback);
+        super.buf = new byte[0];
+        super.count = 0;
+        super.reset();
+    }
+    
+    public boolean isFallback() {
+        return this.isFallback;
+    }
+    
+    public void write(int b) {
+        if (checkFallback(++this.size)) try {
+            this.fallback.write(b);
+        } catch (IOException e) {
+            throw new RuntimeException("error writing to fallback", e);
+        } else {
+            super.write(b);
+        }
+    }
+    
+    public void write(byte[] b, int off, int len) {
+        if (checkFallback(this.size += len)) try {
+            this.fallback.write(b, off, len);
+        } catch (IOException e) {
+            throw new RuntimeException("error writing to fallback", e);
+        } else {
+            super.write(b, off, len);
+        }
+    }
+    
+    public void close() throws IOException {
+        if (this.fallback != null)
+            this.fallback.close();
+        super.close();
+    }
+    
+    public InputStream getContent() throws IOException {
+        close();
+        if (this.isFallback) {
+            InputStream is = new FileInputStream(this.fallbackFile);
+            return (this.buffered) ? new BufferedInputStream(is) : is;
+        } else {
+            return new ByteArrayInputStream(this.buf);
+        }
+    }
+    
+    public byte[] getContentBAOS() {
+        if (this.isFallback)
+            throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file");
+        return super.buf;
+    }
+    
+    public File getContentFile() {
+        if (!this.isFallback)
+            throw new RuntimeException("haven't fallen back yet, fallback file has no content");
+        return this.fallbackFile;
+    }
+    
+    public long getLength() {
+        return this.size;
+    }
+}
--- a/yacy.init
+++ b/yacy.init
@ -204,7 +204,7 @@ proxyCacheMigration = true
 #
 # parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly
 # parseableMime: specifies mime-types that can be indexed but not on the fly
-parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain
+parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain,text/sgml
 parseableMimeTypes=
 parseableMimeTypes.CRAWLER=
 parseableMimeTypes.PROXY=