From cd5f349666ba2d9cc3b3d9cd5488b72333dabb76 Mon Sep 17 00:00:00 2001 From: theli Date: Sat, 30 Sep 2006 09:31:53 +0000 Subject: [PATCH] *) Better handling of large files during parsing Extracted text of files that are larger than 5MB is stored in a temp file instead of keeping it in memory *) plasmaParserDocument.java: getText now returns an inputStream instead of a byte array *) plasmaParserDocument.java: new function getTextBytes returns the parsed content as byte array Attention: the caller of this function has to ensure that enough memory is available to do this to avoid OutOfMemory Exceptions *) httpd.java: better error handling if the soaphandler is not installed *) pdfParser.java: - better handling of documents with exotic charsets - better handling of large documents - better error logging of encrypted documents *) rtfParser.java: Bugfix for UTF-8 support *) tarParser.java: better handling of large documents *) zipParser.java: better handling of large documents *) plasmaCrawlEURL.java: new errorcode for encrypted documents *) plasmaParserDocument.java: the extracted text can now be passed to this object as byte array or temp file git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2679 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 2 +- source/de/anomic/http/httpd.java | 36 +++---- .../anomic/plasma/parser/AbstractParser.java | 11 ++- source/de/anomic/plasma/parser/Parser.java | 3 + .../anomic/plasma/parser/pdf/pdfParser.java | 95 ++++++++++++------- .../anomic/plasma/parser/rtf/rtfParser.java | 5 +- .../anomic/plasma/parser/tar/tarParser.java | 89 ++++++++++++----- .../anomic/plasma/parser/zip/zipParser.java | 83 ++++++++++++---- source/de/anomic/plasma/plasmaCrawlEURL.java | 1 + source/de/anomic/plasma/plasmaParser.java | 2 +- .../anomic/plasma/plasmaParserDocument.java | 81 ++++++++++++++-- .../de/anomic/plasma/plasmaSwitchboard.java | 15 +-- 12 files changed, 309 insertions(+), 114 deletions(-) diff --git 
a/htroot/ViewFile.java b/htroot/ViewFile.java index f990a530a..79c29238d 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -216,7 +216,7 @@ public class ViewFile { resMime = document.getMimeType(); if (viewMode.equals("parsed")) { - String content = new String(document.getText()); + String content = new String(document.getTextBytes()); content = wikiCode.replaceHTML(content); //added by Marc Nause content = content.replaceAll("\n","
") .replaceAll("\t","    "); diff --git a/source/de/anomic/http/httpd.java b/source/de/anomic/http/httpd.java index ccebe894d..8168be210 100644 --- a/source/de/anomic/http/httpd.java +++ b/source/de/anomic/http/httpd.java @@ -406,24 +406,24 @@ public final class httpd implements serverHandler { /* * Handling SOAP Requests here ... */ - if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap")) { - if (soapHandler == null) { + if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap/")) { + if (this.soapHandler == null) { try { Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler"); Constructor classConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } ); - soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard }); + this.soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard }); } catch (Exception e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (NoClassDefFoundError e) { - sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (Error e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } } - soapHandler.doGet(this.prop, header, this.session.out); + this.soapHandler.doGet(this.prop, header, this.session.out); /* * Handling HTTP requests here ... 
@@ -541,7 +541,7 @@ public final class httpd implements serverHandler { // we now know the HTTP version. depending on that, we read the header httpHeader header; - String httpVersion = prop.getProperty("HTTP", "HTTP/0.9"); + String httpVersion = this.prop.getProperty("HTTP", "HTTP/0.9"); if (httpVersion.equals("HTTP/0.9")) header = new httpHeader(reverseMappingCache); else header = httpHeader.readHeader(this.prop,this.session); @@ -559,8 +559,8 @@ public final class httpd implements serverHandler { /* * Handling SOAP Requests here ... */ - if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap")) { - if (soapHandler == null) { + if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap/")) { + if (this.soapHandler == null) { try { // creating the soap handler class by name Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler"); @@ -569,19 +569,19 @@ public final class httpd implements serverHandler { Constructor soapHandlerConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } ); // creating the new object - soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } ); + this.soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } ); } catch (Exception e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (NoClassDefFoundError e) { - sendRespondError(this.prop,this.session.out,4,503,"SOAP Extension not installed","SOAP Extension not installed",null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } catch (Error e) { - sendRespondHeader(this.prop,this.session.out,httpVersion,503,null); + sendRespondError(this.prop,this.session.out,4,503,null,"SOAP 
Extension not installed",e); return serverCore.TERMINATE_CONNECTION; } } - soapHandler.doPost(prop, header, this.session.out, this.session.in); + this.soapHandler.doPost(this.prop, header, this.session.out, this.session.in); /* * Handling normal HTTP requests here ... */ @@ -981,7 +981,7 @@ public final class httpd implements serverHandler { int httpStatusCode, String httpStatusText, String detailedErrorMsg, - Exception stackTrace + Throwable stackTrace ) throws IOException { sendRespondError( conProp, @@ -1004,7 +1004,7 @@ public final class httpd implements serverHandler { String httpStatusText, File detailedErrorMsgFile, serverObjects detailedErrorMsgValues, - Exception stackTrace + Throwable stackTrace ) throws IOException { sendRespondError( conProp, @@ -1029,7 +1029,7 @@ public final class httpd implements serverHandler { String detailedErrorMsgText, Object detailedErrorMsgFile, serverObjects detailedErrorMsgValues, - Exception stackTrace, + Throwable stackTrace, httpHeader header ) throws IOException { diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index fdaef3ba7..2c7f1d701 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -64,7 +64,7 @@ import de.anomic.server.logging.serverLog; * @version $LastChangedRevision$ / $LastChangedDate$ */ public abstract class AbstractParser implements Parser{ - + /** * a list of library names that are needed by this parser */ @@ -86,6 +86,12 @@ public abstract class AbstractParser implements Parser{ */ protected String parserName = this.getClass().getName(); + /** + * The source file file size in bytes if the source document was passed + * in as file + */ + protected long fileSize = -1; + /** * The Constructor of this class. 
*/ @@ -178,6 +184,9 @@ public abstract class AbstractParser implements Parser{ ) throws ParserException, InterruptedException { BufferedInputStream contentInputStream = null; try { + // getting the file size of the document + this.fileSize = sourceFile.length(); + // create a stream from the file contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index 505bac7fb..83d0daa5c 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -60,6 +60,9 @@ import de.anomic.server.logging.serverLog; */ public interface Parser { + + public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024; + /** * Parsing a document available as byte array * @param location the origin of the document diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 0de357ff5..5f2fca420 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -43,8 +43,11 @@ package de.anomic.plasma.parser.pdf; +import java.io.File; +import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStreamWriter; +import java.io.Writer; import java.util.Hashtable; import org.pdfbox.pdfparser.PDFParser; @@ -53,11 +56,12 @@ import org.pdfbox.pdmodel.PDDocumentInformation; import org.pdfbox.util.PDFTextStripper; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.parser.AbstractParser; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; -import de.anomic.server.serverByteBuffer; +import de.anomic.server.serverCharBuffer; public class pdfParser extends AbstractParser implements Parser { @@ -87,9 +91,9 @@ public class pdfParser extends AbstractParser implements Parser { public 
plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { - PDDocument theDocument = null; - OutputStreamWriter writer = null; + Writer writer = null; + File writerFile = null; try { // reducing thread priority Thread.currentThread().setPriority(Thread.MIN_PRIORITY); @@ -114,6 +118,10 @@ public class pdfParser extends AbstractParser implements Parser { PDFTextStripper stripper = new PDFTextStripper(); theDocument = parser.getPDDocument(); + if (theDocument.isEncrypted()) { + throw new ParserException("Document is encrypted",location,plasmaCrawlEURL.DENIED_DOCUMENT_ENCRYPTED); + } + // extracting some metadata PDDocumentInformation theDocInfo = theDocument.getDocumentInformation(); if (theDocInfo != null) { @@ -121,42 +129,54 @@ public class pdfParser extends AbstractParser implements Parser { docSubject = theDocInfo.getSubject(); //docAuthor = theDocInfo.getAuthor(); docKeywordStr = theDocInfo.getKeywords(); - } - - serverByteBuffer out = new serverByteBuffer(); - writer = new OutputStreamWriter( out ); - stripper.writeText(theDocument, writer ); - - writer.close(); writer = null; - theDocument.close(); theDocument = null; + } - byte[] contents = out.toByteArray(); - out.close(); - out = null; - - if ((docTitle == null) || (docTitle.length() == 0)) { - docTitle = ((contents.length > 80)? new String(contents, 0, 80, "UTF-8"):new String(contents, "UTF-8")). - replaceAll("\r\n"," "). - replaceAll("\n"," "). - replaceAll("\r"," "). 
- replaceAll("\t"," "); + // creating a writer for output + if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + writerFile = File.createTempFile("pdfParser",".tmp"); + writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); + } else { + writer = new serverCharBuffer(); } + + stripper.writeText(theDocument, writer ); + theDocument.close(); theDocument = null; + writer.close(); + String[] docKeywords = null; if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,"); - plasmaParserDocument theDoc = new plasmaParserDocument( - location, - mimeType, - "UTF-8", - docKeywords, - docSubject, - docTitle, - null, - null, - contents, - null, - null); + plasmaParserDocument theDoc = null; + + if (writer instanceof serverCharBuffer) { + byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8"); + theDoc = new plasmaParserDocument( + location, + mimeType, + "UTF-8", + docKeywords, + docSubject, + docTitle, + null, + null, + contentBytes, + null, + null); + } else { + theDoc = new plasmaParserDocument( + location, + mimeType, + "UTF-8", + docKeywords, + docSubject, + docTitle, + null, + null, + writerFile, + null, + null); + } return theDoc; } @@ -164,6 +184,12 @@ public class pdfParser extends AbstractParser implements Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; + // close the writer + if (writer != null) try { writer.close(); } catch (Exception ex) {/* ignore this */} + + // delete the file + if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */} + throw new ParserException("Unexpected error while parsing pdf file. 
" + e.getMessage(),location); } finally { if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */} @@ -173,8 +199,7 @@ public class pdfParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + this.fileSize = -1; } } diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index 98bcfda5c..c054f079e 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -80,8 +80,7 @@ implements Parser { this.parserName = "Rich Text Format Parser"; } - public plasmaParserDocument parse(URL location, String mimeType, String charset, - InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { try { @@ -105,7 +104,7 @@ implements Parser { null, null, null, - bodyText.getBytes(), + bodyText.getBytes("UTF-8"), null, null); diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java b/source/de/anomic/plasma/parser/tar/tarParser.java index 023f38b40..4d3ff6860 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -43,8 +43,11 @@ package de.anomic.plasma.parser.tar; +import java.io.BufferedOutputStream; import java.io.File; +import java.io.FileOutputStream; import java.io.InputStream; +import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; @@ -96,7 +99,18 @@ public class tarParser extends AbstractParser implements Parser { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { + long docTextLength = 0; + OutputStream docText = null; + File outputFile = null; + plasmaParserDocument subDoc = null; try { + 
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + outputFile = File.createTempFile("zipParser",".tmp"); + docText = new BufferedOutputStream(new FileOutputStream(outputFile)); + } else { + docText = new serverByteBuffer(); + } + // creating a new parser class to parse the unzipped content plasmaParser theParser = new plasmaParser(); @@ -116,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser { StringBuffer docLongTitle = new StringBuffer(); LinkedList docSections = new LinkedList(); StringBuffer docAbstrct = new StringBuffer(); - serverByteBuffer docText = new serverByteBuffer(); + Map docAnchors = new HashMap(); TreeSet docImages = new TreeSet(); @@ -141,55 +155,58 @@ public class tarParser extends AbstractParser implements Parser { String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt); // getting the entry content - plasmaParserDocument theDoc = null; - File tempFile = null; + File subDocTempFile = null; try { // create the temp file - tempFile = createTempFile(entryName); + subDocTempFile = createTempFile(entryName); // copy the data into the file - serverFileUtils.copy(tin,tempFile,entry.getSize()); + serverFileUtils.copy(tin,subDocTempFile,entry.getSize()); // check for interruption checkInterruption(); // parsing the content - theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile); + subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,subDocTempFile); } catch (ParserException e) { this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. 
" + e.getMessage()); } finally { - if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} + if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */} } - if (theDoc == null) continue; + if (subDoc == null) continue; // merging all documents together if (docKeywords.length() > 0) docKeywords.append(","); - docKeywords.append(theDoc.getKeywords(',')); + docKeywords.append(subDoc.getKeywords(',')); if (docLongTitle.length() > 0) docLongTitle.append("\n"); - docLongTitle.append(theDoc.getMainLongTitle()); + docLongTitle.append(subDoc.getMainLongTitle()); if (docShortTitle.length() > 0) docShortTitle.append("\n"); - docShortTitle.append(theDoc.getMainShortTitle()); + docShortTitle.append(subDoc.getMainShortTitle()); - docSections.addAll(Arrays.asList(theDoc.getSectionTitles())); + docSections.addAll(Arrays.asList(subDoc.getSectionTitles())); if (docAbstrct.length() > 0) docAbstrct.append("\n"); - docAbstrct.append(theDoc.getAbstract()); + docAbstrct.append(subDoc.getAbstract()); - if (docText.length() > 0) docText.append("\n"); - docText.append(theDoc.getText()); + if (subDoc.getTextLength() > 0) { + if (docTextLength > 0) docText.write('\n'); + docTextLength += serverFileUtils.copy(subDoc.getText(), docText); + } + + docAnchors.putAll(subDoc.getAnchors()); + docImages.addAll(subDoc.getImages()); - docAnchors.putAll(theDoc.getAnchors()); - docImages.addAll(theDoc.getImages()); + // release subdocument + subDoc.close(); + subDoc = null; } - /* (URL location, String mimeType, - String keywords, String shortTitle, String longTitle, - String[] sections, String abstrct, - byte[] text, Map anchors, Map images) - */ - return new plasmaParserDocument( + plasmaParserDocument result = null; + + if (docText instanceof serverByteBuffer) { + result = new plasmaParserDocument( location, mimeType, null, @@ -198,13 +215,37 @@ public class tarParser extends AbstractParser implements Parser { 
docLongTitle.toString(), (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), - docText.toByteArray(), + ((serverByteBuffer)docText).toByteArray(), docAnchors, docImages); + } else { + result = new plasmaParserDocument( + location, + mimeType, + null, + docKeywords.toString().split(" |,"), + docShortTitle.toString(), + docLongTitle.toString(), + (String[])docSections.toArray(new String[docSections.size()]), + docAbstrct.toString(), + outputFile, + docAnchors, + docImages); + } + + return result; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; + if (subDoc != null) subDoc.close(); + + // close the writer + if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */} + + // delete the file + if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */} + throw new ParserException("Unexpected error while parsing tar resource. 
" + e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index aca8f6505..8a523dbcf 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -43,8 +43,11 @@ package de.anomic.plasma.parser.zip; +import java.io.BufferedOutputStream; import java.io.File; +import java.io.FileOutputStream; import java.io.InputStream; +import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; import java.util.Hashtable; @@ -94,13 +97,23 @@ public class zipParser extends AbstractParser implements Parser { public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException { + long docTextLength = 0; + OutputStream docText = null; + File outputFile = null; + plasmaParserDocument subDoc = null; try { + if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + outputFile = File.createTempFile("zipParser",".tmp"); + docText = new BufferedOutputStream(new FileOutputStream(outputFile)); + } else { + docText = new serverByteBuffer(); + } + StringBuffer docKeywords = new StringBuffer(); StringBuffer docShortTitle = new StringBuffer(); StringBuffer docLongTitle = new StringBuffer(); LinkedList docSections = new LinkedList(); StringBuffer docAbstrct = new StringBuffer(); - serverByteBuffer docText = new serverByteBuffer(); Map docAnchors = new HashMap(); TreeSet docImages = new TreeSet(); @@ -128,48 +141,56 @@ public class zipParser extends AbstractParser implements Parser { String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt); // parsing the content - plasmaParserDocument theDoc = null; - File tempFile = null; + File subDocTempFile = null; try { // create the temp file - tempFile = createTempFile(entryName); + subDocTempFile = createTempFile(entryName); // copy the data into the file - 
serverFileUtils.copy(zippedContent,tempFile,entry.getSize()); + serverFileUtils.copy(zippedContent,subDocTempFile,entry.getSize()); // parsing the zip file entry - theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile); + subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, subDocTempFile); } catch (ParserException e) { this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage()); } finally { - if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */} + if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */} } - if (theDoc == null) continue; + if (subDoc == null) continue; // merging all documents together if (docKeywords.length() > 0) docKeywords.append(","); - docKeywords.append(theDoc.getKeywords(',')); + docKeywords.append(subDoc.getKeywords(',')); if (docLongTitle.length() > 0) docLongTitle.append("\n"); - docLongTitle.append(theDoc.getMainLongTitle()); + docLongTitle.append(subDoc.getMainLongTitle()); if (docShortTitle.length() > 0) docShortTitle.append("\n"); - docShortTitle.append(theDoc.getMainShortTitle()); + docShortTitle.append(subDoc.getMainShortTitle()); - docSections.addAll(Arrays.asList(theDoc.getSectionTitles())); + docSections.addAll(Arrays.asList(subDoc.getSectionTitles())); if (docAbstrct.length() > 0) docAbstrct.append("\n"); - docAbstrct.append(theDoc.getAbstract()); + docAbstrct.append(subDoc.getAbstract()); - if (docText.length() > 0) docText.append("\n"); - docText.append(theDoc.getText()); + if (subDoc.getTextLength() > 0) { + if (docTextLength > 0) docText.write('\n'); + docTextLength += serverFileUtils.copy(subDoc.getText(), docText); + } - docAnchors.putAll(theDoc.getAnchors()); - docImages.addAll(theDoc.getImages()); + docAnchors.putAll(subDoc.getAnchors()); + docImages.addAll(subDoc.getImages()); + + // release subdocument + subDoc.close(); + subDoc = null; } - 
return new plasmaParserDocument( + plasmaParserDocument result = null; + + if (docText instanceof serverByteBuffer) { + result = new plasmaParserDocument( location, mimeType, null, @@ -178,13 +199,37 @@ public class zipParser extends AbstractParser implements Parser { docLongTitle.toString(), (String[])docSections.toArray(new String[docSections.size()]), docAbstrct.toString(), - docText.toByteArray(), + ((serverByteBuffer)docText).toByteArray(), docAnchors, docImages); + } else { + result = new plasmaParserDocument( + location, + mimeType, + null, + docKeywords.toString().split(" |,"), + docShortTitle.toString(), + docLongTitle.toString(), + (String[])docSections.toArray(new String[docSections.size()]), + docAbstrct.toString(), + outputFile, + docAnchors, + docImages); + } + + return result; } catch (Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; + if (subDoc != null) subDoc.close(); + + // close the writer + if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */} + + // delete the file + if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */} + throw new ParserException("Unexpected error while parsing zip resource. 
" + e.getClass().getName() + ": "+ e.getMessage(),location); } } diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index ba6d7d43e..c05f08e5b 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -117,6 +117,7 @@ public class plasmaCrawlEURL extends indexURL { // Parser errors public static final String DENIED_PARSER_ERROR = "denied_(parser_error)"; + public static final String DENIED_DOCUMENT_ENCRYPTED = "denied_(document_encrypted)"; public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)"; // indexing errors diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 536307b8a..b0038249f 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -515,7 +515,7 @@ public final class plasmaParser { // testing if the resource is not empty if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { - String errorMsg = "No resource content available."; + String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available."; this.theLogger.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); } diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index c8de2b6af..0ca9bb8a1 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -42,8 +42,14 @@ package de.anomic.plasma; +import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; import java.net.MalformedURLException; +import de.anomic.server.serverFileUtils; + import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -62,7 +68,7 @@ public class plasmaParserDocument { String longTitle; // the real title of the document, commonly h1-tags String[] sections; // if present: more titles/headlines appearing in the document String abstrct; // an abstract, if present: short content description - byte[] text; // the clear text, all that is visible + private Object text; // the clear text, all that is visible Map anchors; // all links embedded as clickeable entities (anchor tags) TreeSet images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. @@ -95,6 +101,29 @@ public class plasmaParserDocument { this.condenser = null; this.resorted = false; } + + public plasmaParserDocument(URL location, String mimeType, String charset, + String[] keywords, String shortTitle, String longTitle, + String[] sections, String abstrct, + File text, Map anchors, TreeSet images) { + this.location = location; + this.mimeType = (mimeType==null)?"application/octet-stream":mimeType; + this.charset = charset; + this.keywords = (keywords==null) ? 
new String[0] : keywords; + this.shortTitle = (shortTitle==null)?"":shortTitle; + this.longTitle = (longTitle==null)?"":longTitle; + this.sections = (sections==null)?new String[0]:sections; + this.abstrct = (abstrct==null)?"":abstrct; + this.text = text; + if (text != null) text.deleteOnExit(); + this.anchors = (anchors==null)?new HashMap(0):anchors; + this.images = (images==null)?new TreeSet():images; + this.hyperlinks = null; + this.medialinks = null; + this.emaillinks = null; + this.condenser = null; + this.resorted = false; + } public String getMimeType() { return this.mimeType; @@ -103,7 +132,7 @@ public class plasmaParserDocument { /** * @return the supposed charset of this document or null if unknown */ - public String getCharset() { + public String getSourceCharset() { return this.charset; } @@ -123,13 +152,41 @@ public class plasmaParserDocument { if (abstrct != null) return abstrct; else return getMainLongTitle(); } - public byte[] getText() { - // returns only the clear (visible) text (not the source data) - return text; + public InputStream getText() { + try { + if (this.text == null) return null; + + if (this.text instanceof File) return new BufferedInputStream(new FileInputStream((File)this.text)); + else if (this.text instanceof byte[]) return new ByteArrayInputStream((byte[])this.text); + + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } + + public byte[] getTextBytes() { + try { + if (this.text == null) return new byte[0]; + + if (this.text instanceof File) return serverFileUtils.read((File)this.text); + else if (this.text instanceof byte[]) return (byte[])this.text; + } catch (Exception e) { + e.printStackTrace(); + } + return new byte[0]; + } + + public long getTextLength() { + if (this.text == null) return 0; + if (this.text instanceof File) return ((File)this.text).length(); + else if (this.text instanceof byte[]) return ((byte[])this.text).length; + + return -1; } public plasmaCondenser getCondenser() { - if (condenser == 
null) condenser = new plasmaCondenser(new ByteArrayInputStream(getText()), 0, 0); + if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0); return condenser; } @@ -262,4 +319,16 @@ public class plasmaParserDocument { this.resorted = true; } + public void close() { + // delete the temp file + if ((this.text != null) && (this.text instanceof File)) { + try { ((File)this.text).delete(); } catch (Exception e) {/* ignore this */} + } + } + + protected void finalize() throws Throwable { + this.close(); + super.finalize(); + } + } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 4e0fba5ba..4d37c31b7 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1416,6 +1416,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException { + plasmaParserDocument document = null; try { // work off one stack entry with a fresh resource long stackStartTime = 0, stackEndTime = 0, @@ -1456,7 +1457,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser /* ========================================================================= * PARSE CONTENT * ========================================================================= */ - plasmaParserDocument document = null; parsingStartTime = System.currentTimeMillis(); try { @@ -1527,7 +1527,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser checkInterruption(); log.logFine("Condensing for '" + entry.normalizedURLString() + "'"); - plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText())); + plasmaCondenser condenser = new plasmaCondenser(document.getText()); // generate citation reference Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); @@ 
-1700,8 +1700,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logInfo("*Indexed " + words + " words in URL " + entry.url() + " [" + entry.urlHash() + "]" + "\n\tDescription: " + docDescription + - "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " + - "Size: " + document.text.length + " bytes | " + + "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " + + "Size: " + document.getTextLength() + " bytes | " + "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) + "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + "ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " + @@ -1744,6 +1744,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, ""); } } + document.close(); document = null; } catch (Exception e) { this.log.logSevere("Unexpected exception while parsing/indexing URL ",e); @@ -1772,6 +1773,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser cacheManager.deleteFile(entry.url()); } entry = null; + + if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ } } } @@ -1807,7 +1810,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents - kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getText().length, 3) + // length of plain text in bytes + kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes 
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_SIMI_WORDS, 3) + // count of all unique words kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote) @@ -2173,7 +2176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser try { // get set of words // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText()); + Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes()); // delete all word references int count = removeReferences(urlhash, witer); // finally delete the url entry itself