From f17ce28b6d169eea89b3df368ed962fc564d0415 Mon Sep 17 00:00:00 2001 From: theli Date: Tue, 3 Oct 2006 11:05:48 +0000 Subject: [PATCH] *) plasmaHTCache: - method loadResourceContent defined as deprecated. Please do not use this function to avoid OutOfMemory Exceptions when loading large files - new function getResourceContentStream to get an inputstream of a cache file - new function getResourceContentLength to get the size of a cached file *) httpc.java: - Bugfix: resource content was loaded into memory even if this was not requested *) Crawler: - new option to hold loaded resource content in memory - adding option to use the worker class without the worker pool (needed by the snippet fetcher) *) plasmaSnippetCache - snippet loader does not use a crawl-worker from pool but uses a newly created instance to avoid blocking by normal crawling activity. - now operates on streams instead of byte arrays to avoid OutOfMemory Exceptions when operating on large files - snippet loader now forces the crawl-worker to keep the loaded resource in memory to avoid IO *) plasmaCondenser: adding new function getWords that can directly operate on input streams *) Parsers - keep resource in memory whenever possible (to avoid IO) - when parsing from stream the content length must be passed to the parser function now. 
this length value is needed by the parsers to decide if the parsed resource content is to large to hold it in memory and must be stored to file - AbstractParser.java: new function to pass the contentLength of a resource to the parsers git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2701 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 60 ++++-- htroot/ViewImage.java | 20 +- source/de/anomic/http/httpc.java | 14 +- source/de/anomic/http/httpdProxyHandler.java | 2 +- .../plasma/crawler/AbstractCrawlWorker.java | 29 ++- .../plasma/crawler/http/CrawlWorker.java | 5 +- .../plasma/crawler/plasmaCrawlerFactory.java | 10 +- .../plasma/crawler/plasmaCrawlerPool.java | 7 + .../anomic/plasma/parser/AbstractParser.java | 17 +- source/de/anomic/plasma/parser/Parser.java | 2 + .../anomic/plasma/parser/bzip/bzipParser.java | 4 +- .../anomic/plasma/parser/doc/docParser.java | 1 + .../anomic/plasma/parser/gzip/gzipParser.java | 4 +- .../parser/mimeType/mimeTypeParser.java | 4 +- .../anomic/plasma/parser/odt/odtParser.java | 4 +- .../anomic/plasma/parser/pdf/pdfParser.java | 5 +- .../anomic/plasma/parser/rpm/rpmParser.java | 4 +- .../anomic/plasma/parser/rss/rssParser.java | 4 +- .../anomic/plasma/parser/rtf/rtfParser.java | 1 + .../anomic/plasma/parser/tar/tarParser.java | 6 +- .../anomic/plasma/parser/vcf/vcfParser.java | 4 +- .../anomic/plasma/parser/zip/zipParser.java | 6 +- source/de/anomic/plasma/plasmaCondenser.java | 9 +- .../de/anomic/plasma/plasmaCrawlLoader.java | 27 ++- .../plasma/plasmaCrawlLoaderMessage.java | 5 +- source/de/anomic/plasma/plasmaHTCache.java | 40 +++- source/de/anomic/plasma/plasmaParser.java | 168 ++++++++-------- .../de/anomic/plasma/plasmaSearchImages.java | 10 +- .../de/anomic/plasma/plasmaSnippetCache.java | 189 ++++++++++++------ .../de/anomic/plasma/plasmaSwitchboard.java | 22 +- source/de/anomic/server/serverFileUtils.java | 2 +- 31 files changed, 465 insertions(+), 220 deletions(-) diff --git a/htroot/ViewFile.java 
b/htroot/ViewFile.java index b1f12ef9e..525c9d7e4 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -45,6 +45,7 @@ //if the shell's current path is HTROOT import java.io.IOException; +import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; @@ -60,6 +61,7 @@ import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.crawler.plasmaCrawlerException; import de.anomic.plasma.parser.ParserException; import de.anomic.plasma.plasmaCrawlLURL.Entry; +import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -121,18 +123,20 @@ public class ViewFile { } // loading the resource content as byte array - byte[] resource = null; + InputStream resource = null; + long resourceLength = -1; IResourceInfo resInfo = null; String resMime = null; try { // trying to load the resource body - resource = sb.cacheManager.loadResourceContent(url); + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); // if the resource body was not cached we try to load it from web if (resource == null) { plasmaHTCache.Entry entry = null; try { - entry = sb.snippetCache.loadResourceFromWeb(url, 5000); + entry = sb.snippetCache.loadResourceFromWeb(url, 5000, false); } catch (plasmaCrawlerException e) { prop.put("error",4); prop.put("error_errorText",e.getMessage()); @@ -142,11 +146,13 @@ public class ViewFile { if (entry != null) { resInfo = entry.getDocumentInfo(); - resource = sb.cacheManager.loadResourceContent(url); + resource = sb.cacheManager.getResourceContentStream(url); + resourceLength = sb.cacheManager.getResourceContentLength(url); } if (resource == null) { prop.put("error",4); + prop.put("error_errorText","No resource available"); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } @@ -172,21 +178,46 @@ public class ViewFile { httpHeader responseHeader = 
httpc.whead(url,url.getHost(),5000,null,null,sb.remoteProxyConfig); if (responseHeader == null) { prop.put("error",4); + prop.put("error_errorText","Unable to load resource metadata."); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; } + try { + resInfo = sb.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, responseHeader); + } catch (Exception e) { + prop.put("error",4); + prop.put("error_errorText",e.getMessage()); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } resMime = responseHeader.mime(); } } else { resMime = resInfo.getMimeType(); } } catch (IOException e) { + if (resource != null) try { resource.close(); } catch (Exception ex) {/* ignore this */} prop.put("error",4); + prop.put("error_errorText",e.getMessage()); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; - } - if (viewMode.equals("plain")) { - String content = new String(resource); + } + + if (viewMode.equals("plain")) { + + // TODO: how to handle very large files here ? + String content; + try { + content = new String(serverFileUtils.read(resource),"UTF-8"); + } catch (Exception e) { + prop.put("error",4); + prop.put("error_errorText",e.getMessage()); + prop.put("viewMode",VIEW_MODE_NO_TEXT); + return prop; + } finally { + if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} + } + content = content.replaceAll("<","<") .replaceAll(">",">") .replaceAll("\"",""") @@ -195,12 +226,15 @@ public class ViewFile { prop.put("error",0); prop.put("viewMode",VIEW_MODE_AS_PLAIN_TEXT); - prop.put("viewMode_plainText",content); - } else if (viewMode.equals("parsed") || viewMode.equals("sentences") || viewMode.equals("iframe")) { + prop.put("viewMode_plainText",content); + } else if (viewMode.equals("iframe")) { + prop.put("viewMode",VIEW_MODE_AS_IFRAME); + prop.put("viewMode_url",url.toString()); + } else if (viewMode.equals("parsed") || viewMode.equals("sentences")) { // parsing the resource content plasmaParserDocument document = null; try { 
- document = sb.snippetCache.parseDocument(url, resource,resInfo); + document = sb.snippetCache.parseDocument(url, resourceLength, resource,resInfo); if (document == null) { prop.put("error",5); prop.put("error_errorText","Unknown error"); @@ -212,7 +246,10 @@ public class ViewFile { prop.put("error_errorText",e.getMessage()); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; + } finally { + if (resource != null) try { resource.close(); } catch (Exception e) {/* ignore this */} } + resMime = document.getMimeType(); if (viewMode.equals("parsed")) { @@ -223,9 +260,6 @@ public class ViewFile { prop.put("viewMode",VIEW_MODE_AS_PARSED_TEXT); prop.put("viewMode_parsedText",content); - } else if (viewMode.equals("iframe")) { - prop.put("viewMode",VIEW_MODE_AS_IFRAME); - prop.put("viewMode_url",url.toString()); } else { prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES); String[] sentences = document.getSentences(); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 1d1329873..30d765ee2 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -43,11 +43,14 @@ import java.awt.Container; import java.awt.Image; import java.awt.MediaTracker; import java.awt.Toolkit; +import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import de.anomic.http.httpHeader; import de.anomic.net.URL; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -70,9 +73,20 @@ public class ViewImage { int maxheight = post.getInt("maxheight", 0); int timeout = post.getInt("timeout", 5000); - // load image - byte[] imgb = sb.snippetCache.getResource(url, true, timeout); - if (imgb == null) return null; + // getting the image as stream + InputStream imgStream = (InputStream) sb.snippetCache.getResource(url, true, timeout)[0]; + if (imgStream == null) return null; + + // read image data + byte[] imgb = null; + try { + imgb = 
serverFileUtils.read(imgStream); + } catch (IOException e) { + return null; + } finally { + try { imgStream.close(); } catch (Exception e) {/* ignore this */} + } + // create image MediaTracker mediaTracker = new MediaTracker(new Container()); diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index ee910c023..0a5656058 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -1828,7 +1828,7 @@ do upload // return sbb.getBytes(); return serverFileUtils.read(this.getContentInputStream()); } - + /** * This method outputs the found content into an byte-array and * additionally outputs it to procOS. @@ -1837,9 +1837,13 @@ do upload * @return * @throws IOException */ - public byte[] writeContent(Object procOS) throws IOException { - int contentLength = (int) this.responseHeader.contentLength(); - serverByteBuffer sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); + public byte[] writeContent(Object procOS, boolean returnByteArray) throws IOException { + serverByteBuffer sbb = null; + + if (returnByteArray) { + int contentLength = (int) this.responseHeader.contentLength(); + sbb = new serverByteBuffer((contentLength==-1)?8192:contentLength); + } if (procOS instanceof OutputStream) { //writeContentX(httpc.this.clientInput, this.gzip, this.responseHeader.contentLength(), procOS, sbb); @@ -1852,7 +1856,7 @@ do upload throw new IllegalArgumentException("Invalid procOS object type '" + procOS.getClass().getName() + "'"); } - return sbb.getBytes(); + return (sbb==null)?null:sbb.getBytes(); } /** diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index ea7edfbcb..441bdef2c 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -662,7 +662,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt if ((contentLength > 0) && (contentLength < 1048576)) // if the length 
is known and < 1 MB { // ok, we don't write actually into a file, only to RAM, and schedule writing the file. - byte[] cacheArray = res.writeContent(hfos); + byte[] cacheArray = res.writeContent(hfos,true); this.theLogger.logFine("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length))); if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).finalize(); diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 2df4f4d4b..6960ea857 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -80,6 +80,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW */ protected boolean done = false; + /* ============================================================ * Crawl job specific variables * ============================================================ */ @@ -92,6 +93,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW protected long startdate; protected plasmaCrawlProfile.entry profile; protected boolean acceptAllContent; + protected boolean keepInMemory; protected String errorMessage; @@ -159,22 +161,27 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW try { // The thread keeps running. - while (!this.stopped && !this.isInterrupted() && !this.myPool.isClosed) { - if (this.done) { - synchronized (this) { - // return thread back into pool - this.myPool.returnObject(this.protocol,this); - - // We are waiting for a new task now. 
- if (!this.stopped && !this.destroyed && !this.isInterrupted()) { - this.wait(); + while (!this.stopped && !this.isInterrupted()) { + if (this.done) { + if (this.myPool != null && !this.myPool.isClosed) { + synchronized (this) { + // return thread back into pool + this.myPool.returnObject(this.protocol,this); + + // We are waiting for a new task now. + if (!this.stopped && !this.destroyed && !this.isInterrupted()) { + this.wait(); + } } + } else { + this.stopped = true; } } else { try { // executing the new task execute(); } finally { + // free memory reset(); } } @@ -231,6 +238,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW this.depth = theNewMsg.depth; this.profile = theNewMsg.profile; this.acceptAllContent = theNewMsg.acceptAllContent; + this.keepInMemory = theNewMsg.keepInMemory; this.startdate = System.currentTimeMillis(); @@ -260,6 +268,7 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW public void reset() { this.theMsg = null; + this.url = null; this.name = null; this.refererURLString = null; @@ -268,6 +277,8 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW this.startdate = 0; this.profile = null; this.acceptAllContent = false; + this.keepInMemory = false; + this.errorMessage = null; } diff --git a/source/de/anomic/plasma/crawler/http/CrawlWorker.java b/source/de/anomic/plasma/crawler/http/CrawlWorker.java index 54c1a8a60..ebb064048 100644 --- a/source/de/anomic/plasma/crawler/http/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/http/CrawlWorker.java @@ -262,8 +262,9 @@ public final class CrawlWorker extends AbstractCrawlWorker { } // we write the new cache entry to file system directly - res.writeContent(fos); - htCache.setCacheArray(null); + byte[] cacheArray = null; + cacheArray = res.writeContent(fos,this.keepInMemory); + htCache.setCacheArray(cacheArray); this.cacheManager.writeFileAnnouncement(cacheFile); } finally { if 
(fos!=null)try{fos.close();}catch(Exception e){/* ignore this */} diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java index 7f4f229ee..6f974957d 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerFactory.java @@ -84,10 +84,14 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory { this.thePool = pool; } + public Object makeObject(Object key) throws Exception { + return makeObject(key, true); + } + /** * @see org.apache.commons.pool.PoolableObjectFactory#makeObject() */ - public Object makeObject(Object key) throws Exception { + public Object makeObject(Object key, boolean usePool) throws Exception { if (!(key instanceof String)) throw new IllegalArgumentException("The object key must be of type string."); @@ -109,11 +113,11 @@ public final class plasmaCrawlerFactory implements KeyedPoolableObjectFactory { // instantiating class plasmaCrawlWorker theCrawlWorker = (plasmaCrawlWorker) classConstructor.newInstance(new Object[] { this.theThreadGroup, - this.thePool, + (usePool)?this.thePool:null, this.sb, this.cacheManager, this.theLog - }); + }); // return the newly created object return theCrawlWorker; diff --git a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java index f69845901..7b52106ee 100644 --- a/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java +++ b/source/de/anomic/plasma/crawler/plasmaCrawlerPool.java @@ -52,15 +52,22 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool; import de.anomic.server.logging.serverLog; public final class plasmaCrawlerPool extends GenericKeyedObjectPool { + + private plasmaCrawlerFactory theFactory; private final ThreadGroup theThreadGroup; public boolean isClosed = false; public plasmaCrawlerPool(plasmaCrawlerFactory objFactory, GenericKeyedObjectPool.Config config, ThreadGroup 
threadGroup) { super(objFactory, config); + this.theFactory = objFactory; this.theThreadGroup = threadGroup; objFactory.setPool(this); } + public plasmaCrawlerFactory getFactory() { + return this.theFactory; + } + public Object borrowObject(Object key) throws Exception { return super.borrowObject(key); } diff --git a/source/de/anomic/plasma/parser/AbstractParser.java b/source/de/anomic/plasma/parser/AbstractParser.java index 2c7f1d701..baa413a06 100644 --- a/source/de/anomic/plasma/parser/AbstractParser.java +++ b/source/de/anomic/plasma/parser/AbstractParser.java @@ -90,7 +90,7 @@ public abstract class AbstractParser implements Parser{ * The source file file size in bytes if the source document was passed * in as file */ - protected long fileSize = -1; + protected long contentLength = -1; /** * The Constructor of this class. @@ -99,6 +99,15 @@ public abstract class AbstractParser implements Parser{ super(); this.libxDependencies = libxDependencies; } + + /** + * Set the content length of the source file. + * This value is needed by some parsers to decide + * if the parsed text could be hold in memory + */ + public void setContentLength(long length) { + this.contentLength = length; + } /** * Check if the parser was interrupted. 
@@ -185,7 +194,7 @@ public abstract class AbstractParser implements Parser{ BufferedInputStream contentInputStream = null; try { // getting the file size of the document - this.fileSize = sourceFile.length(); + this.contentLength = sourceFile.length(); // create a stream from the file contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile)); @@ -242,4 +251,8 @@ public abstract class AbstractParser implements Parser{ public String getName() { return this.parserName; } + + public void reset() { + this.contentLength = -1; + } } diff --git a/source/de/anomic/plasma/parser/Parser.java b/source/de/anomic/plasma/parser/Parser.java index 83d0daa5c..a1adeae06 100644 --- a/source/de/anomic/plasma/parser/Parser.java +++ b/source/de/anomic/plasma/parser/Parser.java @@ -117,6 +117,8 @@ public interface Parser { */ public void reset(); + public void setContentLength(long length); + /** * @return Returns a list of library names that are needed by this parser */ diff --git a/source/de/anomic/plasma/parser/bzip/bzipParser.java b/source/de/anomic/plasma/parser/bzip/bzipParser.java index 60621e7f8..53b2630dd 100644 --- a/source/de/anomic/plasma/parser/bzip/bzipParser.java +++ b/source/de/anomic/plasma/parser/bzip/bzipParser.java @@ -138,7 +138,7 @@ public class bzipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/doc/docParser.java b/source/de/anomic/plasma/parser/doc/docParser.java index b5a076399..92c116b4c 100644 --- a/source/de/anomic/plasma/parser/doc/docParser.java +++ b/source/de/anomic/plasma/parser/doc/docParser.java @@ -118,6 +118,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/gzip/gzipParser.java b/source/de/anomic/plasma/parser/gzip/gzipParser.java index 0c2af76b3..a289eb361 100644 --- 
a/source/de/anomic/plasma/parser/gzip/gzipParser.java +++ b/source/de/anomic/plasma/parser/gzip/gzipParser.java @@ -122,7 +122,7 @@ public class gzipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java index 38665c6c5..70b01f471 100644 --- a/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java +++ b/source/de/anomic/plasma/parser/mimeType/mimeTypeParser.java @@ -187,8 +187,7 @@ implements Parser { } } - public plasmaParserDocument parse(URL location, String mimeType,String charset, - InputStream source) throws ParserException, InterruptedException { + public plasmaParserDocument parse(URL location, String mimeType,String charset, InputStream source) throws ParserException, InterruptedException { File dstFile = null; try { dstFile = File.createTempFile("mimeTypeParser",".tmp"); @@ -208,6 +207,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 9d8e9e011..6fc977644 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -197,8 +197,8 @@ public class odtParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/pdf/pdfParser.java b/source/de/anomic/plasma/parser/pdf/pdfParser.java index 5f2fca420..174d8fbd9 100644 --- a/source/de/anomic/plasma/parser/pdf/pdfParser.java +++ b/source/de/anomic/plasma/parser/pdf/pdfParser.java @@ -132,7 +132,7 @@ public class pdfParser extends AbstractParser 
implements Parser { } // creating a writer for output - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { writerFile = File.createTempFile("pdfParser",".tmp"); writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); } else { @@ -199,7 +199,8 @@ public class pdfParser extends AbstractParser implements Parser { } public void reset() { - this.fileSize = -1; + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 6c52cb97c..90ee23222 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -169,8 +169,8 @@ public class rpmParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 41cf8573b..dbf3d11ee 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -211,8 +211,8 @@ public class rssParser extends AbstractParser implements Parser { } public void reset() { - // TODO Auto-generated method stub - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/rtf/rtfParser.java b/source/de/anomic/plasma/parser/rtf/rtfParser.java index c054f079e..de5e3ff72 100644 --- a/source/de/anomic/plasma/parser/rtf/rtfParser.java +++ b/source/de/anomic/plasma/parser/rtf/rtfParser.java @@ -124,6 +124,7 @@ implements Parser { public void reset() { // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/tar/tarParser.java 
b/source/de/anomic/plasma/parser/tar/tarParser.java index 4d3ff6860..4f066232a 100644 --- a/source/de/anomic/plasma/parser/tar/tarParser.java +++ b/source/de/anomic/plasma/parser/tar/tarParser.java @@ -104,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser { File outputFile = null; plasmaParserDocument subDoc = null; try { - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = File.createTempFile("zipParser",".tmp"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -251,7 +251,7 @@ public class tarParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index e31010537..f553d5032 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -262,8 +262,8 @@ public class vcfParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/parser/zip/zipParser.java b/source/de/anomic/plasma/parser/zip/zipParser.java index 8a523dbcf..e672df7dd 100644 --- a/source/de/anomic/plasma/parser/zip/zipParser.java +++ b/source/de/anomic/plasma/parser/zip/zipParser.java @@ -102,7 +102,7 @@ public class zipParser extends AbstractParser implements Parser { File outputFile = null; plasmaParserDocument subDoc = null; try { - if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { + if ((this.contentLength == -1) || (this.contentLength > Parser.MAX_KEEP_IN_MEMORY_SIZE)) { outputFile = 
File.createTempFile("zipParser",".tmp"); docText = new BufferedOutputStream(new FileOutputStream(outputFile)); } else { @@ -235,7 +235,7 @@ public class zipParser extends AbstractParser implements Parser { } public void reset() { - // Nothing todo here at the moment - + // Nothing todo here at the moment + super.reset(); } } diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 6ca5bfc63..d72eb43f8 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -671,11 +671,16 @@ public final class plasmaCondenser { } */ + public static Iterator getWords(InputStream input) { + if (input == null) return null; + plasmaCondenser condenser = new plasmaCondenser(input); + return condenser.words(); + } + public static Iterator getWords(byte[] text) { if (text == null) return null; ByteArrayInputStream buffer = new ByteArrayInputStream(text); - plasmaCondenser condenser = new plasmaCondenser(buffer); - return condenser.words(); + return getWords(buffer); } public static void main(String[] args) { diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java index e349da8bf..eac42b16e 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoader.java +++ b/source/de/anomic/plasma/plasmaCrawlLoader.java @@ -163,15 +163,23 @@ public final class plasmaCrawlLoader extends Thread { return this.theThreadGroup; } - private void execute(plasmaCrawlLoaderMessage theMsg) throws Exception { + private void execute(plasmaCrawlLoaderMessage theMsg, boolean useThreadPool) throws Exception { // getting the protocol of the next URL String protocol = theMsg.url.getProtocol(); // TODO: remove this if (protocol.equals("https")) protocol = "http"; - // getting a new crawler from the crawler pool - plasmaCrawlWorker theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol); + // get a new worker thread + plasmaCrawlWorker theWorker = 
null; + if (useThreadPool) { + // getting a new crawler from the crawler pool + theWorker = (plasmaCrawlWorker) this.crawlwerPool.borrowObject(protocol); + } else { + // create a new one + theWorker = (plasmaCrawlWorker) this.crawlwerPool.getFactory().makeObject(protocol,false); + } + if (theWorker == null) { this.log.logWarning("Unsupported protocol '" + protocol + "' in url " + theMsg.url); } else { @@ -187,7 +195,7 @@ public final class plasmaCrawlLoader extends Thread { plasmaCrawlLoaderMessage theMsg = this.theQueue.waitForMessage(); // start new crawl job - this.execute(theMsg); + this.execute(theMsg, true); } catch (InterruptedException e) { Thread.interrupted(); @@ -218,7 +226,8 @@ public final class plasmaCrawlLoader extends Thread { String initiator, int depth, plasmaCrawlProfile.entry profile, - int timeout + int timeout, + boolean keepInMemory ) throws plasmaCrawlerException { plasmaHTCache.Entry result = null; @@ -235,13 +244,14 @@ public final class plasmaCrawlLoader extends Thread { profile, crawlingPriority, true, - timeout + timeout, + keepInMemory ); try { // start new crawl job - this.execute(theMsg); + this.execute(theMsg, false); // wait for the crawl job result result = theMsg.waitForResult(); @@ -283,7 +293,8 @@ public final class plasmaCrawlLoader extends Thread { profile, // crawling profile crawlingPriority, // crawling priority false, // only download documents whose mimetypes are enabled for the crawler - -1 // use default crawler timeout + -1, // use default crawler timeout + false // resource should not be kept in memory ); // adding the message to the queue diff --git a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java index b3d678c67..60929d606 100644 --- a/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java +++ b/source/de/anomic/plasma/plasmaCrawlLoaderMessage.java @@ -56,6 +56,7 @@ public final class plasmaCrawlLoaderMessage { public final plasmaCrawlProfile.entry 
profile; public final boolean acceptAllContent; public final int timeout; + public final boolean keepInMemory; private serverSemaphore resultSync = null; private plasmaHTCache.Entry result; @@ -71,7 +72,8 @@ public final class plasmaCrawlLoaderMessage { plasmaCrawlProfile.entry profile, int crawlingPriority, boolean acceptAllContent, - int timeout + int timeout, + boolean keepInMemory ) { this.url = url; this.name = name; @@ -82,6 +84,7 @@ public final class plasmaCrawlLoaderMessage { this.crawlingPriority = crawlingPriority; this.acceptAllContent = acceptAllContent; this.timeout = timeout; + this.keepInMemory = keepInMemory; this.resultSync = new serverSemaphore(0); this.result = null; diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 293acdc28..17d34d372 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -53,9 +53,12 @@ package de.anomic.plasma; +import java.io.BufferedInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; import java.net.InetAddress; import java.net.MalformedURLException; import java.util.Date; @@ -701,16 +704,51 @@ public final class plasmaHTCache { return null; } + /** + * @param url + * @return + * + * @deprecated dont't use this function to avoid OutOfMemory-Exceptions. + * Use {@link #getResourceContentStream(URL)} instead + */ public byte[] loadResourceContent(URL url) { // load the url as resource from the cache File f = getCachePath(url); - if (f.exists()) try { + if (f.exists() && f.canRead()) try { return serverFileUtils.read(f); } catch (IOException e) { return null; } return null; } + + /** + * Returns the content of a cached resource as {@link InputStream} + * @param url the requested resource + * @return the resource content as {@link InputStream}. 
In no data + * is available or the cached file is not readable, null + * is returned. + */ + public InputStream getResourceContentStream(URL url) { + // load the url as resource from the cache + File f = getCachePath(url); + if (f.exists() && f.canRead()) try { + return new BufferedInputStream(new FileInputStream(f)); + } catch (IOException e) { + this.log.logSevere("Unable to create a BufferedInputStream from file " + f,e); + return null; + } + return null; + } + + public long getResourceContentLength(URL url) { + // load the url as resource from the cache + File f = getCachePath(url); + if (f.exists() && f.canRead()) { + return f.length(); + } + return 0; + } public static boolean isPOST(String urlString) { return (urlString.indexOf("?") >= 0 || diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index e339bc0fe..b420b7ffc 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -45,11 +45,13 @@ package de.anomic.plasma; import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import java.net.URI; import java.util.Arrays; @@ -465,16 +467,25 @@ public final class plasmaParser { } catch (Exception e) {/* ignore this */} } - public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] source) + public plasmaParserDocument parseSource(URL location, String mimeType, String charset, byte[] sourceArray) throws InterruptedException, ParserException { - File tempFile = null; + ByteArrayInputStream byteIn = null; try { - // creating a temp file to store the byte array - tempFile = File.createTempFile("parseSource", ".tmp"); - serverFileUtils.write(source, tempFile); + if (this.theLogger.isFine()) + 
this.theLogger.logFine("Parsing '" + location + "' from byte-array"); + + // testing if the resource is not empty + if (sourceArray == null || sourceArray.length == 0) { + String errorMsg = "No resource content available."; + this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); + } + + // creating an InputStream + byteIn = new ByteArrayInputStream(sourceArray); // parsing the temp file - return parseSource(location, mimeType, charset, tempFile); + return parseSource(location, mimeType, charset, sourceArray.length, byteIn); } catch (Exception e) { // Interrupted- and Parser-Exceptions should pass through @@ -482,20 +493,65 @@ public final class plasmaParser { if (e instanceof ParserException) throw (ParserException) e; // log unexpected error - this.theLogger.logSevere("Unexpected exception in parseSource1: " + e.getMessage(), e); + this.theLogger.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); throw new ParserException("Unexpected exception while parsing " + location,location, e); } finally { - if (tempFile != null) try { tempFile.delete(); } catch (Exception ex){/* ignore this */} + if (byteIn != null) try { byteIn.close(); } catch (Exception ex){/* ignore this */} } } - public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) - throws InterruptedException, ParserException { + public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, File sourceFile) throws InterruptedException, ParserException { + + BufferedInputStream sourceStream = null; + try { + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing '" + location + "' from file"); + + // testing if the resource is not empty + if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { + String errorMsg = sourceFile.exists() 
? "Empty resource file." : "No resource content available."; + this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); + } + + // create a new InputStream + sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); + + // parsing the data + return this.parseSource(location, theMimeType, theDocumentCharset, sourceFile.length(), sourceStream); + + } catch (Exception e) { + // Interrupted- and Parser-Exceptions should pass through + if (e instanceof InterruptedException) throw (InterruptedException) e; + if (e instanceof ParserException) throw (ParserException) e; + // log unexpected error + this.theLogger.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); + throw new ParserException("Unexpected exception while parsing " + location,location, e); + } finally { + if (sourceStream != null) try { sourceStream.close(); } catch (Exception ex){/* ignore this */} + } + } + + /** + * To parse a resource from an {@link InputStream} + * @param location the URL of the resource + * @param theMimeType the resource mimetype (null if unknown) + * @param theDocumentCharset the charset of the resource (null if unknown) + * @param contentLength the content length of the resource (-1 if unknown) + * @param sourceStream an {@link InputStream} containing the resource body + * @return the parsed {@link plasmaParserDocument document} + * @throws InterruptedException + * @throws ParserException + */ + public plasmaParserDocument parseSource(URL location, String theMimeType, String theDocumentCharset, long contentLength, InputStream sourceStream) throws InterruptedException, ParserException { Parser theParser = null; String mimeType = null; try { + if (this.theLogger.isFine()) + this.theLogger.logFine("Parsing '" + location + "' from stream"); + // getting the mimetype of the document mimeType = getRealMimeType(theMimeType); @@ -513,66 
+569,9 @@ public final class plasmaParser { throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT); } - // testing if the resource is not empty - if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { - String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available."; - this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT); - } - - if (this.theLogger.isFine()) this.theLogger.logInfo("Parsing " + location + " with mimeType '" + mimeType + - "' and file extension '" + fileExt + "'."); - - /* - * There are some problematic mimeType - fileExtension combination where we have to enforce - * a mimeType detection to get the proper parser for the content - * - * - application/zip + .odt - * - text/plain + .odt - * - text/plain + .vcf - * - text/xml + .rss - * - text/xml + .atom - * - * In all these cases we can trust the fileExtension and have to determine the proper mimeType. 
- * - */ - -// // Handling of not trustable mimeTypes -// // - text/plain -// // - text/xml -// // - application/octet-stream -// // - application/zip -// if ( -// (mimeType.equalsIgnoreCase("text/plain") && !fileExt.equalsIgnoreCase("txt")) || -// (mimeType.equalsIgnoreCase("text/xml") && !fileExt.equalsIgnoreCase("txt")) -// ) { -// if (this.theLogger.isFine()) -// this.theLogger.logFine("Document " + location + " has an mimeType '" + mimeType + -// "' that seems not to be correct for file extension '" + fileExt + "'."); -// -// if (enabledParserList.containsKey("application/octet-stream")) { -// theParser = this.getParser("application/octet-stream"); -// Object newMime = theParser.getClass().getMethod("getMimeType", new Class[]{File.class}).invoke(theParser, sourceFile); -// if (newMime == null) -// if (newMime instanceof String) { -// String newMimeType = (String)newMime; -// if ((newMimeType.equals("application/octet-stream")) { -// return null; -// } -// mimeType = newMimeType; -// } -// } else { -// return null; -// } -// } else if (mimeType.equalsIgnoreCase("application/zip") && fileExt.equalsIgnoreCase("odt")){ -// if (enabledParserList.containsKey("application/vnd.oasis.opendocument.text")) { -// mimeType = "application/vnd.oasis.opendocument.text"; -// } else { -// return null; -// } -// } + "' and file extension '" + fileExt + "'."); // getting the correct parser for the given mimeType theParser = this.getParser(mimeType); @@ -580,9 +579,12 @@ public final class plasmaParser { // if a parser was found we use it ... 
plasmaParserDocument doc = null; if (theParser != null) { - doc = theParser.parse(location, mimeType,documentCharset,sourceFile); + // set the content length of the resource + theParser.setContentLength(contentLength); + // parse the resource + doc = theParser.parse(location, mimeType,documentCharset,sourceStream); } else if (realtimeParsableMimeTypesContains(mimeType)) { - doc = parseHtml(location, mimeType, documentCharset, sourceFile); + doc = parseHtml(location, mimeType, documentCharset, sourceStream); } else { String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg); @@ -611,14 +613,13 @@ public final class plasmaParser { if (theParser != null) { try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { /* ignore this */} } - } + } } - private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException, ParserException { + private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, InputStream sourceStream) throws IOException, ParserException { // ...otherwise we make a scraper and transformer - FileInputStream fileIn = new FileInputStream(sourceFile); - htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false); + htmlFilterInputStream htmlFilter = new htmlFilterInputStream(sourceStream,documentCharset,location,null,false); String charset = htmlFilter.detectCharset(); if (charset == null) { charset = documentCharset; @@ -763,7 +764,7 @@ public final class plasmaParser { //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out try { - File contentFile = null; + Object content = null; URL contentURL = 
null; String contentMimeType = "application/octet-stream"; String charSet = "UTF-8"; @@ -774,17 +775,13 @@ public final class plasmaParser { String mode = args[0]; if (mode.equalsIgnoreCase("-f")) { - contentFile = new File(args[1]); - contentURL = new URL(contentFile); + content = new File(args[1]); + contentURL = new URL((File)content); } else if (mode.equalsIgnoreCase("-u")) { contentURL = new URL(args[1]); // downloading the document content - byte[] contentBytes = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null); - - contentFile = File.createTempFile("content",".tmp"); - contentFile.deleteOnExit(); - serverFileUtils.write(contentBytes, contentFile); + content = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null); } if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) { @@ -805,7 +802,12 @@ public final class plasmaParser { plasmaParser.enableAllParsers(PARSER_MODE_PROXY); // parsing the content - plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, charSet, contentFile); + plasmaParserDocument document = null; + if (content instanceof byte[]) { + document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content); + } else if (content instanceof File) { + document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content); + } // printing out all parsed sentences if (document != null) { diff --git a/source/de/anomic/plasma/plasmaSearchImages.java b/source/de/anomic/plasma/plasmaSearchImages.java index 129302433..d6ea1bd9d 100644 --- a/source/de/anomic/plasma/plasmaSearchImages.java +++ b/source/de/anomic/plasma/plasmaSearchImages.java @@ -41,6 +41,7 @@ package de.anomic.plasma; +import java.io.InputStream; import java.net.MalformedURLException; import java.util.Iterator; import java.util.Map; @@ -59,13 +60,18 @@ public final class plasmaSearchImages { long start = System.currentTimeMillis(); this.images = new TreeSet(); if (maxTime > 10) { 
- byte[] res = sc.getResource(url, true, (int) maxTime); + Object[] resource = sc.getResource(url, true, (int) maxTime); + InputStream res = (InputStream) resource[0]; + Long resLength = (Long) resource[1]; if (res != null) { plasmaParserDocument document = null; try { - document = sc.parseDocument(url, res); + // parse the document + document = sc.parseDocument(url, resLength.longValue(), res); } catch (ParserException e) { // parsing failed + } finally { + try { res.close(); } catch (Exception e) {/* ignore this */} } if (document == null) return; diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index d2a1f6864..80a63a1a1 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -44,7 +44,9 @@ package de.anomic.plasma; +import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; @@ -187,46 +189,62 @@ public class plasmaSnippetCache { * LOADING RESOURCE DATA * =========================================================================== */ // if the snippet is not in the cache, we can try to get it from the htcache - byte[] resource = null; - IResourceInfo docInfo = null; + long resContentLength = 0; + InputStream resContent = null; + IResourceInfo resInfo = null; try { // trying to load the resource from the cache - resource = this.cacheManager.loadResourceContent(url); + resContent = this.cacheManager.getResourceContentStream(url); + if (resContent != null) { + // if the content was found + resContentLength = this.cacheManager.getResourceContentLength(url); + + // getting resource metadata + resInfo = this.cacheManager.loadResourceInfo(url); - // if not found try to download it - if ((resource == null) && (fetchOnline)) { - // download resource using the crawler - plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout); + } 
else if (fetchOnline) { + // if not found try to download it - // getting resource metadata (e.g. the http headers for http resources) - if (entry != null) docInfo = entry.getDocumentInfo(); + // download resource using the crawler and keep resource in memory if possible + plasmaHTCache.Entry entry = loadResourceFromWeb(url, timeout, true); - // read resource body (if it is there) - resource = entry.cacheArray(); + // getting resource metadata (e.g. the http headers for http resources) + if (entry != null) { + resInfo = entry.getDocumentInfo(); + + // read resource body (if it is there) + byte []resourceArray = entry.cacheArray(); + if (resourceArray != null) { + resContent = new ByteArrayInputStream(resourceArray); + resContentLength = resourceArray.length; + } else { + resContent = this.cacheManager.getResourceContentStream(url); + resContentLength = this.cacheManager.getResourceContentLength(url); + } + } - // in case that the reosurce was not in ram, read it from disk - if (resource == null) resource = this.cacheManager.loadResourceContent(url); + // if it is still not available, report an error + if (resContent == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); - // if it is still not available, throw exception - if (resource == null) return new Snippet(null, ERROR_RESOURCE_LOADING, "error loading resource, plasmaHTCache.Entry cache is NULL"); - source = SOURCE_WEB; + } else { + return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available"); } } catch (Exception e) { if (!(e instanceof plasmaCrawlerException)) e.printStackTrace(); return new Snippet(null, ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage()); - } + } - if (resource == null) return new Snippet(null, ERROR_SOURCE_LOADING, "no resource available"); - /* =========================================================================== * PARSING RESOURCE * 
=========================================================================== */ plasmaParserDocument document = null; try { - document = parseDocument(url, resource, docInfo); + document = parseDocument(url, resContentLength, resContent, resInfo); } catch (ParserException e) { return new Snippet(null, ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed + } finally { + try { resContent.close(); } catch (Exception e) {/* ignore this */} } if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed @@ -263,30 +281,40 @@ public class plasmaSnippetCache { * @return the parsed document as {@link plasmaParserDocument} */ public plasmaParserDocument retrieveDocument(URL url, boolean fetchOnline) { - byte[] resource = null; IResourceInfo docInfo = null; try { // trying to load the resource body from cache - resource = this.cacheManager.loadResourceContent(url); + InputStream content = this.cacheManager.getResourceContentStream(url); + long resourceLength = this.cacheManager.getResourceContentLength(url); // if not available try to load resource from web - if ((fetchOnline) && (resource == null)) { + if ((fetchOnline) && (content == null)) { // download resource using crawler - plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000); + plasmaHTCache.Entry entry = loadResourceFromWeb(url, 5000, true); // fetching metadata of the resource (e.g. 
http headers for http resource) - if (entry != null) docInfo = entry.getDocumentInfo(); - - // getting the resource body from the cache - resource = this.cacheManager.loadResourceContent(url); + if (entry != null) { + docInfo = entry.getDocumentInfo(); + + byte[] resourceArray = entry.cacheArray(); + if (resourceArray != null) { + // read resource body (if it is there) + content = new ByteArrayInputStream(resourceArray); + resourceLength = resourceArray.length; + } else { + // in case that the reosurce was not in ram, read it from disk + content = this.cacheManager.getResourceContentStream(url); + resourceLength = this.cacheManager.getResourceContentLength(url); + } + } } else { // trying to load resource metadata docInfo = this.cacheManager.loadResourceInfo(url); } // parsing document - if (resource == null) return null; - return parseDocument(url, resource, docInfo); + if (content == null) return null; + return parseDocument(url, resourceLength, content, docInfo); } catch (ParserException e) { this.log.logWarning("Unable to parse resource. 
" + e.getMessage()); return null; @@ -446,15 +474,24 @@ public class plasmaSnippetCache { return map; } - public plasmaParserDocument parseDocument(URL url, byte[] resource) throws ParserException { - return parseDocument(url, resource, null); + public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream) throws ParserException { + return parseDocument(url, contentLength, resourceStream, null); } - public plasmaParserDocument parseDocument(URL url, byte[] resource, IResourceInfo docInfo) throws ParserException { + /** + * Parse the resource + * @param url the URL of the resource + * @param contentLength the contentLength of the resource + * @param resourceStream the resource body as stream + * @param docInfo metadata about the resource + * @return the extracted data + * @throws ParserException + */ + public plasmaParserDocument parseDocument(URL url, long contentLength, InputStream resourceStream, IResourceInfo docInfo) throws ParserException { try { - if (resource == null) return null; + if (resourceStream == null) return null; - // if no resource metadata is available, try to load it + // STEP 1: if no resource metadata is available, try to load it from cache if (docInfo == null) { // try to get the header from the htcache directory try { @@ -464,18 +501,21 @@ public class plasmaSnippetCache { } } + // STEP 2: if the metadata is still null try to download it from web + if ((docInfo == null) && (url.getProtocol().startsWith("http"))) { // TODO: we need a better solution here - // encapsulate this in the crawlLoader class - if ((docInfo == null) && (url.getProtocol().startsWith("http"))) { - // getting URL mimeType - try { - httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); - docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header); - } catch (Exception e) { - // ingore this. http header download failed - } - } + // e.g. 
encapsulate this in the crawlLoader class + + // getting URL mimeType + try { + httpHeader header = httpc.whead(url, url.getHost(), 10000, null, null, this.sb.remoteProxyConfig); + docInfo = this.cacheManager.getResourceInfoFactory().buildResourceInfoObj(url, header); + } catch (Exception e) { + // ingore this. http header download failed + } + } + // STEP 3: if the metadata is still null try to guess the mimeType of the resource if (docInfo == null) { String filename = this.cacheManager.getCachePath(url).getName(); int p = filename.lastIndexOf('.'); @@ -495,12 +535,12 @@ public class plasmaSnippetCache { supposedMime = plasmaParser.getMimeTypeByFileExt(filename.substring(p + 1)); } - return this.parser.parseSource(url, supposedMime, null, resource); + return this.parser.parseSource(url, supposedMime, null, contentLength, resourceStream); } return null; - } + } if (plasmaParser.supportedMimeTypesContains(docInfo.getMimeType())) { - return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), resource); + return this.parser.parseSource(url, docInfo.getMimeType(), docInfo.getCharacterEncoding(), contentLength, resourceStream); } return null; } catch (InterruptedException e) { @@ -509,27 +549,57 @@ public class plasmaSnippetCache { } } - public byte[] getResource(URL url, boolean fetchOnline, int socketTimeout) { + /** + * + * @param url + * @param fetchOnline + * @param socketTimeout + * @return an Object array containing + * + * + * + *
[0]the content as {@link InputStream}
[1]the content-length as {@link Long}
+ */ + public Object[] getResource(URL url, boolean fetchOnline, int socketTimeout) { // load the url as resource from the web try { - // trying to load the resource body from cache - byte[] resource = cacheManager.loadResourceContent(url); + long contentLength = -1; - // if the content is not available in cache try to download it from web - if ((fetchOnline) && (resource == null)) { + // trying to load the resource body from cache + InputStream resource = this.cacheManager.getResourceContentStream(url); + if (resource != null) { + contentLength = this.cacheManager.getResourceContentLength(url); + } else if (fetchOnline) { + // if the content is not available in cache try to download it from web + // try to download the resource using a crawler - loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout); + plasmaHTCache.Entry entry = loadResourceFromWeb(url, (socketTimeout < 0) ? -1 : socketTimeout, true); - // get the content from cache - resource = cacheManager.loadResourceContent(url); + // read resource body (if it is there) + byte[] resourceArray = entry.cacheArray(); + + // in case that the reosurce was not in ram, read it from disk + if (resourceArray == null) { + resource = this.cacheManager.getResourceContentStream(url); + contentLength = this.cacheManager.getResourceContentLength(url); + } else { + resource = new ByteArrayInputStream(resourceArray); + contentLength = resourceArray.length; + } + } else { + return null; } - return resource; + return new Object[]{resource,new Long(contentLength)}; } catch (IOException e) { return null; } } - public plasmaHTCache.Entry loadResourceFromWeb(URL url, int socketTimeout) throws plasmaCrawlerException { + public plasmaHTCache.Entry loadResourceFromWeb( + URL url, + int socketTimeout, + boolean keepInMemory + ) throws plasmaCrawlerException { plasmaHTCache.Entry result = this.sb.cacheLoader.loadSync( url, @@ -538,7 +608,8 @@ public class plasmaSnippetCache { null, 0, null, - socketTimeout + socketTimeout, + 
keepInMemory ); return result; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 939e2e21f..0adc1cabe 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -105,6 +105,7 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.lang.reflect.Constructor; import java.net.InetAddress; import java.net.MalformedURLException; @@ -2181,17 +2182,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser URL url = entry.url(); if (url == null) return 0; + InputStream resourceContent = null; try { - // get set of words - // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); - Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes()); + // get the resource content + Object[] resource = snippetCache.getResource(url, fetchOnline, 10000); + resourceContent = (InputStream) resource[0]; + Long resourceContentLength = (Long) resource[1]; + + // parse the resource + plasmaParserDocument document = snippetCache.parseDocument(url, resourceContentLength.longValue(), resourceContent); + + // getting parsed body input stream + InputStream docBodyInputStream = document.getText(); + + // getting word iterator + Iterator witer = plasmaCondenser.getWords(docBodyInputStream); + // delete all word references int count = removeReferences(urlhash, witer); + // finally delete the url entry itself urlPool.loadedURL.remove(urlhash); return count; } catch (ParserException e) { return 0; + } finally { + if (resourceContent != null) try { resourceContent.close(); } catch (Exception e) {/* ignore this */} } } diff --git a/source/de/anomic/server/serverFileUtils.java b/source/de/anomic/server/serverFileUtils.java index 363754e96..e514a09e4 100644 --- 
a/source/de/anomic/server/serverFileUtils.java +++ b/source/de/anomic/server/serverFileUtils.java @@ -144,7 +144,7 @@ public final class serverFileUtils { public static void writeX(InputStream source, String inputCharset, Writer procOS, OutputStream bufferOS, String outputCharset) throws IOException { InputStreamReader sourceReader = new InputStreamReader(source,inputCharset); - OutputStreamWriter bufferOSWriter = new OutputStreamWriter(bufferOS,outputCharset); + OutputStreamWriter bufferOSWriter = (bufferOS==null)?null:new OutputStreamWriter(bufferOS,outputCharset); writeX(sourceReader,procOS,bufferOSWriter); }