package de.anomic.crawler.retrieval;

import java.io.IOException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import de.anomic.crawler.CrawlProfile;
import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;

public final class LoaderDispatcher {

    // minimum delay between two accesses to the same host, in milliseconds
    // (the constant is not shown in this excerpt; 250 ms is an assumed value)
    private static final long minDelay = 250;

    private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS

    private final Switchboard sb;
    private final Log log;
    private final HashSet<String> supportedProtocols;
    private final HTTPLoader httpLoader;
    private final FTPLoader ftpLoader;

    public LoaderDispatcher(final Switchboard sb) {
        this.sb = sb;
        this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http", "https", "ftp"}));

        // initiate loader objects
        this.log = new Log("LOADER");
        this.httpLoader = new HTTPLoader(sb, this.log);
        this.ftpLoader = new FTPLoader(sb, this.log);
    }

    /**
     * @return a clone of the set of supported protocols, so that callers cannot
     *         modify the dispatcher's internal state
     */
    @SuppressWarnings("unchecked")
    public HashSet<String> getSupportedProtocols() {
        return (HashSet<String>) this.supportedProtocols.clone();
    }

    /**
     * Load a resource for snippet generation. A synthetic Request is built with the
     * appropriate default snippet crawl profile and passed on to load(Request).
     *
     * @param url     the URL to load
     * @param forText true to use a text snippet profile, false for a media snippet profile
     * @param global  true to use the global profile, false to use the local one
     */
    public Response load(
            final yacyURL url,
            final boolean forText,
            final boolean global) throws IOException {
        final Request centry = new Request(
                sb.peers.mySeed().hash,
                url,
                "",
                "",
                new Date(),
                new Date(),
                (forText) ?
                    ((global) ?
                        sb.crawler.defaultTextSnippetGlobalProfile.handle() :
                        sb.crawler.defaultTextSnippetLocalProfile.handle())
                    :
                    ((global) ?
                        sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
                        sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
                0,
                0,
                0);
        return load(centry);
    }
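
    /*
     * Usage sketch (illustrative only; the caller-side names sb and url are
     * hypothetical): fetch a page for a text snippet with the local profile.
     *
     *   final LoaderDispatcher loader = new LoaderDispatcher(sb);
     *   final Response response = loader.load(url, true, false);
     *   if (response != null) {
     *       final byte[] content = response.getContent();
     *       // ... hand the content to a parser
     *   }
     */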

    public Response load(final Request request) throws IOException {

        // get the protocol of the next URL
        final String protocol = request.url().getProtocol();
        final String host = request.url().getHost();

        // check if this loads a page from localhost, which must be prevented to protect the server
        // against attacks on the administration interface when localhost access is granted
        if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url());
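
        // the crawl profile selects one of four cache strategies (constants in CrawlProfile):
        // CACHE_STRATEGY_NOCACHE   - do not read from the cache
        // CACHE_STRATEGY_IFFRESH   - use a cache hit only if it passes the proxy freshness test
        // CACHE_STRATEGY_IFEXIST   - use any existing cache hit, regardless of freshness
        // CACHE_STRATEGY_CACHEONLY - answer from the cache only, never load from the internet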

        // check if we have the page in the cache
        final CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
        int cacheStrategy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
        if (crawlProfile != null && (cacheStrategy = crawlProfile.cacheStrategy()) != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
            // we have passed a first test that caching is allowed;
            // now see if there is a cache entry
            final ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url());
            final byte[] content = (cachedResponse == null) ? null : Cache.getContent(request.url());
            if (cachedResponse != null && content != null) {
                // yes, we have the content

                // create request header values and a response object, because we need them
                // in case that we return the cached content in the next step
                final RequestHeader requestHeader = new RequestHeader();
                requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
                yacyURL refererURL = null;
                if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash());
                if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
                final Response response = new Response(
                        request,
                        requestHeader,
                        cachedResponse,
                        "200",
                        crawlProfile,
                        content);
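
                // note: the status line is set to "200" on the assumption that cache
                // entries are only written after a successful fetch (see the store
                // step at the end of this method)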

                // check which caching strategy shall be used
                if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFEXIST || cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
                    // just take the cache entry, regardless of the freshness of the content
                    log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false));
                    return response;
                }

                // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, which means we must do a proxy freshness test
                assert cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFFRESH : "cacheStrategy = " + cacheStrategy;
                if (response.isFreshForProxy()) {
                    log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false));
                    return response;
                } else {
                    log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false));
                }
            }
        }

        // check the case where we want results from the cache exclusively and never from the internet (offline mode)
        if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
            // we had our chance to get the content from the cache; we don't have it
            return null;
        }

        // the cache holds nothing, so try to load the content from the internet.
        // check the access time first: this is a double-check (we possibly checked already in the balancer)
        // to make sure that we don't DoS the target host by mistake
        if (!request.url().isLocal()) {
            final Long lastAccess = accessTime.get(host);
            long wait = 0;
            if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
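            // example: with minDelay = 250 and a host last accessed 100 ms ago,
            // wait = max(0, 250 - 100) = 150, so the thread pauses for 150 ms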
            if (wait > 0) {
                // force a sleep to keep the minimum delay towards this host
                final long untilTime = System.currentTimeMillis() + wait;
                try { Thread.sleep(untilTime - System.currentTimeMillis()); } catch (final InterruptedException ee) {}
            }
        }

        // now it is certain that we will access the target; remember the access time
        accessTime.put(host, System.currentTimeMillis());

        // load the resource from the internet
        Response response = null;
        if (protocol.equals("http") || protocol.equals("https")) response = httpLoader.load(request);
        if (protocol.equals("ftp")) response = ftpLoader.load(request);
        if (response != null) {
            // we got something; now check if we want to store that to the cache
            final String storeError = response.shallStoreCache();
            if (storeError == null) {
                Cache.store(request.url(), response.getResponseHeader(), response.getContent());
            } else {
                if (Cache.log.isFine()) Cache.log.logFine("no storage of url " + request.url() + ": " + storeError);
            }
            return response;
        }

        throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url());
}
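
    /**
     * Remove stale entries from the access time table: an entry is dropped if its
     * last access lies more than minDelay in the past. The timeout parameter is an
     * absolute deadline (epoch milliseconds) after which the cleanup stops.
     */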
    public synchronized void cleanupAccessTimeTable(final long timeout) {
        final Iterator<Map.Entry<String, Long>> i = accessTime.entrySet().iterator();
        Map.Entry<String, Long> e;
        while (i.hasNext()) {
            e = i.next();
            if (System.currentTimeMillis() > timeout) break;
            if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
}
}
}