From 7bcfa033c93da424d5f535bc1a62bdb1a0990602 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 21 Jun 2010 14:54:54 +0000
Subject: [PATCH] more abstraction of the htcache when using the LoaderDispatcher

The cache shall no longer be accessed directly; all loading attempts shall go
through the LoaderDispatcher. To control the usage of the cache, an enum
instance of CrawlProfile.CacheStrategy shall be used. Some direct loading
methods that did not take a cache strategy have been removed.

This also affects the verify option of the yacysearch servlet. After this
commit, 'verify=false' does not necessarily mean that no snippets are
generated. Instead, all snippets that can be generated from the cache alone
are presented. Such a search hit still counts as unverified, because the
snippet was generated from the cache. If a cache-based snippet cannot be
generated, verify=false causes the link not to be rejected.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6936 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/Bookmarks.java                         |  3 +-
 htroot/IndexControlRWIs_p.java                |  5 +-
 htroot/IndexControlURLs_p.java                |  3 +-
 htroot/RSSLoader_p.java                       |  3 +-
 htroot/ViewFile.java                          | 84 ++++---------------
 htroot/ViewImage.java                         |  3 +-
 htroot/yacy/search.java                       |  5 +-
 htroot/yacysearch.java                        | 27 +++---
 source/de/anomic/crawler/CrawlProfile.java    | 16 ++++
 source/de/anomic/crawler/CrawlQueues.java     |  3 +-
 source/de/anomic/search/MediaSnippet.java     |  5 +-
 source/de/anomic/search/QueryParams.java      | 11 +--
 source/de/anomic/search/ResultFetcher.java    | 29 +++----
 source/de/anomic/search/Segment.java          | 25 ++++--
 source/de/anomic/search/TextSnippet.java      | 20 +++--
 source/de/anomic/server/serverCore.java       |  2 +-
 .../net/yacy/document/parser/pdfParser.java   | 10 ++-
 .../net/yacy/repository/LoaderDispatcher.java | 76 +++++------
 18 files changed, 142 insertions(+), 188 deletions(-)

diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index b99969e37..f59b54b34 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -43,6 +43,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.repository.LoaderDispatcher;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.BookmarkHelper;
 import de.anomic.data.bookmarksDB;
 import de.anomic.data.listManager;
@@ -188,7 +189,7 @@ public class Bookmarks {
             Document document = null;
             if (urlentry != null) {
                 final URIMetadataRow.Components metadata = urlentry.metadata();
-                document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
+                document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
                 prop.put("mode_edit", "0"); // create mode
                 prop.put("mode_url", metadata.url().toNormalform(false, true));
                 prop.putHTML("mode_title", metadata.dc_title());
diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java
index 8ad3555d3..b5763daa2 100644
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@@ -52,6 +52,7 @@ import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.Blacklist;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.listManager;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.QueryParams;
@@ -162,7 +163,7 @@ public class IndexControlRWIs_p {
                 index = null;
             }
             if (delurlref) {
-                segment.removeAllUrlReferences(urlb, 
sb.loader, true); + segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST); } // delete the word first because that is much faster than the deletion of the urls from the url database segment.termIndex().delete(keyhash); @@ -179,7 +180,7 @@ public class IndexControlRWIs_p { // delete selected URLs if (post.containsKey("keyhashdelete")) try { if (delurlref) { - segment.removeAllUrlReferences(urlb, sb.loader, true); + segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST); } if (delurl || delurlref) { for (byte[] b: urlb) sb.urlRemove(segment, b); diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index f8d8f3a89..1bed53899 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -38,6 +38,7 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.RotateIterator; import net.yacy.kelondro.util.DateFormatter; +import de.anomic.crawler.CrawlProfile; import de.anomic.http.server.RequestHeader; import de.anomic.search.MetadataRepository; import de.anomic.search.Segment; @@ -140,7 +141,7 @@ public class IndexControlURLs_p { prop.put("result", " "); if (post.containsKey("urlhashdeleteall")) { - i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, true); + i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, CrawlProfile.CacheStrategy.IFEXIST); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("lurlexport", 0); prop.put("reload", 0); diff --git a/htroot/RSSLoader_p.java b/htroot/RSSLoader_p.java index 159300f69..295715492 100644 --- a/htroot/RSSLoader_p.java +++ b/htroot/RSSLoader_p.java @@ -33,6 +33,7 @@ import net.yacy.document.ParserException; import net.yacy.document.parser.rssParser; import net.yacy.kelondro.data.meta.DigestURI; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; @@ -63,7 +64,7 @@ public class RSSLoader_p { // if the resource body was not cached we try to load it from web Response entry = null; try { - entry = sb.loader.load(url, true, false, Long.MAX_VALUE); + entry = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); } catch (final Exception e) { return prop; } diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index cb3bd045e..a2fba71d1 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -43,9 +43,9 @@ import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.logging.Log; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Cache; import de.anomic.http.server.RequestHeader; @@ -150,7 +150,7 @@ public class ViewFile { } // define an url by post parameter - url = new DigestURI(urlString, null); + url = new DigestURI(MultiProtocolURI.unescape(urlString), null); urlHash = new String(url.hash()); pre = post.get("pre", "false").equals("true"); } catch (final MalformedURLException e) {} @@ -168,87 +168,35 @@ public class ViewFile { // loading the resource content as byte array prop.put("error_incache", Cache.has(url) ? 
1 : 0);
 
-        ResponseHeader responseHeader = null;
         String resMime = null;
+        ResponseHeader responseHeader = Cache.getResponseHeader(url);
         byte[] resource = Cache.getContent(url);
-        if (resource == null && authorized) {
+        if ((resource == null || responseHeader == null) && authorized) {
             // load resource from net
             Response response = null;
             try {
-                response = sb.loader.load(url, true, false, Long.MAX_VALUE);
+                response = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
             } catch (IOException e) {
-                Log.logException(e);
-            }
-            if (response != null) {
-                resource = response.getContent();
-                responseHeader = response.getResponseHeader();
-            }
-        }
-
-        if (responseHeader == null) responseHeader = Cache.getResponseHeader(url);
-
-        // if the resource body was not cached we try to load it from web
-        if (resource == null) {
-            Response entry = null;
-            try {
-                entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
-            } catch (final Exception e) {
                 prop.put("error", "4");
                 prop.putHTML("error_errorText", e.getMessage());
                 prop.put("viewMode", VIEW_MODE_NO_TEXT);
                 return prop;
             }
-
-            if (entry != null) {
-                resource = entry.getContent();
-            }
-
-            if (resource == null) {
-                prop.put("error", "4");
-                prop.put("error_errorText", "No resource available");
-                prop.put("viewMode", VIEW_MODE_NO_TEXT);
-                return prop;
+            if (response != null) {
+                resource = response.getContent();
+                responseHeader = response.getResponseHeader();
             }
         }
-
-        // try to load resource metadata
-        if (responseHeader == null) {
-
-            // try to load the metadata from cache
-            try {
-                responseHeader = Cache.getResponseHeader(url);
-            } catch (final Exception e) {
-                /* ignore this */
-            }
-
-            // if the metadata was not cached try to load it from web
-            if (responseHeader == null) {
-                final String protocol = url.getProtocol();
-                if (!((protocol.equals("http") || protocol.equals("https")))) {
-                    prop.put("error", "6");
-                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
-                    return prop;
-                }
-
-                try {
-                    Response response = sb.loader.load(url, true, false, Long.MAX_VALUE);
-                    responseHeader = response.getResponseHeader();
-                    resource = response.getContent();
-                } catch (IOException e) {
-                    Log.logException(e);
-                }
-                if (responseHeader == null) {
-                    prop.put("error", "4");
-                    prop.put("error_errorText", "Unable to load resource metadata.");
-                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
-                    return prop;
-                }
-                resMime = responseHeader.mime();
-            }
-        } else {
-            resMime = responseHeader.mime();
+
+        // if resource not available just fail
+        if (resource == null || responseHeader == null) {
+            prop.put("error", "4");
+            prop.put("error_errorText", "No resource available");
+            prop.put("viewMode", VIEW_MODE_NO_TEXT);
+            return prop;
         }
+        resMime = responseHeader.mime();
 
         final String[] wordArray = wordArray(post.get("words", null));
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index dbf42e51d..92e62c2b3 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -35,6 +35,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.http.server.HeaderFramework;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.Switchboard;
@@ -90,7 +91,7 @@ public class ViewImage {
         if (scaled == null) {
             byte[] resourceb = null;
             if (url != null) try {
-                resourceb = sb.loader.getResource(url, true, timeout, false, true);
+                resourceb = sb.loader.getResource(url, CrawlProfile.CacheStrategy.IFEXIST, timeout, false, 
true); } catch (IOException e) { Log.logWarning("ViewImage", "cannot load: " + e.getMessage()); } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 0167692ba..54667a061 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -47,6 +47,7 @@ import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.SortStack; import net.yacy.kelondro.util.ISO639; +import de.anomic.crawler.CrawlProfile; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.net.natLib; @@ -206,7 +207,7 @@ public final class search { ContentDomain.contentdomParser(contentdom), language, "", // no navigation - false, + CrawlProfile.CacheStrategy.CACHEONLY, count, 0, filter, @@ -259,7 +260,7 @@ public final class search { ContentDomain.contentdomParser(contentdom), language, "", // no navigation - false, + CrawlProfile.CacheStrategy.CACHEONLY, count, 0, filter, diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 003f0f486..a6b87c30c 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -51,6 +51,7 @@ import net.yacy.kelondro.util.SetTools; import net.yacy.kelondro.util.ISO639; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.data.DidYouMean; import de.anomic.data.LibraryProvider; import de.anomic.http.server.HeaderFramework; @@ -67,7 +68,6 @@ import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import de.anomic.yacy.yacyNewsDB; import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.graphics.ProfilingGraph; @@ -97,7 +97,8 @@ public class yacysearch { // get query String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); String querystring = originalquerystring.replace('+', ' '); - boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true")); + CrawlProfile.CacheStrategy snippetFetchStrategy = (post != null && post.get("verify", "false").equals("true")) ? CrawlProfile.CacheStrategy.IFFRESH : CrawlProfile.CacheStrategy.parse(post.get("verify", "cacheonly")); + if (snippetFetchStrategy == null) snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; final serverObjects prop = new serverObjects(); // get segment @@ -164,7 +165,7 @@ public class yacysearch { // collect search attributes boolean newsearch = post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former","")); //new search term - int itemsPerPage = Math.min((authenticated) ? (fetchSnippets ? 100 : 1000) : (fetchSnippets ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative + int itemsPerPage = Math.min((authenticated) ? (snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 1000) : (snippetFetchStrategy.isAllowedToFetchOnline() ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative int offset = (newsearch) ? 
0 : post.getInt("startRecord", post.getInt("offset", 0)); boolean global = post.get("resource", "local").equals("global"); @@ -228,12 +229,12 @@ public class yacysearch { boolean block = false; if (Domains.matchesList(client, sb.networkBlacklist)) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; block = true; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search"); } else if (Domains.matchesList(client, sb.networkWhitelist)) { Log.logInfo("LOCAL_SEARCH", "ACCECC CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions"); - } else if (global || fetchSnippets) { + } else if (global || snippetFetchStrategy.isAllowedToFetchOnline()) { // in case that we do a global search or we want to fetch snippets, we check for DoS cases synchronized (trackerHandles) { int accInOneSecond = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size(); @@ -242,21 +243,21 @@ public class yacysearch { int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size(); if (accInTenMinutes > 600) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; block = true; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInTenMinutes + " searches in ten minutes, fully blocked (no results generated)"); } else if (accInOneMinute > 200) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; block = true; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneMinute + " searches in one minute, fully blocked (no results generated)"); } else if (accInThreeSeconds > 1) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + " searches in three seconds, blocked global search and snippets"); } else if (accInOneSecond > 2) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + " searches in one second, blocked global search and snippets"); } } @@ -428,7 +429,7 @@ public class yacysearch { if (urlentry != null) { final URIMetadataRow.Components metadata = urlentry.metadata(); Document document; - document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE); + document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE); if (document != null) { // create a news message final HashMap map = new HashMap(); @@ -460,7 +461,7 @@ public class yacysearch { contentdom, language, navigation, - fetchSnippets, + snippetFetchStrategy, itemsPerPage, offset, urlmask, @@ -538,7 +539,7 @@ public class yacysearch { "&maximumRecords="+ theQuery.displayResults() + "&startRecord=" + (0 * theQuery.displayResults()) + "&resource=" + ((theQuery.isLocal()) ? "local" : "global") + - "&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") + + "&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? 
"false" : "true") + "&nav=" + theQuery.navigators + "&urlmaskfilter=" + originalUrlMask.toString() + "&prefermaskfilter=" + theQuery.prefer.toString() + @@ -684,7 +685,7 @@ public class yacysearch { prop.putHTML("prefermaskfilter", prefermask); prop.put("indexof", (indexof) ? "on" : "off"); prop.put("constraint", (constraint == null) ? "" : constraint.exportB64()); - prop.put("verify", (fetchSnippets) ? "true" : "false"); + prop.put("verify", snippetFetchStrategy.toName()); prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text"))); prop.put("searchdomswitches", sb.getConfigBool("search.text", true) || sb.getConfigBool("search.audio", true) || sb.getConfigBool("search.video", true) || sb.getConfigBool("search.image", true) || sb.getConfigBool("search.app", true) ? 1 : 0); prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 99cb89562..6d3b84174 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -259,6 +259,22 @@ public class CrawlProfile { for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy; return NOCACHE; } + public static CacheStrategy parse(String name) { + if (name.equals("nocache")) return NOCACHE; + if (name.equals("iffresh")) return IFFRESH; + if (name.equals("ifexist")) return IFEXIST; + if (name.equals("cacheonly")) return CACHEONLY; + return null; + } + public String toName() { + return this.name().toLowerCase(); + } + public boolean isAllowedToFetchOnline() { + return this.code < 3; + } + public boolean mustBeOffline() { + return this.code == 3; + } } public static class entry { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 827db3285..172454036 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -563,7 +563,8 @@ public class CrawlQueues { try { request.setStatus("loading", WorkflowJob.STATUS_RUNNING); final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); - Response response = sb.loader.load(request, maxFileSize); + CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); + Response response = sb.loader.load(request, e == null ? 
CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize); if (response == null) { request.setStatus("error", WorkflowJob.STATUS_FINISHED); if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)"); diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index ada9138cc..0cbd5ddcd 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -30,6 +30,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import de.anomic.crawler.CrawlProfile; import de.anomic.data.MimeTable; import net.yacy.cora.document.MultiProtocolURI; @@ -111,13 +112,13 @@ public class MediaSnippet implements Comparable, Comparator retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final boolean fetchOnline, final int timeout, final boolean reindexing) { + public static ArrayList retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean reindexing) { if (queryhashes.isEmpty()) { Log.logFine("snippet fetch", "no query hashes given for url " + url); return new ArrayList(); } - final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing, Long.MAX_VALUE); + final Document document = LoaderDispatcher.retrieveDocument(url, cacheStrategy, timeout, false, reindexing, Long.MAX_VALUE); final ArrayList a = new ArrayList(); if (document != null) { if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO)); diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 1c1826ff3..4ab21a73e 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -44,6 +44,7 @@ import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.SetTools; +import de.anomic.crawler.CrawlProfile; import de.anomic.yacy.yacySeed; public final class QueryParams { @@ -79,7 +80,7 @@ public final class QueryParams { public final int maxDistance; public final Bitfield constraint; public final boolean allofconstraint; - public final boolean onlineSnippetFetch; + public final CrawlProfile.CacheStrategy snippetCacheStrategy; public final RankingProfile ranking; private final Segment indexSegment; public final String host; // this is the client host that starts the query, not a site operator @@ -130,7 +131,7 @@ public final class QueryParams { this.domMaxTargets = 0; this.constraint = constraint; this.allofconstraint = false; - this.onlineSnippetFetch = false; + this.snippetCacheStrategy = CrawlProfile.CacheStrategy.CACHEONLY; this.host = null; this.sitehash = null; this.authorhash = null; @@ -149,7 +150,7 @@ public final class QueryParams { final int maxDistance, final String prefer, final ContentDomain contentdom, final String language, final String navigators, - final boolean onlineSnippetFetch, + final CrawlProfile.CacheStrategy snippetCacheStrategy, final int itemsPerPage, final int offset, final String urlMask, final int domType, final int domMaxTargets, final Bitfield constraint, final boolean allofconstraint, @@ -184,7 +185,7 @@ public final class QueryParams { this.allofconstraint = allofconstraint; this.sitehash = site; 
assert site == null || site.length() == 6; this.authorhash = authorhash; assert authorhash == null || authorhash.length() > 0; - this.onlineSnippetFetch = onlineSnippetFetch; + this.snippetCacheStrategy = snippetCacheStrategy; this.host = host; this.remotepeer = null; this.handle = Long.valueOf(System.currentTimeMillis()); @@ -375,7 +376,7 @@ public final class QueryParams { "&maximumRecords="+ theQuery.displayResults() + "&startRecord=" + (page * theQuery.displayResults()) + "&resource=" + ((theQuery.isLocal()) ? "local" : "global") + - "&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") + + "&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true") + "&nav=" + nav + "&urlmaskfilter=" + originalUrlMask + "&prefermaskfilter=" + theQuery.prefer + diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 228693432..622bc9aeb 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -42,6 +42,7 @@ import net.yacy.kelondro.util.SortStack; import net.yacy.kelondro.util.SortStore; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.search.MediaSnippet; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.graphics.ProfilingGraph; @@ -105,9 +106,9 @@ public class ResultFetcher { public void deployWorker(int deployCount, int neededResults) { if (anyWorkerAlive()) return; - this.workerThreads = new Worker[(query.onlineSnippetFetch) ? deployCount : 1]; + this.workerThreads = new Worker[(query.snippetCacheStrategy.isAllowedToFetchOnline()) ? deployCount : 1]; for (int i = 0; i < workerThreads.length; i++) { - this.workerThreads[i] = new Worker(i, 10000, (query.onlineSnippetFetch) ? 
2 : 0, neededResults); + this.workerThreads[i] = new Worker(i, 10000, query.snippetCacheStrategy, neededResults); this.workerThreads[i].start(); } } @@ -135,12 +136,12 @@ public class ResultFetcher { private final long timeout; // the date until this thread should try to work private long lastLifeSign; // when the last time the run()-loop was executed private final int id; - private final int snippetMode; + private final CrawlProfile.CacheStrategy cacheStrategy; private final int neededResults; - public Worker(final int id, final long maxlifetime, int snippetMode, int neededResults) { + public Worker(final int id, final long maxlifetime, CrawlProfile.CacheStrategy cacheStrategy, int neededResults) { this.id = id; - this.snippetMode = snippetMode; + this.cacheStrategy = cacheStrategy; this.lastLifeSign = System.currentTimeMillis(); this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.neededResults = neededResults; @@ -166,7 +167,7 @@ public class ResultFetcher { if (page == null) break; if (failedURLs.has(page.hash())) continue; - final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0 + final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0 if (resultEntry == null) continue; // the entry had some problems, cannot be used if (result.exists(resultEntry)) continue; @@ -195,7 +196,7 @@ public class ResultFetcher { } } - protected ResultEntry fetchSnippet(final URIMetadataRow page, final int snippetMode) { + protected ResultEntry fetchSnippet(final URIMetadataRow page, CrawlProfile.CacheStrategy cacheStrategy) { // Snippet Fetching can has 3 modes: // 0 - do not fetch snippets // 1 - fetch snippets offline only @@ -209,7 +210,7 @@ public class ResultFetcher { if (metadata == null) return null; final long dbRetrievalTime = System.currentTimeMillis() - startTime; - if (snippetMode == 0) { + if (cacheStrategy == null) { return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet } @@ -221,10 +222,10 @@ public class ResultFetcher { this.loader, metadata, snippetFetchWordHashes, - (snippetMode == 2), + cacheStrategy, ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, - (snippetMode == 2) ? Integer.MAX_VALUE : 30000, + Integer.MAX_VALUE, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? 
"snippet found" : ("no snippet found (" + snippet.getError() + ")"))); @@ -232,26 +233,26 @@ public class ResultFetcher { if (snippet.getErrorCode() < 11) { // we loaded the file and found the snippet return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached - } else if (snippetMode == 1) { + } else if (cacheStrategy.mustBeOffline()) { // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result // this may happen during a remote search, because snippet loading is omitted to retrieve results faster return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet } else { // problems with snippet fetch - registerFailure(page.hash(), "no text snippet for URL " + metadata.url()); + registerFailure(page.hash(), "no text snippet for URL " + metadata.url() + "; errorCode = " + snippet.getErrorCode()); return null; } } else { // attach media information startTime = System.currentTimeMillis(); - final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal()); + final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, cacheStrategy, 6000, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime); if (mediaSnippets != null && !mediaSnippets.isEmpty()) { // found media snippets, return entry return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime); - } else if (snippetMode == 1) { + } else if (cacheStrategy.mustBeOffline()) { return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); } else { // problems with snippet fetch diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 340395716..ac3e78145 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -62,6 +62,7 @@ import net.yacy.kelondro.util.ISO639; import net.yacy.repository.Blacklist; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; public class Segment { @@ -360,18 +361,24 @@ public class Segment { // method for index deletion - public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) { - return removeAllUrlReferences(url.hash(), loader, fetchOnline); + public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) { + return removeAllUrlReferences(url.hash(), loader, cacheStrategy); } - public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final boolean fetchOnline) { - for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, fetchOnline); + public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) { + for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy); } - public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final boolean fetchOnline) { - // find all the words in a 
specific resource and remove the url reference from every word index - // finally, delete the url entry - + /** + * find all the words in a specific resource and remove the url reference from every word index + * finally, delete the url entry + * @param urlhash the hash of the url that shall be removed + * @param loader + * @param cacheStrategy + * @return number of removed words + */ + public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) { + if (urlhash == null) return 0; // determine the url string final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0); @@ -384,7 +391,7 @@ public class Segment { // get the resource content byte[] resourceb = null; try { - resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); + resourceb = loader.getResource(metadata.url(), cacheStrategy, 10000, true, false); } catch (IOException e) { Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); } diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index f1f70ac15..091ba4b52 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -48,6 +48,7 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.ByteArray; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Cache; import de.anomic.http.server.ResponseHeader; @@ -308,7 +309,7 @@ public class TextSnippet implements Comparable, Comparator, Comparator, Comparator) this.supportedProtocols.clone(); } - public Response load( - final DigestURI url, - final boolean forText, - final boolean global, - final long maxFileSize) throws IOException { - return load(request(url, forText, global), maxFileSize); - } - /** * load a resource from the web, from ftp, from smb or a file * @param url - * @param forText - * @param global + * @param forText shows that this was a for-text crawling request + * @param global shows that this was a global crawling request * @param cacheStratgy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY * @return the loaded entity in a Response object * @throws IOException @@ -169,13 +161,6 @@ public final class LoaderDispatcher { 0); } - public Response load(final Request request, long maxFileSize) throws IOException { - CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); - CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST; - if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy(); - return load(request, cacheStrategy, maxFileSize); - } - public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { // get the protocol of the next URL final String protocol = request.url().getProtocol(); @@ -295,15 +280,10 @@ public final class LoaderDispatcher { * @return the content as {@link byte[]} * @throws IOException */ - public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { - byte[] resource = Cache.getContent(url); - if (resource != null) return resource; - - if (!fetchOnline) return null; - + public byte[] getResource(final DigestURI url, CrawlProfile.CacheStrategy cacheStrategy, 
final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { // try to download the resource using the loader final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); - final Response entry = load(url, forText, reindexing, maxFileSize); + final Response entry = load(url, forText, reindexing, cacheStrategy, maxFileSize); if (entry == null) return null; // not found in web // read resource body (if it is there) @@ -322,45 +302,27 @@ public final class LoaderDispatcher { * @param global the domain of the search. If global == true then the content is re-indexed * @return the parsed document as {@link Document} */ - public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global, long maxFileSize) { + public static Document retrieveDocument(final DigestURI url, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean forText, final boolean global, long maxFileSize) { // load resource byte[] resContent = null; ResponseHeader responseHeader = null; try { - // trying to load the resource from the cache - resContent = Cache.getContent(url); - responseHeader = Cache.getResponseHeader(url); - if (resContent != null) { - // if the content was found - } else if (fetchOnline) { - // if not found try to download it - - // download resource using the crawler and keep resource in memory if possible - final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, maxFileSize); - - // getting resource metadata (e.g. the http headers for http resources) - if (entry != null) { + final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, cacheStrategy, maxFileSize); + if (entry == null) { + Log.logFine("snippet fetch", "no Response for url " + url); + return null; + } - // read resource body (if it is there) - final byte[] resourceArray = entry.getContent(); - if (resourceArray != null) { - resContent = resourceArray; - } else { - resContent = Cache.getContent(url); - } - - // read a fresh header - responseHeader = entry.getResponseHeader(); - } - - // if it is still not available, report an error - if (resContent == null) { - Log.logFine("snippet fetch", "plasmaHTCache.Entry cache is NULL for url " + url); - return null; - } - } else { - Log.logFine("snippet fetch", "no resource available for url " + url); + // read resource body (if it is there) + resContent = entry.getContent(); + + // read a fresh header + responseHeader = entry.getResponseHeader(); + + // if it is still not available, report an error + if (resContent == null || responseHeader == null) { + Log.logFine("snippet fetch", "no Content available for url " + url); return null; } } catch (final Exception e) {
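
Note: the following standalone sketch only illustrates the CacheStrategy semantics
described in the commit message above; it is not part of the patch. The enum
condenses what this patch adds to CrawlProfile.CacheStrategy (codes 0-3, parse(),
toName(), isAllowedToFetchOnline(), mustBeOffline()). The chooseStrategy() helper
and the main() driver are hypothetical and merely mirror how yacysearch.java maps
the verify request parameter to a snippet fetch strategy.

    // Standalone sketch of the CacheStrategy semantics introduced by this patch.
    // The enum mirrors CrawlProfile.CacheStrategy; chooseStrategy() is a hypothetical
    // condensation of the verify-parameter handling in yacysearch.java.
    public class CacheStrategyDemo {

        enum CacheStrategy {
            NOCACHE(0),    // never use the cache, load from the net
            IFFRESH(1),    // use a cache entry only if it is not stale
            IFEXIST(2),    // use a cache entry if one exists, otherwise load online
            CACHEONLY(3);  // never load online, use only cache entries

            public final int code;
            CacheStrategy(final int code) { this.code = code; }

            public static CacheStrategy parse(final String name) {
                if ("nocache".equals(name))   return NOCACHE;
                if ("iffresh".equals(name))   return IFFRESH;
                if ("ifexist".equals(name))   return IFEXIST;
                if ("cacheonly".equals(name)) return CACHEONLY;
                return null;
            }

            public String toName() { return name().toLowerCase(); }
            public boolean isAllowedToFetchOnline() { return code < 3; }
            public boolean mustBeOffline() { return code == 3; }
        }

        /** Hypothetical condensation of the verify handling in yacysearch.java:
         *  verify=true maps to IFFRESH (online snippet fetch allowed); any other
         *  value is parsed as a strategy name and falls back to CACHEONLY, which
         *  is why verify=false now yields cache-only, unverified snippets. */
        static CacheStrategy chooseStrategy(final String verify) {
            final CacheStrategy s = "true".equals(verify)
                    ? CacheStrategy.IFFRESH
                    : CacheStrategy.parse(verify == null ? "cacheonly" : verify);
            return s == null ? CacheStrategy.CACHEONLY : s;
        }

        public static void main(final String[] args) {
            for (final String verify : new String[] {"true", "false", "ifexist", "cacheonly"}) {
                final CacheStrategy s = chooseStrategy(verify);
                System.out.println("verify=" + verify
                        + " -> " + s.toName()
                        + ", fetch online: " + s.isAllowedToFetchOnline()
                        + ", cache only: " + s.mustBeOffline());
            }
        }
    }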