From b1a8d0c020d958895a930fd340adf8c1b391dc90 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 22 Mar 2011 10:35:26 +0000
Subject: [PATCH] enhancements to web cache and less strict caching rules

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7620 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.init                                   | 2 +-
 htroot/CacheResource_p.java                          | 4 ++--
 htroot/api/util/getpageinfo_p.java                   | 2 +-
 source/de/anomic/crawler/CrawlProfile.java           | 6 +++---
 source/de/anomic/crawler/CrawlSwitchboard.java       | 2 +-
 source/de/anomic/http/client/Cache.java              | 9 ++++-----
 source/de/anomic/http/server/HTTPDProxyHandler.java  | 8 ++++----
 source/de/anomic/search/Switchboard.java             | 2 +-
 source/de/anomic/yacy/graphics/OSMTile.java          | 2 +-
 source/net/yacy/kelondro/blob/HeapReader.java        | 1 +
 source/net/yacy/repository/LoaderDispatcher.java     | 8 ++++++--
 11 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index 48e3b626e..1f054a946 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -740,7 +740,7 @@ search.navigation=hosts,authors,namespace,topics
 # consider content nevertheless as available and show result without snippet
 # false: no link verification and not snippet generation: all search results are valid without verification
-search.verify = iffresh
+search.verify = ifexist
 # in case that a link verification fails then the corresponding index reference can be
 # deleted to clean up the index. If this property is set then failed index verification in
diff --git a/htroot/CacheResource_p.java b/htroot/CacheResource_p.java
index 3d29ab834..1007cd4e9 100644
--- a/htroot/CacheResource_p.java
+++ b/htroot/CacheResource_p.java
@@ -54,7 +54,7 @@ public class CacheResource_p {
         }
         byte[] resource = null;
-        resource = Cache.getContent(url);
+        resource = Cache.getContent(url.hash());
         if (resource == null) return prop;
         // check request type
@@ -63,7 +63,7 @@
             return ImageParser.parse(u, resource);
         } else {
             // get response header and set mime type
-            ResponseHeader responseHeader = Cache.getResponseHeader(url);
+            ResponseHeader responseHeader = Cache.getResponseHeader(url.hash());
             String resMime = responseHeader == null ? null : responseHeader.mime();
             if (resMime != null) {
                 final ResponseHeader outgoingHeader = new ResponseHeader();
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index 137066838..026769b9c 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -56,7 +56,7 @@ public class getpageinfo_p {
         }
         ContentScraper scraper = null;
         if (u != null) try {
-            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
+            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFEXIST);
         } catch (final IOException e) {
             // now thats a fail, do nothing
         }
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 19c2b519b..5deb25560 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -159,12 +159,12 @@ public class CrawlProfile extends ConcurrentHashMap implements M
     }
     public CacheStrategy cacheStrategy() {
         final String r = get(CACHE_STRAGEGY);
-        if (r == null) return CacheStrategy.IFFRESH;
+        if (r == null) return CacheStrategy.IFEXIST;
         try {
             return CacheStrategy.decode(Integer.parseInt(r));
         } catch (final NumberFormatException e) {
             Log.logException(e);
-            return CacheStrategy.IFFRESH;
+            return CacheStrategy.IFEXIST;
         }
     }
     public void setCacheStrategy(CacheStrategy newStrategy) {
@@ -260,7 +260,7 @@
         if (name.equals("iffresh")) return IFFRESH;
         if (name.equals("ifexist")) return IFEXIST;
         if (name.equals("cacheonly")) return CACHEONLY;
-        if (name.equals("true")) return IFFRESH;
+        if (name.equals("true")) return IFEXIST;
         if (name.equals("false")) return null; // if this cache strategy is assigned as query attribute, null means "do not create a snippet"
         return null;
     }
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index 9090d1416..00f49e66d 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -225,7 +225,7 @@ public final class CrawlSwitchboard {
         if (this.defaultTextSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
             this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
         }
         if (this.defaultTextSnippetGlobalProfile == null) {
diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java
index 4f1f887e4..ddcf81db8 100644
--- a/source/de/anomic/http/client/Cache.java
+++ b/source/de/anomic/http/client/Cache.java
@@ -202,11 +202,11 @@
      * @throws UnsupportedProtocolException if the protocol is not supported and therefore the
      *         info object couldn't be created
      */
-    public static ResponseHeader getResponseHeader(final DigestURI url) {
+    public static ResponseHeader getResponseHeader(final byte[] hash) {
         // loading data from database
         Map hdb;
-        hdb = responseHeaderDB.get(url.hash());
+        hdb = responseHeaderDB.get(hash);
         if (hdb == null) return null;
         return new ResponseHeader(null, hdb);
@@ -221,12 +221,11 @@
      *         is returned.
      * @throws IOException
      */
-    public static byte[] getContent(final DigestURI url) {
+    public static byte[] getContent(final byte[] hash) {
         // load the url as resource from the cache
         try {
-            byte[] b = fileDB.get(url.hash());
+            byte[] b = fileDB.get(hash);
             if (b == null) return null;
-            log.logInfo("cache hit for url " + url.toString() + ", " + b.length + " bytes");
             return b;
         } catch (UnsupportedEncodingException e) {
             Log.logException(e);
diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java
index 96e1804f2..04975cc17 100644
--- a/source/de/anomic/http/server/HTTPDProxyHandler.java
+++ b/source/de/anomic/http/server/HTTPDProxyHandler.java
@@ -363,7 +363,7 @@
             // handle outgoing cookies
             handleOutgoingCookies(requestHeader, host, ip);
             prepareRequestHeader(conProp, requestHeader, hostlow);
-            ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url);
+            ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
             // why are files unzipped upon arrival? why not zip all files in cache?
             // This follows from the following premises
@@ -409,7 +409,7 @@
                     "200 OK",
                     sb.crawler.defaultProxyProfile
             );
-            byte[] cacheContent = Cache.getContent(url);
+            byte[] cacheContent = Cache.getContent(url.hash());
             if (cacheContent != null && response.isFreshForProxy()) {
                 if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache");
                 fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
@@ -500,9 +500,9 @@
                 long sizeBeforeDelete = -1;
                 if (cachedResponseHeader != null) {
                     // delete the cache
-                    ResponseHeader rh = Cache.getResponseHeader(url);
+                    ResponseHeader rh = Cache.getResponseHeader(url.hash());
                     if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) {
-                        byte[] b = Cache.getContent(url);
+                        byte[] b = Cache.getContent(url.hash());
                         if (b != null) sizeBeforeDelete = b.length;
                     }
                     Cache.delete(url);
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 467a46d97..54ae7a7da 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1755,7 +1755,7 @@ public final class Switchboard extends serverSwitch {
             byte[] b = response.getContent();
             if (b == null) {
                 // fetch the document from cache
-                b = Cache.getContent(response.url());
+                b = Cache.getContent(response.url().hash());
                 if (b == null) {
                     this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
                     addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing in cache");
diff --git a/source/de/anomic/yacy/graphics/OSMTile.java b/source/de/anomic/yacy/graphics/OSMTile.java
index 5568736da..ee054f82a 100644
--- a/source/de/anomic/yacy/graphics/OSMTile.java
+++ b/source/de/anomic/yacy/graphics/OSMTile.java
@@ -107,7 +107,7 @@ public class OSMTile {
             return null;
         }
         //System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true));
-        byte[] tileb = Cache.getContent(tileURL);
+        byte[] tileb = Cache.getContent(tileURL.hash());
         if (tileb == null) {
             // download resource using the crawler and keep resource in memory if possible
             Response entry = null;
diff --git a/source/net/yacy/kelondro/blob/HeapReader.java b/source/net/yacy/kelondro/blob/HeapReader.java
index fa67aca3b..d0f25ccc2 100644
--- a/source/net/yacy/kelondro/blob/HeapReader.java
+++ b/source/net/yacy/kelondro/blob/HeapReader.java
@@ -440,6 +440,7 @@ public class HeapReader {
      * @throws IOException
      */
     public byte[] get(byte[] key) throws IOException, RowSpaceExceededException {
+        if (this.index == null) return null;
         key = normalizeKey(key);
         synchronized (this.index) {
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index d16665ef7..b2082a256 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -191,8 +191,8 @@ public final class LoaderDispatcher {
         // we have passed a first test if caching is allowed
         // now see if there is a cache entry
-        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url);
-        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url);
+        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
+        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
         if (cachedResponse != null && content != null) {
             // yes we have the content
@@ -226,6 +226,10 @@
             } else {
                 log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
             }
+        } else if (cachedResponse != null) {
+            log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
+        } else if (content != null) {
+            log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
         }
     }
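
Note (added for illustration, not part of the patch): the sketch below shows how a caller might use the hash-keyed cache accessors introduced above. It is a minimal example under stated assumptions: the package paths for DigestURI and ResponseHeader are guesses based on this source tree, and the class name CacheLookupSketch and the example URL are placeholders, not names from the patch.

    // Illustrative usage sketch; package paths of DigestURI and ResponseHeader are assumptions.
    import java.net.MalformedURLException;

    import de.anomic.http.client.Cache;
    import de.anomic.http.server.ResponseHeader;
    import net.yacy.kelondro.data.meta.DigestURI;

    public class CacheLookupSketch {
        public static void main(final String[] args) throws MalformedURLException {
            // placeholder URL, only used to derive a hash for the lookup
            final DigestURI url = new DigestURI("http://example.net/index.html");

            // after this patch both accessors are keyed by the URL hash instead of the DigestURI itself
            final ResponseHeader header = Cache.getResponseHeader(url.hash());
            final byte[] content = Cache.getContent(url.hash());

            if (header != null && content != null) {
                System.out.println("cache hit: " + content.length + " bytes, mime " + header.mime());
            } else if (header != null || content != null) {
                // header and content can get out of sync; LoaderDispatcher now logs this case
                System.out.println("incomplete cache entry for " + url.toNormalform(true, false));
            } else {
                System.out.println("cache miss");
            }
        }
    }

Keying both lookups by url.hash() lets a caller that only holds the hash probe the HTCACHE without rebuilding a full DigestURI, and the relaxed default strategy (IFEXIST instead of IFFRESH) uses a cached copy whenever one exists, not only while it is still fresh.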