enhancements to web cache and less strict caching rules

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7620 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent f3baaca920
commit b1a8d0c020

@@ -740,7 +740,7 @@ search.navigation=hosts,authors,namespace,topics
 #            consider content nevertheless as available and show result without snippet
 # false: no link verification and not snippet generation: all search results are valid without verification
-search.verify = iffresh
+search.verify = ifexist
 # in case that a link verification fails then the corresponding index reference can be
 # deleted to clean up the index. If this property is set then failed index verification in
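
The strategy names above come from CrawlProfile.CacheStrategy, which the later hunks switch from IFFRESH to IFEXIST. As a rough orientation sketch (NOCACHE does not appear in this diff, and the per-value comments are inferred from the names and the changed call sites, not quoted javadoc):

    // rough sketch of the cache strategies behind search.verify and the crawl profiles
    public enum CacheStrategy {
        NOCACHE,   // ignore the web cache and always load online (assumption)
        IFFRESH,   // use a cached copy only while it is still fresh -- the old, stricter default
        IFEXIST,   // use any existing cached copy regardless of age -- the new, laxer default
        CACHEONLY  // never go online; use only what the cache already holds
    }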

@@ -54,7 +54,7 @@ public class CacheResource_p {
         }
         byte[] resource = null;
-        resource = Cache.getContent(url);
+        resource = Cache.getContent(url.hash());
         if (resource == null) return prop;
         // check request type
@@ -63,7 +63,7 @@ public class CacheResource_p {
             return ImageParser.parse(u, resource);
         } else {
             // get response header and set mime type
-            ResponseHeader responseHeader = Cache.getResponseHeader(url);
+            ResponseHeader responseHeader = Cache.getResponseHeader(url.hash());
             String resMime = responseHeader == null ? null : responseHeader.mime();
             if (resMime != null) {
                 final ResponseHeader outgoingHeader = new ResponseHeader();

@@ -56,7 +56,7 @@ public class getpageinfo_p {
         }
         ContentScraper scraper = null;
         if (u != null) try {
-            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
+            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFEXIST);
         } catch (final IOException e) {
             // now thats a fail, do nothing
         }

@@ -159,12 +159,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }
     public CacheStrategy cacheStrategy() {
         final String r = get(CACHE_STRAGEGY);
-        if (r == null) return CacheStrategy.IFFRESH;
+        if (r == null) return CacheStrategy.IFEXIST;
         try {
             return CacheStrategy.decode(Integer.parseInt(r));
         } catch (final NumberFormatException e) {
             Log.logException(e);
-            return CacheStrategy.IFFRESH;
+            return CacheStrategy.IFEXIST;
         }
     }
     public void setCacheStrategy(CacheStrategy newStrategy) {
@@ -260,7 +260,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (name.equals("iffresh")) return IFFRESH;
         if (name.equals("ifexist")) return IFEXIST;
         if (name.equals("cacheonly")) return CACHEONLY;
-        if (name.equals("true")) return IFFRESH;
+        if (name.equals("true")) return IFEXIST;
         if (name.equals("false")) return null; // if this cache strategy is assigned as query attribute, null means "do not create a snippet"
         return null;
     }

@@ -225,7 +225,7 @@ public final class CrawlSwitchboard {
         if (this.defaultTextSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
             this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
         }
         if (this.defaultTextSnippetGlobalProfile == null) {

@@ -202,11 +202,11 @@ public final class Cache {
      * @throws <b>UnsupportedProtocolException</b> if the protocol is not supported and therefore the
      * info object couldn't be created
      */
-    public static ResponseHeader getResponseHeader(final DigestURI url) {
+    public static ResponseHeader getResponseHeader(final byte[] hash) {
         // loading data from database
         Map<String, String> hdb;
-        hdb = responseHeaderDB.get(url.hash());
+        hdb = responseHeaderDB.get(hash);
         if (hdb == null) return null;
         return new ResponseHeader(null, hdb);
@@ -221,12 +221,11 @@
      * is returned.
      * @throws IOException
      */
-    public static byte[] getContent(final DigestURI url) {
+    public static byte[] getContent(final byte[] hash) {
         // load the url as resource from the cache
         try {
-            byte[] b = fileDB.get(url.hash());
+            byte[] b = fileDB.get(hash);
             if (b == null) return null;
-            log.logInfo("cache hit for url " + url.toString() + ", " + b.length + " bytes");
             return b;
         } catch (UnsupportedEncodingException e) {
             Log.logException(e);
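
With these signature changes the cache is addressed purely by URL hash; callers obtain the key from DigestURI.hash(), as the remaining hunks show. A minimal call-pattern sketch (the URL is a hypothetical example, not taken from this commit):

    // sketch: looking up a cached page by URL hash with the new Cache API
    try {
        final DigestURI url = new DigestURI("http://example.org/index.html"); // hypothetical URL
        final ResponseHeader header = Cache.getResponseHeader(url.hash());
        final byte[] content = Cache.getContent(url.hash());
        if (header != null && content != null) {
            // header and body are both present, so the entry is usable, e.g. for snippets
            final String mime = header.mime();
        }
    } catch (final MalformedURLException e) {
        // malformed example URL: nothing to look up
    }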

@@ -363,7 +363,7 @@ public final class HTTPDProxyHandler {
         // handle outgoing cookies
         handleOutgoingCookies(requestHeader, host, ip);
         prepareRequestHeader(conProp, requestHeader, hostlow);
-        ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url);
+        ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
         // why are files unzipped upon arrival? why not zip all files in cache?
         // This follows from the following premises
@@ -409,7 +409,7 @@
                     "200 OK",
                     sb.crawler.defaultProxyProfile
             );
-            byte[] cacheContent = Cache.getContent(url);
+            byte[] cacheContent = Cache.getContent(url.hash());
             if (cacheContent != null && response.isFreshForProxy()) {
                 if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache");
                 fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
@@ -500,9 +500,9 @@
             long sizeBeforeDelete = -1;
             if (cachedResponseHeader != null) {
                 // delete the cache
-                ResponseHeader rh = Cache.getResponseHeader(url);
+                ResponseHeader rh = Cache.getResponseHeader(url.hash());
                 if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) {
-                    byte[] b = Cache.getContent(url);
+                    byte[] b = Cache.getContent(url.hash());
                     if (b != null) sizeBeforeDelete = b.length;
                 }
                 Cache.delete(url);
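
Taken together, the proxy hunks above look up the cache by URL hash, serve the cached body while it is still fresh for proxy use, and otherwise measure and delete the stale entry. A condensed sketch of that flow (simplified; it collapses branches that are spread across the real handler and is not the literal control flow):

    // condensed sketch of the proxy's cache handling shown above
    final ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
    final byte[] cacheContent = Cache.getContent(url.hash());
    if (cachedResponseHeader != null && cacheContent != null && response.isFreshForProxy()) {
        // answer the request directly from the cache
        fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
    } else if (cachedResponseHeader != null) {
        // stale or incomplete entry: record its size for accounting, then drop it
        long sizeBeforeDelete = cachedResponseHeader.getContentLength();
        if (sizeBeforeDelete == 0 && cacheContent != null) sizeBeforeDelete = cacheContent.length;
        Cache.delete(url);
    }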

@@ -1755,7 +1755,7 @@ public final class Switchboard extends serverSwitch {
         byte[] b = response.getContent();
         if (b == null) {
             // fetch the document from cache
-            b = Cache.getContent(response.url());
+            b = Cache.getContent(response.url().hash());
             if (b == null) {
                 this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
                 addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing in cache");

@@ -107,7 +107,7 @@ public class OSMTile {
             return null;
         }
         //System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true));
-        byte[] tileb = Cache.getContent(tileURL);
+        byte[] tileb = Cache.getContent(tileURL.hash());
         if (tileb == null) {
             // download resource using the crawler and keep resource in memory if possible
             Response entry = null;

@@ -440,6 +440,7 @@ public class HeapReader {
      * @throws IOException
      */
     public byte[] get(byte[] key) throws IOException, RowSpaceExceededException {
+        if (this.index == null) return null;
         key = normalizeKey(key);
         synchronized (this.index) {

@@ -191,8 +191,8 @@ public final class LoaderDispatcher {
         // we have passed a first test if caching is allowed
         // now see if there is a cache entry
-        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url);
-        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url);
+        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
+        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
         if (cachedResponse != null && content != null) {
             // yes we have the content
@@ -226,6 +226,10 @@
             } else {
                 log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
             }
+        } else if (cachedResponse != null) {
+            log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
+        } else if (content != null) {
+            log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
         }
     }
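
The two new branches make half-complete HTCACHE entries visible in the log instead of silently falling through to an online load. Since content is only fetched when a header was found (see the guarded assignment above), the second warning is effectively a safety net. A tiny hypothetical helper in the same spirit, built only from the Cache calls this commit already uses (checkEntry is not part of the commit):

    // hypothetical helper: classify a cache entry as complete, header-only, body-only, or absent
    static String checkEntry(final byte[] urlHash) {
        final ResponseHeader header = Cache.getResponseHeader(urlHash);
        final byte[] body = Cache.getContent(urlHash);
        if (header != null && body != null) return "complete";
        if (header != null) return "header only"; // the case the loader now warns about
        if (body != null) return "body only";     // the mirror case
        return "absent";
    }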
