fixed a bug in snippet fetch strategy: cache-only does not help if the resource can only be found on the web

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6930 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent fbf021bb50
commit 73f03e05ee
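For orientation, here is a minimal sketch (not YaCy's actual enum) of the strategy semantics this fix relies on: CACHEONLY never touches the network, while IFEXIST serves from the cache when possible and falls back to a web fetch on a cache miss. IFFRESH, IFEXIST and CACHEONLY appear in the diff below; NOCACHE and the helper method are assumptions.

enum CacheStrategySketch {
    NOCACHE,   // assumed: never use the cache
    IFFRESH,   // use a cache entry only while it is still fresh
    IFEXIST,   // use a cache entry if present, otherwise fetch from the web
    CACHEONLY; // use the cache only, never go online

    // true if a cache miss may be answered by an online fetch
    boolean mayFetchOnline() {
        return this != CACHEONLY;
    }
}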

@@ -396,6 +396,9 @@ public class CrawlProfile {
return CacheStrategy.IFFRESH;
}
}
+public void setCacheStrategy(CacheStrategy newStrategy) {
+mem.put(CACHE_STRAGEGY, newStrategy.toString());
+}
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled
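The new setter simply stores the strategy name in the profile's backing map; the surrounding context suggests the matching getter parses that string back and falls back to IFFRESH. A hedged round-trip sketch, reusing the enum above (the map key and class names are assumptions, not YaCy's identifiers):

import java.util.HashMap;
import java.util.Map;

class ProfileSketch {
    private static final String CACHE_STRATEGY_KEY = "cacheStrategy"; // assumed key name
    final Map<String, String> mem = new HashMap<>();

    void setCacheStrategy(CacheStrategySketch strategy) {
        mem.put(CACHE_STRATEGY_KEY, strategy.toString());
    }

    CacheStrategySketch getCacheStrategy() {
        String value = mem.get(CACHE_STRATEGY_KEY);
        if (value == null) return CacheStrategySketch.IFFRESH; // default seen in the diff context
        try {
            return CacheStrategySketch.valueOf(value);
        } catch (IllegalArgumentException e) {
            return CacheStrategySketch.IFFRESH;
        }
    }
}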

@@ -30,6 +30,8 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
+import de.anomic.crawler.CrawlProfile.CacheStrategy;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
@@ -183,8 +185,9 @@ public final class CrawlSwitchboard {
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
-this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.CACHEONLY);
+this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
+this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
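Note that the setCacheStrategy(CacheStrategy.IFEXIST) call sits outside the null check, so it covers not only freshly created default profiles but also overwrites the CACHEONLY strategy persisted by older installations. A hedged sketch of that create-or-migrate pattern, reusing the classes sketched above (field and method names are illustrative):

class SnippetProfileInit {
    private ProfileSketch defaultTextSnippetGlobalProfile; // assumed field

    void ensureSnippetGlobalProfile() {
        if (this.defaultTextSnippetGlobalProfile == null) {
            // fresh setup: create the default snippet-fetch profile
            this.defaultTextSnippetGlobalProfile = new ProfileSketch();
        }
        // migration: profiles stored before this fix still carry CACHEONLY,
        // so the strategy is overwritten unconditionally
        this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategySketch.IFEXIST);
    }
}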

@@ -352,10 +352,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// trying to load the resource from the cache
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
-if (resContent != null && !fetchOnline && resContent.length > maxDocLen) {
-// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
-return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContent.length + " bytes");
-} else if (fetchOnline) {
+if ((resContent == null || responseHeader == null) && fetchOnline) {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
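The rewritten condition makes the snippet loader treat the cache as a first attempt rather than the only source: it downloads only when content or response headers are missing from the cache and an online fetch is permitted. A simplified sketch of that flow (all types and method names are stand-ins, not YaCy's API):

import java.io.IOException;

class SnippetResourceLoaderSketch {
    byte[] loadResource(String url, boolean fetchOnline) throws IOException {
        byte[] content = cachedContent(url);        // null on a cache miss
        String header = cachedResponseHeader(url);  // null on a cache miss
        if ((content == null || header == null) && fetchOnline) {
            // not (fully) in the cache: fall back to downloading the resource
            content = download(url);
        }
        return content;
    }

    // stand-ins for the cache lookup and crawler download used by the real code
    byte[] cachedContent(String url) { return null; }
    String cachedResponseHeader(String url) { return null; }
    byte[] download(String url) throws IOException { return new byte[0]; }
}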

@@ -233,7 +233,7 @@ public final class LoaderDispatcher {
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
if (cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// we had a chance to get the content from the cache .. its over. We don't have it.
-return null;
+throw new IOException("cache only strategy");
}
// now forget about the cache, nothing there. Try to load the content from the internet
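Throwing here instead of returning null lets callers distinguish "blocked by the cache-only policy" from an ordinary empty result and surfaces the condition in their error handling. A hedged sketch of a caller-facing load method with that behavior, again using the illustrative enum from above (names are not YaCy's actual API):

import java.io.IOException;

class LoaderSketch {
    byte[] load(String url, CacheStrategySketch strategy) throws IOException {
        byte[] cached = fromCache(url);
        if (cached != null) return cached;
        if (strategy == CacheStrategySketch.CACHEONLY) {
            // cache miss under a cache-only policy: fail loudly instead of returning null
            throw new IOException("cache only strategy");
        }
        // otherwise the content may still be loaded from the internet
        return fromWeb(url);
    }

    byte[] fromCache(String url) { return null; }
    byte[] fromWeb(String url) throws IOException { return new byte[0]; }
}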
