diff --git a/defaults/yacy.init b/defaults/yacy.init
index e0daf6d0d..8d2662f60 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -226,8 +226,9 @@ mimeConfig = httpd.mime
 # This will be used if the server is addressed as a proxy
 proxyCache = DATA/HTCACHE
 
-# the maximum disc cache size for files in proxyCache in megabytes
-proxyCacheSize = 1024
+# the maximum disc cache size for files in Cache in megabytes
+# default: 32 Gigabyte
+proxyCacheSize = 32768
 
 # a path to the surrogate input directory
 surrogates.in = DATA/SURROGATES/in
diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 63c84919a..4af4ae456 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -196,6 +196,22 @@
 This option is used by default for proxy prefetch, but is not needed for explicit crawling.
+
+:
+
+no cache
+if exist
+if fresh
+cache only
+
+
+The caching policy states when to use the cache during crawling:
+no cache: never use the cache, all content from fresh internet source;
+if fresh: use the cache if the cache exists and is fresh using the proxy-fresh rules;
+if exist: use the cache if the cache exists. Do not check freshness. Otherwise use online source;
+cache only: never go online, use all content from cache. If no cache exists, treat content as unavailable
+
+
 Do Local Indexing:
diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java
index 268bed847..b33d1aadb 100644
--- a/htroot/WatchCrawler_p.java
+++ b/htroot/WatchCrawler_p.java
@@ -182,6 +182,13 @@ public class WatchCrawler_p {
                 final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
                 env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
 
+                final String cachePolicyString = post.get("cachePolicy", "iffresh");
+                int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+                if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
+                if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+                if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
+                if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
+
                 final boolean xsstopw = post.get("xsstopw", "off").equals("on");
                 env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
"true" : "false"); @@ -224,7 +231,7 @@ public class WatchCrawler_p { crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, CrawlProfile.CACHE_STRATEGY_IFFRESH); + storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); final String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash, url, @@ -351,7 +358,7 @@ public class WatchCrawler_p { true, crawlOrder, xsstopw, xdstopw, xpstopw, - CrawlProfile.CACHE_STRATEGY_IFFRESH); + cachePolicy); // pause local crawl here sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); @@ -411,7 +418,7 @@ public class WatchCrawler_p { indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, - CrawlProfile.CACHE_STRATEGY_IFFRESH); + cachePolicy); // create a new sitemap importer final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe); diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js index dcb88ab98..421adcd97 100644 --- a/htroot/js/IndexCreate.js +++ b/htroot/js/IndexCreate.js @@ -14,7 +14,7 @@ function handleResponse(){ // document.getElementById("title").innerHTML=doctitle; document.WatchCrawler.bookmarkTitle.value=doctitle - // deterime if crawling is allowed by the robots.txt + // determine if crawling is allowed by the robots.txt docrobotsOK=""; if(response.getElementsByTagName("robots")[0].firstChild!=null){ docrobotsOK=response.getElementsByTagName("robots")[0].firstChild.nodeValue; diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 272318300..4d976ee30 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -53,7 +53,6 @@ public class Balancer { private final File cacheStacksPath; private long minimumLocalDelta; private long minimumGlobalDelta; - private int profileErrors; private long lastDomainStackFill; public Balancer(final File cachePath, final String stackname, final boolean fullram, @@ -70,7 +69,6 @@ public class Balancer { cacheStacksPath.mkdirs(); File f = new File(cacheStacksPath, stackname + indexSuffix); urlFileIndex = new Table(f, Request.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0); - profileErrors = 0; lastDomainStackFill = 0; Log.logInfo("Balancer", "opened balancer file with " + urlFileIndex.size() + " entries from " + f.toString()); } @@ -285,7 +283,7 @@ public class Balancer { * crawl-delay time which is always respected. In case the minimum time cannot ensured, this method pauses * the necessary time until the url is released and returned as CrawlEntry object. In case that a profile * for the computed Entry does not exist, null is returned - * @param delay + * @param delay true if the requester demands forced delays using explicit thread sleep * @param profile * @return a url in a CrawlEntry object * @throws IOException @@ -330,13 +328,14 @@ public class Balancer { // at this point we must check if the crawlEntry has relevancy because the crawl profile still exists // if not: return null. A calling method must handle the null value and try again - if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) { - profileErrors++; - if (profileErrors < 20) Log.logInfo("Balancer", "no profile entry for handle " + crawlEntry.profileHandle()); + CrawlProfile.entry profileEntry = (profile == null) ? 
+            if (profileEntry == null) {
+                Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
                 return null;
             }
-            sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-
+            // depending on the caching policy we need sleep time to avoid DoS-like situations
+            sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+
             assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
             assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
             if (this.domainStacks.size() <= 1) break;
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 8dab24559..78fcbc14d 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -256,10 +256,10 @@ public class CrawlProfile {
 
     }
 
-    public final static int CACHE_STRATEGY_NOCACHE = 0;
-    public final static int CACHE_STRATEGY_IFEXIST = 1;
-    public final static int CACHE_STRATEGY_IFFRESH = 2;
-    public final static int CACHE_STRATEGY_CACHEONLY = 3;
+    public final static int CACHE_STRATEGY_NOCACHE = 0;   // never use the cache, all content from fresh internet source
+    public final static int CACHE_STRATEGY_IFFRESH = 1;   // use the cache if the cache exists and is fresh using the proxy-fresh rules
+    public final static int CACHE_STRATEGY_IFEXIST = 2;   // use the cache if the cache exists. Do not check freshness. Otherwise use online source.
+    public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use all content from cache. If no cache exists, treat content as unavailable
 
     public static class entry {
         // this is a simple record structure that hold all properties of a single crawl start
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 0c23af58f..eca4c0962 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -532,17 +532,21 @@ public class CrawlQueues {
                 this.request.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
                 String result = null;
 
-                // load a resource, store it to htcache and push queue entry to switchboard queue
+                // load a resource and push queue entry to switchboard queue
                 // returns null if everything went fine, a fail reason string if a problem occurred
-                Response response;
                 try {
                     request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
-                    response = sb.loader.load(request);
-                    assert response != null;
-                    request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
-                    final boolean stored = sb.toIndexer(response);
-                    request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
-                    result = (stored) ? null : "not enqueued to indexer";
null : "not enqueued to indexer"; + Response response = sb.loader.load(request); + if (response == null) { + request.setStatus("error", serverProcessorJob.STATUS_FINISHED); + if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)"); + result = "no content (possibly caused by cache policy)"; + } else { + request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING); + final boolean stored = sb.toIndexer(response); + request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED); + result = (stored) ? null : "not enqueued to indexer"; + } } catch (IOException e) { request.setStatus("error", serverProcessorJob.STATUS_FINISHED); if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": " + e.getMessage());