- added cache usage properties to crawl start

- added special rule to balancer to omit forced delays if cache is used exclusively
- increased the default htCache size to 32GB

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6241 6c8d7289-2bf4-0310-a012-ef5d649a1542
Branch: pull/1/head
Author: orbiter (16 years ago)
Parent: c4ae2cd03f
Commit: c6c97f23ad
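
For context, the balancer rule named in the commit message ("omit forced delays if cache is used exclusively") reduces to a single comparison: a cache-only crawl never contacts the remote host, so the anti-DoS sleep time can be zero. Below is a minimal standalone sketch of that rule; the CACHE_STRATEGY_* values match the CrawlProfile hunk further down, but the class and method names are illustrative, not YaCy code.

// Sketch of the delay rule this commit adds to the Balancer (illustrative only).
public class CachePolicyDelaySketch {

    // constants as defined in CrawlProfile after this commit
    static final int CACHE_STRATEGY_NOCACHE   = 0;
    static final int CACHE_STRATEGY_IFFRESH   = 1;
    static final int CACHE_STRATEGY_IFEXIST   = 2;
    static final int CACHE_STRATEGY_CACHEONLY = 3;

    // a cache-only crawl never contacts the remote host, so the per-host
    // politeness delay can be skipped; every other strategy may go online
    // and must keep the computed waiting time
    static long sleeptime(final int cacheStrategy, final long waitingRemaining) {
        return (cacheStrategy == CACHE_STRATEGY_CACHEONLY) ? 0 : waitingRemaining;
    }

    public static void main(final String[] args) {
        System.out.println(sleeptime(CACHE_STRATEGY_CACHEONLY, 500)); // 0: no forced delay
        System.out.println(sleeptime(CACHE_STRATEGY_IFFRESH, 500));   // 500: delay kept
    }
}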

@@ -226,8 +226,9 @@ mimeConfig = httpd.mime
 # This will be used if the server is addressed as a proxy
 proxyCache = DATA/HTCACHE
-# the maximum disc cache size for files in proxyCache in megabytes
-proxyCacheSize = 1024
+# the maximum disc cache size for files in Cache in megabytes
+# default: 32 Gigabyte
+proxyCacheSize = 32768
 # a path to the surrogate input directory
 surrogates.in = DATA/SURROGATES/in
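
(Unit check: proxyCacheSize is given in megabytes, so the 32 Gigabyte default announced in the commit message is 32 × 1024 = 32768 MB.)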

@@ -196,6 +196,22 @@
 This option is used by default for proxy prefetch, but is not needed for explicit crawling.
 </td>
 </tr>
+<tr valign="top" class="TableCellLight">
+<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
+<td>
+<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
+<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
+<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
+<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
+</td>
+<td>
+The caching policy states when to use the cache during crawling:
+<b>no&nbsp;cache</b>: never use the cache, all content from fresh internet source;
+<b>if&nbsp;fresh</b>: use the cache if the cache exists and is fresh according to the proxy-fresh rules;
+<b>if&nbsp;exist</b>: use the cache if the cache exists; do not check freshness. Otherwise use the online source;
+<b>cache&nbsp;only</b>: never go online, use all content from the cache. If no cache exists, treat the content as unavailable
+</td>
+</tr>
 <tr valign="top" class="TableCellDark">
 <td>Do Local Indexing:</td>
 <td>
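
The four policies described above amount to one decision at load time. The following is a hypothetical sketch of that decision, not part of this commit's diff (the loader-side changes are elsewhere); all names are illustrative.

// Hypothetical decision logic for the four cache policies (sketch only).
import java.util.function.Supplier;

public class CachePolicyDecisionSketch {

    static final int NOCACHE = 0, IFFRESH = 1, IFEXIST = 2, CACHEONLY = 3;

    // cached: cache entry content, or null if no entry exists
    // fresh:  whether the entry passes the proxy-fresh rules
    // online: fetches the live document from the internet
    // returns the content to use, or null if unavailable (cache-only miss)
    static byte[] load(final int policy, final byte[] cached, final boolean fresh, final Supplier<byte[]> online) {
        switch (policy) {
            case NOCACHE:   return online.get();                                  // always fetch fresh
            case IFFRESH:   return (cached != null && fresh) ? cached : online.get();
            case IFEXIST:   return (cached != null) ? cached : online.get();      // no freshness check
            case CACHEONLY: return cached;                                        // null: treat as unavailable
            default: throw new IllegalArgumentException("unknown policy: " + policy);
        }
    }

    public static void main(final String[] args) {
        final byte[] hit = "cached".getBytes();
        System.out.println(load(CACHEONLY, hit, false, () -> "online".getBytes()) != null);  // true
        System.out.println(load(CACHEONLY, null, false, () -> "online".getBytes()) == null); // true: unavailable
    }
}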

@@ -182,6 +182,13 @@ public class WatchCrawler_p {
         final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
         env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
+        final String cachePolicyString = post.get("cachePolicy", "iffresh");
+        int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+        if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
+        if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+        if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
+        if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
         final boolean xsstopw = post.get("xsstopw", "off").equals("on");
         env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
@@ -224,7 +231,7 @@ public class WatchCrawler_p {
             crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
             crawlingQ,
             indexText, indexMedia,
-            storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
         final String reasonString = sb.crawlStacker.stackCrawl(new Request(
             sb.peers.mySeed().hash,
             url,
@@ -351,7 +358,7 @@ public class WatchCrawler_p {
             true,
             crawlOrder,
             xsstopw, xdstopw, xpstopw,
-            CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            cachePolicy);
         // pause local crawl here
         sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -411,7 +418,7 @@ public class WatchCrawler_p {
             indexText, indexMedia,
             storeHTCache, true, crawlOrder,
             xsstopw, xdstopw, xpstopw,
-            CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            cachePolicy);
         // create a new sitemap importer
         final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe);

@@ -14,7 +14,7 @@ function handleResponse(){
 // document.getElementById("title").innerHTML=doctitle;
 document.WatchCrawler.bookmarkTitle.value=doctitle
-// deterime if crawling is allowed by the robots.txt
+// determine if crawling is allowed by the robots.txt
 docrobotsOK="";
 if(response.getElementsByTagName("robots")[0].firstChild!=null){
 docrobotsOK=response.getElementsByTagName("robots")[0].firstChild.nodeValue;

@@ -53,7 +53,6 @@ public class Balancer {
     private final File cacheStacksPath;
     private long minimumLocalDelta;
     private long minimumGlobalDelta;
-    private int profileErrors;
     private long lastDomainStackFill;

     public Balancer(final File cachePath, final String stackname, final boolean fullram,
@@ -70,7 +69,6 @@ public class Balancer {
         cacheStacksPath.mkdirs();
         File f = new File(cacheStacksPath, stackname + indexSuffix);
         urlFileIndex = new Table(f, Request.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0);
-        profileErrors = 0;
         lastDomainStackFill = 0;
         Log.logInfo("Balancer", "opened balancer file with " + urlFileIndex.size() + " entries from " + f.toString());
     }
@@ -285,7 +283,7 @@ public class Balancer {
     * crawl-delay time which is always respected. In case the minimum time cannot be ensured, this method pauses
     * the necessary time until the url is released and returned as CrawlEntry object. In case that a profile
     * for the computed Entry does not exist, null is returned
-    * @param delay
+    * @param delay true if the requester demands forced delays using explicit thread sleep
     * @param profile
     * @return a url in a CrawlEntry object
     * @throws IOException
@@ -330,12 +328,13 @@ public class Balancer {
         // at this point we must check if the crawlEntry has relevancy because the crawl profile still exists
         // if not: return null. A calling method must handle the null value and try again
-        if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) {
-            profileErrors++;
-            if (profileErrors < 20) Log.logInfo("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
+        CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle());
+        if (profileEntry == null) {
+            Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
             return null;
         }
-        sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+        // depending on the caching policy we need sleep time to avoid DoS-like situations
+        sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server

         assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
         assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();

@@ -256,10 +256,10 @@ public class CrawlProfile {
     }

-    public final static int CACHE_STRATEGY_NOCACHE = 0;
-    public final static int CACHE_STRATEGY_IFEXIST = 1;
-    public final static int CACHE_STRATEGY_IFFRESH = 2;
-    public final static int CACHE_STRATEGY_CACHEONLY = 3;
+    public final static int CACHE_STRATEGY_NOCACHE = 0;   // never use the cache, all content from fresh internet source
+    public final static int CACHE_STRATEGY_IFFRESH = 1;   // use the cache if the cache exists and is fresh according to the proxy-fresh rules
+    public final static int CACHE_STRATEGY_IFEXIST = 2;   // use the cache if the cache exists; do not check freshness. Otherwise use the online source.
+    public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use all content from the cache. If no cache exists, treat the content as unavailable

     public static class entry {
         // this is a simple record structure that holds all properties of a single crawl start
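
Note that this hunk also swaps the numeric values of IFFRESH and IFEXIST (1 and 2 exchange meaning). A small illustration of the consequence, under the assumption, not shown in this diff, that a profile persists the raw int:

// Illustrative only: how a persisted raw policy value changes meaning.
public class CacheStrategyValueSwapNote {
    public static void main(final String[] args) {
        final int stored = 1; // written by pre-commit code as CACHE_STRATEGY_IFEXIST
        // post-commit, 1 == CACHE_STRATEGY_IFFRESH: freshness is now checked
        // for such entries where it previously was not
        System.out.println(stored == 1 ? "read back as IFFRESH" : "other");
    }
}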

@@ -532,17 +532,21 @@ public class CrawlQueues {
         this.request.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
         String result = null;

-        // load a resource, store it to htcache and push queue entry to switchboard queue
+        // load a resource and push queue entry to switchboard queue
         // returns null if everything went fine, a fail reason string if a problem occurred
-        Response response;
         try {
             request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
-            response = sb.loader.load(request);
-            assert response != null;
-            request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
-            final boolean stored = sb.toIndexer(response);
-            request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
-            result = (stored) ? null : "not enqueued to indexer";
+            Response response = sb.loader.load(request);
+            if (response == null) {
+                request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
+                if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
+                result = "no content (possibly caused by cache policy)";
+            } else {
+                request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
+                final boolean stored = sb.toIndexer(response);
+                request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
+                result = (stored) ? null : "not enqueued to indexer";
+            }
         } catch (IOException e) {
             request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
             if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": " + e.getMessage());
