diff --git a/defaults/yacy.init b/defaults/yacy.init
index e0daf6d0d..8d2662f60 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -226,8 +226,9 @@ mimeConfig = httpd.mime
# This will be used if the server is addressed as a proxy
proxyCache = DATA/HTCACHE
-# the maximum disc cache size for files in proxyCache in megabytes
-proxyCacheSize = 1024
+# the maximum disc cache size for files in the cache, in megabytes
+# default: 32 Gigabytes (32768 MB)
+proxyCacheSize = 32768
# a path to the surrogate input directory
surrogates.in = DATA/SURROGATES/in
diff --git a/htroot/CrawlStart_p.html b/htroot/CrawlStart_p.html
index 63c84919a..4af4ae456 100644
--- a/htroot/CrawlStart_p.html
+++ b/htroot/CrawlStart_p.html
@@ -196,6 +196,22 @@
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
+
Do Local Indexing:
diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java
index 268bed847..b33d1aadb 100644
--- a/htroot/WatchCrawler_p.java
+++ b/htroot/WatchCrawler_p.java
@@ -182,6 +182,13 @@ public class WatchCrawler_p {
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
+ final String cachePolicyString = post.get("cachePolicy", "iffresh");
+ int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+ if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
+ if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+ if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
+ if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
+
final boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
@@ -224,7 +231,7 @@ public class WatchCrawler_p {
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
- storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash,
url,
@@ -351,7 +358,7 @@ public class WatchCrawler_p {
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
- CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ cachePolicy);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -411,7 +418,7 @@ public class WatchCrawler_p {
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
- CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ cachePolicy);
// create a new sitemap importer
final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe);
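Note on the cachePolicy parsing added in this file: it maps the form value to a CrawlProfile constant with a chain of if statements and silently falls back to IFFRESH for unknown values. A minimal sketch of an equivalent helper, not part of this patch and with a hypothetical method name, that would centralize the mapping:

    // hypothetical helper; only the CrawlProfile constants are taken from this patch
    private static int parseCachePolicy(final String s) {
        if ("nocache".equals(s))   return CrawlProfile.CACHE_STRATEGY_NOCACHE;
        if ("ifexist".equals(s))   return CrawlProfile.CACHE_STRATEGY_IFEXIST;
        if ("cacheonly".equals(s)) return CrawlProfile.CACHE_STRATEGY_CACHEONLY;
        return CrawlProfile.CACHE_STRATEGY_IFFRESH; // default, covers "iffresh" and unknown values
    }
    // usage: final int cachePolicy = parseCachePolicy(post.get("cachePolicy", "iffresh"));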
diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js
index dcb88ab98..421adcd97 100644
--- a/htroot/js/IndexCreate.js
+++ b/htroot/js/IndexCreate.js
@@ -14,7 +14,7 @@ function handleResponse(){
// document.getElementById("title").innerHTML=doctitle;
document.WatchCrawler.bookmarkTitle.value=doctitle
- // deterime if crawling is allowed by the robots.txt
+ // determine if crawling is allowed by the robots.txt
docrobotsOK="";
if(response.getElementsByTagName("robots")[0].firstChild!=null){
docrobotsOK=response.getElementsByTagName("robots")[0].firstChild.nodeValue;
diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java
index 272318300..4d976ee30 100644
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@@ -53,7 +53,6 @@ public class Balancer {
private final File cacheStacksPath;
private long minimumLocalDelta;
private long minimumGlobalDelta;
- private int profileErrors;
private long lastDomainStackFill;
public Balancer(final File cachePath, final String stackname, final boolean fullram,
@@ -70,7 +69,6 @@ public class Balancer {
cacheStacksPath.mkdirs();
File f = new File(cacheStacksPath, stackname + indexSuffix);
urlFileIndex = new Table(f, Request.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0);
- profileErrors = 0;
lastDomainStackFill = 0;
Log.logInfo("Balancer", "opened balancer file with " + urlFileIndex.size() + " entries from " + f.toString());
}
@@ -285,7 +283,7 @@ public class Balancer {
* crawl-delay time which is always respected. In case the minimum time cannot be ensured, this method pauses
* the necessary time until the url is released and returned as a CrawlEntry object. In case a profile
* for the computed Entry does not exist, null is returned
- * @param delay
+ * @param delay true if the requester demands forced delays using explicit thread sleep
* @param profile
* @return a url in a CrawlEntry object
* @throws IOException
@@ -330,13 +328,14 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevancy because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
- if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) {
- profileErrors++;
- if (profileErrors < 20) Log.logInfo("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
+ CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle());
+ if (profileEntry == null) {
+ Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
return null;
}
- sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-
+ // depending on the cache policy a politeness delay may be needed to avoid DoS-like load on the target host; cache-only crawls need none
+ sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+
assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
if (this.domainStacks.size() <= 1) break;
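The behavioral point of the new sleeptime line: a CACHEONLY crawl never contacts the remote host, so the crawl-delay/robots politeness wait (which may itself trigger a robots.txt download) would only slow down a purely local re-index. A condensed sketch of the decision, assuming the computed delay is later applied with an explicit Thread.sleep as the javadoc parameter above suggests:

    // cache-only crawls read from the local cache only; no remote host is contacted
    final boolean cacheOnly = profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY;
    final long sleeptime = cacheOnly
            ? 0
            : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta);
    if (delay && sleeptime > 0) {
        try { Thread.sleep(sleeptime); } catch (final InterruptedException e) { /* ignore */ }
    }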
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 8dab24559..78fcbc14d 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -256,10 +256,10 @@ public class CrawlProfile {
}
- public final static int CACHE_STRATEGY_NOCACHE = 0;
- public final static int CACHE_STRATEGY_IFEXIST = 1;
- public final static int CACHE_STRATEGY_IFFRESH = 2;
- public final static int CACHE_STRATEGY_CACHEONLY = 3;
+ public final static int CACHE_STRATEGY_NOCACHE = 0; // never use the cache, take all content from a fresh internet source
+ public final static int CACHE_STRATEGY_IFFRESH = 1; // use the cache if a cache entry exists and is fresh according to the proxy-freshness rules
+ public final static int CACHE_STRATEGY_IFEXIST = 2; // use the cache if a cache entry exists; do not check freshness; otherwise use the online source
+ public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use only cached content; if no cache entry exists, treat the content as unavailable
public static class entry {
// this is a simple record structure that holds all properties of a single crawl start
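Note the renumbering: IFFRESH is now 1 and IFEXIST is now 2, so any code or stored data that relied on the old integer values needs to follow. A hypothetical sketch of how a loader could branch on the strategy (isCached, isFresh, loadFromCache and loadFromNet are assumed helpers, not YaCy API; only the constants come from this patch):

    switch (cacheStrategy) {
        case CrawlProfile.CACHE_STRATEGY_NOCACHE:
            return loadFromNet(url);                                                   // always go online
        case CrawlProfile.CACHE_STRATEGY_IFFRESH:
            return (isCached(url) && isFresh(url)) ? loadFromCache(url) : loadFromNet(url);
        case CrawlProfile.CACHE_STRATEGY_IFEXIST:
            return isCached(url) ? loadFromCache(url) : loadFromNet(url);              // no freshness check
        case CrawlProfile.CACHE_STRATEGY_CACHEONLY:
            return isCached(url) ? loadFromCache(url) : null;                          // null = content unavailable
        default:
            return loadFromNet(url);
    }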
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 0c23af58f..eca4c0962 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -532,17 +532,21 @@ public class CrawlQueues {
this.request.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
String result = null;
- // load a resource, store it to htcache and push queue entry to switchboard queue
+ // load a resource and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
- Response response;
try {
request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
- response = sb.loader.load(request);
- assert response != null;
- request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
- final boolean stored = sb.toIndexer(response);
- request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
- result = (stored) ? null : "not enqueued to indexer";
+ Response response = sb.loader.load(request);
+ if (response == null) {
+ request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
+ if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
+ result = "no content (possibly caused by cache policy)";
+ } else {
+ request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
+ final boolean stored = sb.toIndexer(response);
+ request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
+ result = (stored) ? null : "not enqueued to indexer";
+ }
} catch (IOException e) {
request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": " + e.getMessage());