- added cache usage properties to crawl start

- added a special rule to the balancer to omit forced delays if the cache is used exclusively
- increased the default htCache size to 32GB

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6241 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent c4ae2cd03f
commit c6c97f23ad

@@ -226,8 +226,9 @@ mimeConfig = httpd.mime
# This will be used if the server is addressed as a proxy
proxyCache = DATA/HTCACHE
# the maximum disc cache size for files in proxyCache in megabytes
proxyCacheSize = 1024
# the maximum disc cache size for files in Cache in megabytes
# default: 32 Gigabyte
proxyCacheSize = 32768
# a path to the surrogate input directory
surrogates.in = DATA/SURROGATES/in

@@ -196,6 +196,22 @@
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
<td>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
</td>
<td>
The caching policy states when to use the cache during crawling:
<b>no&nbsp;cache</b>: never use the cache, all content from fresh internet source;
<b>if&nbsp;fresh</b>: use the cache if the cache exists and is fresh according to the proxy-fresh rules;
<b>if&nbsp;exist</b>: use the cache if the cache exists. Do not check freshness. Otherwise use the online source;
<b>cache&nbsp;only</b>: never go online, use all content from the cache. If no cache exists, treat the content as unavailable
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Do Local Indexing:</td>
<td>

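These four options map onto the CACHE_STRATEGY_* constants introduced in CrawlProfile further down. As a minimal sketch of the intended semantics only (the helper names isInCache, isFresh, loadFromCache and loadFromWeb are placeholders, not YaCy API):

public final class CachePolicySketch {
    // values as defined in the CrawlProfile hunk below
    public static final int NOCACHE = 0, IFFRESH = 1, IFEXIST = 2, CACHEONLY = 3;

    static byte[] fetch(final String url, final int policy) {
        switch (policy) {
            case NOCACHE:   return loadFromWeb(url);                // always load from the live source
            case IFFRESH:   return (isInCache(url) && isFresh(url)) // cache hit only if still fresh
                                   ? loadFromCache(url) : loadFromWeb(url);
            case IFEXIST:   return isInCache(url)                   // any cache hit, freshness is not checked
                                   ? loadFromCache(url) : loadFromWeb(url);
            case CACHEONLY: return isInCache(url)                   // never go online; null means unavailable
                                   ? loadFromCache(url) : null;
            default:        throw new IllegalArgumentException("unknown cache policy " + policy);
        }
    }

    // placeholders so the sketch compiles; the real logic lives in the cache and loader classes
    static boolean isInCache(String url)     { return false; }
    static boolean isFresh(String url)       { return false; }
    static byte[]  loadFromCache(String url) { return new byte[0]; }
    static byte[]  loadFromWeb(String url)   { return new byte[0]; }
}
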
@@ -182,6 +182,13 @@ public class WatchCrawler_p {
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
final boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
@@ -224,7 +231,7 @@ public class WatchCrawler_p {
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, CrawlProfile.CACHE_STRATEGY_IFFRESH);
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash,
url,
@@ -351,7 +358,7 @@ public class WatchCrawler_p {
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
cachePolicy);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -411,7 +418,7 @@ public class WatchCrawler_p {
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
cachePolicy);
// create a new sitemap importer
final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe);

@@ -14,7 +14,7 @@ function handleResponse(){
// document.getElementById("title").innerHTML=doctitle;
document.WatchCrawler.bookmarkTitle.value=doctitle
// deterime if crawling is allowed by the robots.txt
// determine if crawling is allowed by the robots.txt
docrobotsOK="";
if(response.getElementsByTagName("robots")[0].firstChild!=null){
docrobotsOK=response.getElementsByTagName("robots")[0].firstChild.nodeValue;

@@ -53,7 +53,6 @@ public class Balancer {
private final File cacheStacksPath;
private long minimumLocalDelta;
private long minimumGlobalDelta;
private int profileErrors;
private long lastDomainStackFill;
public Balancer(final File cachePath, final String stackname, final boolean fullram,
@@ -70,7 +69,6 @@ public class Balancer {
cacheStacksPath.mkdirs();
File f = new File(cacheStacksPath, stackname + indexSuffix);
urlFileIndex = new Table(f, Request.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0);
profileErrors = 0;
lastDomainStackFill = 0;
Log.logInfo("Balancer", "opened balancer file with " + urlFileIndex.size() + " entries from " + f.toString());
}
@@ -285,7 +283,7 @@ public class Balancer {
* crawl-delay time which is always respected. In case the minimum time cannot be ensured, this method pauses
* the necessary time until the url is released and returned as a CrawlEntry object. In case a profile
* for the computed Entry does not exist, null is returned
* @param delay
* @param delay true if the requester demands forced delays using explicit thread sleep
* @param profile
* @return a url in a CrawlEntry object
* @throws IOException
@@ -330,13 +328,14 @@
// at this point we must check if the crawlEntry has relevancy because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) {
profileErrors++;
if (profileErrors < 20) Log.logInfo("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle());
if (profileEntry == null) {
Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
return null;
}
sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();
if (this.domainStacks.size() <= 1) break;

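The one-line change to the sleeptime computation above is the "special rule" from the commit message: the balancer normally enforces a per-host politeness delay (computed by Latency.waitingRemaining from the robots.txt database and the minimum local/global deltas), but a cache-only profile never contacts the remote host, so the delay is pointless. A self-contained sketch of just that rule, not the actual Balancer code (forcedDelayMillis and remainingHostDelay are illustrative names):

public final class DelayRuleSketch {
    // value as defined in the CrawlProfile hunk below
    static final int CACHE_STRATEGY_CACHEONLY = 3;

    // remainingHostDelay stands in for Latency.waitingRemaining(url, minimumLocalDelta, minimumGlobalDelta)
    static long forcedDelayMillis(final int cacheStrategy, final long remainingHostDelay) {
        // a cache-only crawl never touches the remote host, so politeness delays can be skipped entirely
        return (cacheStrategy == CACHE_STRATEGY_CACHEONLY) ? 0 : Math.max(0, remainingHostDelay);
    }

    public static void main(String[] args) {
        System.out.println(forcedDelayMillis(CACHE_STRATEGY_CACHEONLY, 5000)); // prints 0
        System.out.println(forcedDelayMillis(1 /* IFFRESH */, 5000));          // prints 5000
    }
}
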
@@ -256,10 +256,10 @@ public class CrawlProfile {
}
public final static int CACHE_STRATEGY_NOCACHE = 0;
public final static int CACHE_STRATEGY_IFEXIST = 1;
public final static int CACHE_STRATEGY_IFFRESH = 2;
public final static int CACHE_STRATEGY_CACHEONLY = 3;
public final static int CACHE_STRATEGY_NOCACHE = 0; // never use the cache, all content from fresh internet source
public final static int CACHE_STRATEGY_IFFRESH = 1; // use the cache if the cache exists and is fresh according to the proxy-fresh rules
public final static int CACHE_STRATEGY_IFEXIST = 2; // use the cache if the cache exists. Do not check freshness. Otherwise use the online source.
public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use all content from the cache. If no cache exists, treat the content as unavailable
public static class entry {
// this is a simple record structure that hold all properties of a single crawl start

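WatchCrawler_p above maps the form value to these constants with an if-chain; if the same mapping were needed in more places, a small helper could centralize it. A hypothetical sketch, not part of the codebase (CachePolicyParser and parse are made-up names):

public final class CachePolicyParser {
    // values as defined in the hunk above
    public static final int NOCACHE = 0, IFFRESH = 1, IFEXIST = 2, CACHEONLY = 3;

    public static int parse(final String s) {
        if ("nocache".equals(s))   return NOCACHE;
        if ("ifexist".equals(s))   return IFEXIST;
        if ("cacheonly".equals(s)) return CACHEONLY;
        return IFFRESH; // "iffresh" is the form default, and unknown values fall back to it as well
    }
}
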
@@ -532,17 +532,21 @@ public class CrawlQueues {
this.request.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
String result = null;
// load a resource, store it to htcache and push queue entry to switchboard queue
// load a resource and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
Response response;
try {
request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
response = sb.loader.load(request);
assert response != null;
request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.toIndexer(response);
request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
result = (stored) ? null : "not enqueued to indexer";
Response response = sb.loader.load(request);
if (response == null) {
request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
result = "no content (possibly caused by cache policy)";
} else {
request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.toIndexer(response);
request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
result = (stored) ? null : "not enqueued to indexer";
}
} catch (IOException e) {
request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": " + e.getMessage());

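With a cache-only policy the loader can now legitimately return null (no cached copy exists), so the worker treats a missing response as a handled failure instead of asserting that it cannot happen. A compact sketch of the resulting caller contract, with Loader, Response, process and enqueueToIndexer as stand-ins rather than the real YaCy types:

public final class LoadResultSketch {
    static final class Response { /* parsed content would live here */ }

    interface Loader {
        Response load(String url); // may return null, e.g. cache-only policy with an empty cache
    }

    // returns null on success, otherwise a human-readable fail reason (the same contract the worker uses)
    static String process(final Loader loader, final String url) {
        final Response response = loader.load(url);
        if (response == null) {
            return "no content (possibly caused by cache policy)";
        }
        final boolean stored = enqueueToIndexer(response);
        return stored ? null : "not enqueued to indexer";
    }

    static boolean enqueueToIndexer(final Response r) { return true; } // placeholder for the indexer hand-off
}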