- added cache usage properties to crawl start

- added special rule to balancer to omit forced delays if cache is used exclusively
- increased the default htCache size to 32GB

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6241 6c8d7289-2bf4-0310-a012-ef5d649a1542
Branch: pull/1/head
Author: orbiter (16 years ago)
Parent: c4ae2cd03f
Commit: c6c97f23ad
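
For context, the balancer rule named in the commit message ("omit forced delays if cache is used exclusively") reduces to a single comparison: a cache-only crawl never contacts the remote host, so the anti-DoS sleep time can be zero. Below is a minimal standalone sketch of that rule; the CACHE_STRATEGY_* values match the CrawlProfile hunk further down, but the class and method names are illustrative, not YaCy code.

// Sketch of the delay rule this commit adds to the Balancer (illustrative only).
public class CachePolicyDelaySketch {

    // constants as defined in CrawlProfile after this commit
    static final int CACHE_STRATEGY_NOCACHE   = 0;
    static final int CACHE_STRATEGY_IFFRESH   = 1;
    static final int CACHE_STRATEGY_IFEXIST   = 2;
    static final int CACHE_STRATEGY_CACHEONLY = 3;

    // a cache-only crawl never contacts the remote host, so the per-host
    // politeness delay can be skipped; every other strategy may go online
    // and must keep the computed waiting time
    static long sleeptime(final int cacheStrategy, final long waitingRemaining) {
        return (cacheStrategy == CACHE_STRATEGY_CACHEONLY) ? 0 : waitingRemaining;
    }

    public static void main(final String[] args) {
        System.out.println(sleeptime(CACHE_STRATEGY_CACHEONLY, 500)); // 0: no forced delay
        System.out.println(sleeptime(CACHE_STRATEGY_IFFRESH, 500));   // 500: delay kept
    }
}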

@@ -226,8 +226,9 @@ mimeConfig = httpd.mime
 # This will be used if the server is addressed as a proxy
 proxyCache = DATA/HTCACHE
-# the maximum disc cache size for files in proxyCache in megabytes
-proxyCacheSize = 1024
+# the maximum disc cache size for files in Cache in megabytes
+# default: 32 Gigabyte
+proxyCacheSize = 32768
 # a path to the surrogate input directory
 surrogates.in = DATA/SURROGATES/in
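
(Unit check: proxyCacheSize is given in megabytes, so the 32 Gigabyte default announced in the commit message is 32 × 1024 = 32768 MB.)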

@@ -196,6 +196,22 @@
 This option is used by default for proxy prefetch, but is not needed for explicit crawling.
 </td>
 </tr>
+<tr valign="top" class="TableCellLight">
+<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
+<td>
+<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
+<input type="radio" name="cachePolicy" value="ifexist" />if&nbsp;exist&nbsp;&nbsp;&nbsp;
+<input type="radio" name="cachePolicy" value="iffresh" checked="checked" />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
+<input type="radio" name="cachePolicy" value="cacheonly" />cache&nbsp;only
+</td>
+<td>
+The caching policy states when to use the cache during crawling:
+<b>no&nbsp;cache</b>: never use the cache, all content from fresh internet source;
+<b>if&nbsp;fresh</b>: use the cache if the cache exists and is fresh according to the proxy-fresh rules;
+<b>if&nbsp;exist</b>: use the cache if the cache exists; do not check freshness. Otherwise use the online source;
+<b>cache&nbsp;only</b>: never go online, use all content from the cache. If no cache exists, treat the content as unavailable
+</td>
+</tr>
 <tr valign="top" class="TableCellDark">
 <td>Do Local Indexing:</td>
 <td>
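
The four policies described above amount to one decision at load time. The following is a hypothetical sketch of that decision, not part of this commit's diff (the loader-side changes are elsewhere); all names are illustrative.

// Hypothetical decision logic for the four cache policies (sketch only).
import java.util.function.Supplier;

public class CachePolicyDecisionSketch {

    static final int NOCACHE = 0, IFFRESH = 1, IFEXIST = 2, CACHEONLY = 3;

    // cached: cache entry content, or null if no entry exists
    // fresh:  whether the entry passes the proxy-fresh rules
    // online: fetches the live document from the internet
    // returns the content to use, or null if unavailable (cache-only miss)
    static byte[] load(final int policy, final byte[] cached, final boolean fresh, final Supplier<byte[]> online) {
        switch (policy) {
            case NOCACHE:   return online.get();                                  // always fetch fresh
            case IFFRESH:   return (cached != null && fresh) ? cached : online.get();
            case IFEXIST:   return (cached != null) ? cached : online.get();      // no freshness check
            case CACHEONLY: return cached;                                        // null: treat as unavailable
            default: throw new IllegalArgumentException("unknown policy: " + policy);
        }
    }

    public static void main(final String[] args) {
        final byte[] hit = "cached".getBytes();
        System.out.println(load(CACHEONLY, hit, false, () -> "online".getBytes()) != null);  // true
        System.out.println(load(CACHEONLY, null, false, () -> "online".getBytes()) == null); // true: unavailable
    }
}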

@@ -182,6 +182,13 @@ public class WatchCrawler_p {
         final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
         env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
+        final String cachePolicyString = post.get("cachePolicy", "iffresh");
+        int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+        if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
+        if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
+        if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
+        if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
         final boolean xsstopw = post.get("xsstopw", "off").equals("on");
         env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
@@ -224,7 +231,7 @@ public class WatchCrawler_p {
             crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
             crawlingQ,
             indexText, indexMedia,
-            storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
         final String reasonString = sb.crawlStacker.stackCrawl(new Request(
             sb.peers.mySeed().hash,
             url,
@@ -351,7 +358,7 @@ public class WatchCrawler_p {
             true,
             crawlOrder,
             xsstopw, xdstopw, xpstopw,
-            CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            cachePolicy);
         // pause local crawl here
         sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -411,7 +418,7 @@ public class WatchCrawler_p {
             indexText, indexMedia,
             storeHTCache, true, crawlOrder,
             xsstopw, xdstopw, xpstopw,
-            CrawlProfile.CACHE_STRATEGY_IFFRESH);
+            cachePolicy);
         // create a new sitemap importer
         final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe);

@@ -14,7 +14,7 @@ function handleResponse(){
 // document.getElementById("title").innerHTML=doctitle;
 document.WatchCrawler.bookmarkTitle.value=doctitle
-// deterime if crawling is allowed by the robots.txt
+// determine if crawling is allowed by the robots.txt
 docrobotsOK="";
 if(response.getElementsByTagName("robots")[0].firstChild!=null){
 docrobotsOK=response.getElementsByTagName("robots")[0].firstChild.nodeValue;

@@ -53,7 +53,6 @@ public class Balancer {
     private final File cacheStacksPath;
     private long minimumLocalDelta;
     private long minimumGlobalDelta;
-    private int profileErrors;
     private long lastDomainStackFill;

     public Balancer(final File cachePath, final String stackname, final boolean fullram,
@@ -70,7 +69,6 @@ public class Balancer {
         cacheStacksPath.mkdirs();
         File f = new File(cacheStacksPath, stackname + indexSuffix);
         urlFileIndex = new Table(f, Request.rowdef, (fullram) ? Table.tailCacheUsageAuto : Table.tailCacheDenyUsage, EcoFSBufferSize, 0);
-        profileErrors = 0;
         lastDomainStackFill = 0;
         Log.logInfo("Balancer", "opened balancer file with " + urlFileIndex.size() + " entries from " + f.toString());
     }
@@ -285,7 +283,7 @@ public class Balancer {
     * crawl-delay time which is always respected. In case the minimum time cannot be ensured, this method pauses
     * the necessary time until the url is released and returned as CrawlEntry object. In case that a profile
     * for the computed Entry does not exist, null is returned
-    * @param delay
+    * @param delay true if the requester demands forced delays using explicit thread sleep
     * @param profile
     * @return a url in a CrawlEntry object
     * @throws IOException
@@ -330,12 +328,13 @@ public class Balancer {
         // at this point we must check if the crawlEntry has relevancy because the crawl profile still exists
         // if not: return null. A calling method must handle the null value and try again
-        if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) {
-            profileErrors++;
-            if (profileErrors < 20) Log.logInfo("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
+        CrawlProfile.entry profileEntry = (profile == null) ? null : profile.getEntry(crawlEntry.profileHandle());
+        if (profileEntry == null) {
+            Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
             return null;
         }
-        sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+        // depending on the caching policy we need sleep time to avoid DoS-like situations
+        sleeptime = (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server

         assert result.equals(new String(rowEntry.getPrimaryKeyBytes())) : "result = " + result + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
         assert result.equals(crawlEntry.url().hash()) : "result = " + result + ", crawlEntry.url().hash() = " + crawlEntry.url().hash();

@@ -256,10 +256,10 @@ public class CrawlProfile {
     }

-    public final static int CACHE_STRATEGY_NOCACHE = 0;
-    public final static int CACHE_STRATEGY_IFEXIST = 1;
-    public final static int CACHE_STRATEGY_IFFRESH = 2;
-    public final static int CACHE_STRATEGY_CACHEONLY = 3;
+    public final static int CACHE_STRATEGY_NOCACHE = 0;   // never use the cache, all content from fresh internet source
+    public final static int CACHE_STRATEGY_IFFRESH = 1;   // use the cache if the cache exists and is fresh according to the proxy-fresh rules
+    public final static int CACHE_STRATEGY_IFEXIST = 2;   // use the cache if the cache exists; do not check freshness. Otherwise use the online source.
+    public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use all content from the cache. If no cache exists, treat the content as unavailable

     public static class entry {
         // this is a simple record structure that holds all properties of a single crawl start
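
Note that this hunk also swaps the numeric values of IFFRESH and IFEXIST (1 and 2 exchange meaning). A small illustration of the consequence, under the assumption, not shown in this diff, that a profile persists the raw int:

// Illustrative only: how a persisted raw policy value changes meaning.
public class CacheStrategyValueSwapNote {
    public static void main(final String[] args) {
        final int stored = 1; // written by pre-commit code as CACHE_STRATEGY_IFEXIST
        // post-commit, 1 == CACHE_STRATEGY_IFFRESH: freshness is now checked
        // for such entries where it previously was not
        System.out.println(stored == 1 ? "read back as IFFRESH" : "other");
    }
}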

@@ -532,17 +532,21 @@ public class CrawlQueues {
         this.request.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
         String result = null;

-        // load a resource, store it to htcache and push queue entry to switchboard queue
+        // load a resource and push queue entry to switchboard queue
         // returns null if everything went fine, a fail reason string if a problem occurred
-        Response response;
         try {
             request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
-            response = sb.loader.load(request);
-            assert response != null;
-            request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
-            final boolean stored = sb.toIndexer(response);
-            request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
-            result = (stored) ? null : "not enqueued to indexer";
+            Response response = sb.loader.load(request);
+            if (response == null) {
+                request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
+                if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
+                result = "no content (possibly caused by cache policy)";
+            } else {
+                request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
+                final boolean stored = sb.toIndexer(response);
+                request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
+                result = (stored) ? null : "not enqueued to indexer";
+            }
         } catch (IOException e) {
             request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
             if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": " + e.getMessage());
