- fixed a non-working selection rule in the balancer

- more safety around crawl-delay handling; be more fail-safe
- better logging in case of long forced crawl-delays

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6027 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent f5602404d5
commit 5fdba0fa51
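
For context on the fail-safe point: a Crawl-delay taken from robots.txt should be parsed defensively, clamped to a sane maximum, and defaulted when missing or malformed. Below is a minimal sketch of that idea only; the parseCrawlDelay helper and its bound are assumptions for illustration, not code from this commit.

// Sketch: defensive Crawl-delay parsing. parseCrawlDelay and MAX_CRAWL_DELAY_MS
// are hypothetical; they only illustrate the fail-safe behavior described above.
public final class CrawlDelaySketch {

    // upper bound, so a hostile or broken robots.txt cannot stall the crawler
    private static final long MAX_CRAWL_DELAY_MS = 60000;

    static long parseCrawlDelay(final String value, final long defaultMs) {
        if (value == null) return defaultMs;
        try {
            // robots.txt states the delay in seconds; convert to ms and clamp
            final long ms = (long) (Double.parseDouble(value.trim()) * 1000.0);
            return Math.max(0L, Math.min(MAX_CRAWL_DELAY_MS, ms));
        } catch (final NumberFormatException e) {
            return defaultMs; // fail-safe: fall back on garbage input
        }
    }

    public static void main(final String[] args) {
        System.out.println(parseCrawlDelay("10", 500));    // 10000
        System.out.println(parseCrawlDelay("bogus", 500)); // 500
        System.out.println(parseCrawlDelay("9999", 500));  // 60000 (clamped)
    }
}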

@@ -684,16 +684,16 @@ crawler.clientTimeout=9000
 crawler.http.acceptEncoding=gzip
 crawler.http.acceptLanguage=en-us,en;q=0.5
 crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
-crawler.http.maxFileSize=262144
+crawler.http.maxFileSize=1048576
 # ftp crawler specific settings; size in bytes
-crawler.ftp.maxFileSize=262144
+crawler.ftp.maxFileSize=1048576
 # maximum number of crawler threads
-crawler.MaxActiveThreads = 30
+crawler.MaxActiveThreads = 50
 # maximum size of indexing queue
-indexer.slots = 40
+indexer.slots = 100
 # maximum size of stacker queue
 stacker.slots = 2000

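The raised limits above are plain key=value settings. As a hedged illustration, such a file could be read with java.util.Properties, falling back to the defaults above when a key is missing; the loader below is an assumption for the example, not YaCy's actual switchboard code.

import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

// Sketch: reading the crawler limits with java.util.Properties; the file name
// and fallback values mirror the settings above, but the loader itself is assumed.
public final class CrawlerConfigSketch {
    public static void main(final String[] args) throws IOException {
        final Properties p = new Properties();
        try (FileInputStream in = new FileInputStream("yacy.init")) {
            p.load(in); // Properties tolerates spaces around '=' as used above
        }
        System.out.println("http maxFileSize: " + asLong(p, "crawler.http.maxFileSize", 1048576L));
        System.out.println("active threads:   " + asLong(p, "crawler.MaxActiveThreads", 50L));
        System.out.println("indexer slots:    " + asLong(p, "indexer.slots", 100L));
    }

    private static long asLong(final Properties p, final String key, final long dflt) {
        final String s = p.getProperty(key);
        if (s == null) return dflt;
        try { return Long.parseLong(s.trim()); } catch (final NumberFormatException e) { return dflt; }
    }
}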
@@ -264,12 +264,15 @@ public class Balancer {
     public synchronized CrawlEntry pop(boolean delay, CrawlProfile profile) throws IOException {
         // returns a crawl entry from the stack and ensures minimum delta times
-        filltop(delay, 600000, false);
-        filltop(delay, 60000, false);
-        filltop(delay, 10000, false);
-        filltop(delay, 6000, false);
-        filltop(delay, 3000, false);
-        filltop(delay, 1000, false);
+        filltop(delay, -600000, false);
+        filltop(delay, -60000, false);
+        filltop(delay, -10000, false);
+        filltop(delay, -6000, false);
+        filltop(delay, -4000, false);
+        filltop(delay, -3000, false);
+        filltop(delay, -2000, false);
+        filltop(delay, -1000, false);
+        filltop(delay, -500, false);
         filltop(delay, 0, true);
         String result = null; // the result
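
The repaired selection rule works as a widening cascade: each filltop pass accepts only hosts whose guessed remaining wait is at or below the given threshold, so hosts furthest past their minimum delay are taken first. A simplified sketch of that selection follows, with bestWaitingTime standing in for Latency.waitingRemainingGuessed; all names and values here are illustrative only.

import java.util.Arrays;
import java.util.List;

// Sketch: the widening-threshold selection. A negative waiting time means the
// host is already past its minimum delay; those hosts are accepted first.
public final class CascadeSketch {

    static final long[] THRESHOLDS = {
        -600000, -60000, -10000, -6000, -4000, -3000, -2000, -1000, -500, 0
    };

    static String pick(final List<String> hosts) {
        for (final long t : THRESHOLDS) {
            for (final String host : hosts) {
                if (bestWaitingTime(host) <= t) return host; // strictest pass first
            }
        }
        return null; // the real code ends with filltop(delay, 0, true): accept one best host
    }

    // stand-in for Latency.waitingRemainingGuessed(...): fake, deterministic values
    static long bestWaitingTime(final String host) {
        return (host.hashCode() % 7001) - 3500;
    }

    public static void main(final String[] args) {
        System.out.println(pick(Arrays.asList("a.example", "b.example", "c.example")));
    }
}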
@@ -306,16 +309,29 @@ public class Balancer {
             // in best case, this should never happen if the balancer works properly
             // this is only a protection against the worst case, where the crawler could
             // behave in a DoS-manner
-            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ((sleeptime > Math.max(minimumLocalDelta, minimumGlobalDelta)) ? " (forced latency)" : ""));
-            try {synchronized(this) { this.wait(sleeptime); }} catch (final InterruptedException e) {}
-            if (sleeptime > 1000 && this.domainStacks.size() > 1) this.domainStacks.remove(crawlEntry.url().hash().substring(6));
+            Log.logInfo("BALANCER", "forcing crawl-delay of " + (sleeptime / 1000) + " seconds for " + crawlEntry.url().getHost() + ((sleeptime > Math.max(minimumLocalDelta, minimumGlobalDelta)) ? " (forced latency)" : ""));
+            long loops = sleeptime / 3000;
+            long rest = sleeptime % 3000;
+            if (loops < 2) {
+                rest = rest + 3000 * loops;
+                loops = 0;
+            }
+            try {synchronized(this) { this.wait(rest); }} catch (final InterruptedException e) {}
+            for (int i = 0; i < loops; i++) {
+                Log.logInfo("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + ((loops - i) * 3) + " seconds remaining...");
+                try {synchronized(this) { this.wait(3000); }} catch (final InterruptedException e) {}
+            }
+            if (sleeptime > 3000 && this.domainStacks.size() > 1) this.domainStacks.remove(crawlEntry.url().hash().substring(6));
         }
+        Latency.update(crawlEntry.url().hash().substring(6), crawlEntry.url().getHost());
         return crawlEntry;
     }
     
     private void filltop(boolean delay, long maximumwaiting, boolean acceptonebest) {
         if (this.top.size() > 0) return;
+        //System.out.println("*** DEBUG started filltop delay=" + ((delay) ? "true":"false") + ", maximumwaiting=" + maximumwaiting + ", acceptonebest=" + ((acceptonebest) ? "true":"false"));
         
         // check if we need to get entries from the file index
         try {
             fillDomainStacks(800);
@@ -345,6 +361,7 @@ public class Balancer {
                 }
                 continue;
             }
+            //System.out.println("*** accepting " + n + " : " + w);
         }
         n = entry.getValue().removeFirst();
         this.top.add(n);
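
The forced delay is now served in 3-second slices so long waits produce progress messages instead of one silent sleep. Below is a self-contained sketch of that pattern; the class and method names are illustrative, and it adds a guard against wait(0), which in Java blocks until notified.

// Sketch: chunked waiting with progress output, mirroring the loop added above.
public final class ChunkedWaitSketch {

    static void forcedDelay(final Object lock, final long sleeptime, final String host)
            throws InterruptedException {
        long loops = sleeptime / 3000;
        long rest = sleeptime % 3000;
        if (loops < 2) {           // short delays: one plain wait, no progress messages
            rest += 3000 * loops;
            loops = 0;
        }
        if (rest > 0) {            // wait(0) would wait forever, so guard the remainder
            synchronized (lock) { lock.wait(rest); }
        }
        for (long i = 0; i < loops; i++) {
            System.out.println("waiting for " + host + ": " + ((loops - i) * 3) + " seconds remaining...");
            synchronized (lock) { lock.wait(3000); }
        }
    }

    public static void main(final String[] args) throws InterruptedException {
        forcedDelay(new Object(), 7000, "example.org"); // waits 1s, then logs 6s and 3s remaining
    }
}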

@@ -254,7 +254,13 @@ public class CrawlQueues {
                     + ", must-not-match=" + profile.mustNotMatchPattern().toString()
                     + ", permission=" + ((sb.peers == null) ? "undefined" : (((sb.peers.mySeed().isSenior()) || (sb.peers.mySeed().isPrincipal())) ? "true" : "false")));
-                processLocalCrawling(urlEntry, stats);
+                // work off one Crawl stack entry
+                if ((urlEntry == null) || (urlEntry.url() == null)) {
+                    log.logInfo(stats + ": urlEntry = null");
+                } else {
+                    new crawlWorker(urlEntry);
+                }
             } else {
                 this.log.logSevere("Unsupported protocol in URL '" + url.toString());
             }
@@ -498,18 +504,6 @@ public class CrawlQueues {
         }
     }
     
-    private void processLocalCrawling(final CrawlEntry entry, final String stats) {
-        // work off one Crawl stack entry
-        if ((entry == null) || (entry.url() == null)) {
-            log.logInfo(stats + ": urlEntry = null");
-            return;
-        }
-        new crawlWorker(entry);
-        
-        log.logInfo(stats + ": enqueued for load " + entry.url() + " [" + entry.url().hash() + "]");
-        return;
-    }
     
     public Document loadResourceFromWeb(
             final yacyURL url,
             final int socketTimeout,

@@ -45,6 +45,17 @@ public class Latency {
         }
     }
     
+    public static void update(String hosthash, String host) {
+        assert hosthash.length() == 6;
+        Host h = map.get(hosthash);
+        if (h == null) {
+            h = new Host(host, 3000);
+            map.put(hosthash, h);
+        } else {
+            h.update();
+        }
+    }
+    
     public static void slowdown(String hosthash, String host) {
         assert hosthash.length() == 6;
         Host h = map.get(hosthash);
@@ -94,7 +105,8 @@ public class Latency {
      * @param hosthash
      * @param minimumLocalDelta
      * @param minimumGlobalDelta
-     * @return the remaining waiting time in milliseconds
+     * @return the remaining waiting time in milliseconds. The return value may be negative,
+     *         which expresses how far the time is over the minimum waiting time.
      */
     public static long waitingRemainingGuessed(String hosthash, final long minimumLocalDelta, final long minimumGlobalDelta) {
         assert hosthash.length() == 12 || hosthash.length() == 6;
@@ -121,7 +133,7 @@ public class Latency {
         // return time that is remaining
         //System.out.println("Latency: " + (waiting - timeSinceLastAccess));
-        return Math.max(0, waiting - timeSinceLastAccess);
+        return waiting - timeSinceLastAccess;
     }
     
     /**
@@ -193,6 +205,9 @@ public class Latency {
             this.timeacc += time;
             this.count++;
         }
+        public void update() {
+            this.lastacc = System.currentTimeMillis();
+        }
         public void slowdown() {
             this.lastacc = System.currentTimeMillis();
             this.timeacc = Math.min(60000, average() * 2);
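
Together, the Latency changes make the remaining-wait value signed: a positive result is time still to wait, a negative one says the minimum delay was exceeded that long ago, which is exactly what the balancer's new negative thresholds rank on. A minimal sketch of that contract, with guessWaiting as a simplified stand-in for waitingRemainingGuessed:

// Sketch: the signed remaining-wait contract. Positive = still wait this long;
// negative = the minimum delay was exceeded that long ago.
public final class SignedWaitSketch {

    static long guessWaiting(final long lastAccess, final long minimumDelta) {
        final long timeSinceLastAccess = System.currentTimeMillis() - lastAccess;
        return minimumDelta - timeSinceLastAccess; // no longer clamped to >= 0
    }

    public static void main(final String[] args) {
        final long now = System.currentTimeMillis();
        System.out.println(guessWaiting(now - 500, 2000));  // ~1500: wait ~1.5s more
        System.out.println(guessWaiting(now - 9000, 2000)); // ~-7000: 7s past the minimum
    }
}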
