because of the strongly enhanced indexing speed when using the new IndexCell RWI data structures (> 2000PPM on my notebook), it is now necessary to control the crawling speed depending on the response time of the target server (which is also YaCy in case of some intranet indexing use cases). The latency factor in crawl delay times is derived from the time that a target hosts takes to answer on http requests. For internet domains, the crawl delay is a minimum of twice the response time, in intranet cases the delay time is now a halve of the response time. - added API to monitor the latency times of the crawler: a new api at /api/latency_p.xml returns the current response times of domains, the time when the domain was accessed by the crawler the last time and many more attributes. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5733 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
7426dde6a6
commit
67aaffc0a2
@ -0,0 +1,64 @@
|
||||
// latency_p.java
|
||||
// ------------
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
||||
// first published 19.03.2009 on http://yacy.net
|
||||
//
|
||||
// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
|
||||
// $LastChangedRevision: 5723 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import de.anomic.crawler.Latency;
|
||||
import de.anomic.crawler.NoticedURL;
|
||||
import de.anomic.crawler.Latency.Host;
|
||||
import de.anomic.http.httpRequestHeader;
|
||||
import de.anomic.kelondro.util.DateFormatter;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
|
||||
public class latency_p {
|
||||
|
||||
public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
|
||||
|
||||
final serverObjects prop = new serverObjects();
|
||||
//final plasmaSwitchboard sb = (plasmaSwitchboard) env;
|
||||
final Iterator<Map.Entry<String, Host>> i = Latency.iterator();
|
||||
Map.Entry<String, Host> e;
|
||||
int c = 0;
|
||||
Latency.Host host;
|
||||
while (i.hasNext()) {
|
||||
e = i.next();
|
||||
host = e.getValue();
|
||||
prop.putXML("domains_" + c + "_hosthash", e.getKey());
|
||||
prop.putXML("domains_" + c + "_host", host.host());
|
||||
prop.putXML("domains_" + c + "_lastaccess", DateFormatter.formatShortSecond(new Date(host.lastacc())));
|
||||
prop.put("domains_" + c + "_count", host.count());
|
||||
prop.put("domains_" + c + "_average", host.average());
|
||||
prop.put("domains_" + c + "_robots", host.robotsDelay());
|
||||
prop.put("domains_" + c + "_flux", host.flux(NoticedURL.minimumGlobalDeltaInit));
|
||||
c++;
|
||||
}
|
||||
prop.put("domains", c);
|
||||
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0"?>
|
||||
<latency>
|
||||
#{domains}#
|
||||
<domain host="#[host]#" id="#[hosthash]#">
|
||||
<lastaccess>#[lastaccess]#</lastaccess>
|
||||
<count>#[count]#</count>
|
||||
<average>#[average]#</average>
|
||||
<robots>#[robots]#</robots>
|
||||
<flux>#[flux]#</flux>
|
||||
</domain>
|
||||
#{/domains}#
|
||||
</latency>
|
@ -0,0 +1,198 @@
|
||||
// Latency.java
|
||||
// ------------
|
||||
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
||||
// first published 19.03.2009 on http://yacy.net
|
||||
//
|
||||
// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
|
||||
// $LastChangedRevision: 5723 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.crawler;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.yacy.yacyURL;
|
||||
|
||||
public class Latency {
|
||||
|
||||
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
|
||||
|
||||
public static void update(String hosthash, String host, long time) {
|
||||
assert hosthash.length() == 6;
|
||||
Host h = map.get(hosthash);
|
||||
if (h == null) {
|
||||
h = new Host(host, time);
|
||||
map.put(hosthash, h);
|
||||
} else {
|
||||
h.update(time);
|
||||
}
|
||||
}
|
||||
|
||||
public static Host host(String hosthash) {
|
||||
assert hosthash.length() == 6;
|
||||
return map.get(hosthash);
|
||||
}
|
||||
|
||||
public static int average(String hosthash) {
|
||||
assert hosthash.length() == 6;
|
||||
Host h = map.get(hosthash);
|
||||
if (h == null) return 1000;
|
||||
return h.average();
|
||||
}
|
||||
|
||||
public static Iterator<Map.Entry<String, Host>> iterator() {
|
||||
return map.entrySet().iterator();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* calculate the time since the last access of the domain as referenced by the url hash
|
||||
* @param urlhash
|
||||
* @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
|
||||
*/
|
||||
public static long lastAccessDelta(final String urlhash) {
|
||||
assert urlhash.length() == 12 || urlhash.length() == 6;
|
||||
final Latency.Host host = Latency.host((urlhash.length() == 6) ? urlhash : urlhash.substring(6));
|
||||
if (host == null) return Long.MAX_VALUE; // never accessed
|
||||
return System.currentTimeMillis() - host.lastacc();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* guess a minimum waiting time
|
||||
* the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
|
||||
* also the 'isCGI' property is missing, because the full text of the domain is unknown here
|
||||
* @param urlhash
|
||||
* @param minimumLocalDelta
|
||||
* @param minimumGlobalDelta
|
||||
* @return the remaining waiting time in milliseconds
|
||||
*/
|
||||
public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
assert urlhash.length() == 12 || urlhash.length() == 6;
|
||||
Latency.Host latency = Latency.host((urlhash.length() == 6) ? urlhash : urlhash.substring(6));
|
||||
if (latency == null) return 0;
|
||||
|
||||
final long delta = System.currentTimeMillis() - latency.lastacc();
|
||||
final boolean local = yacyURL.isLocal(urlhash);
|
||||
long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
final long genericDelta = Math.min(
|
||||
60000,
|
||||
Math.max(
|
||||
deltaBase + ((latency == null || local) ? 0 : latency.flux(deltaBase)),
|
||||
(local || latency == null) ? 0 : latency.robotsDelay())
|
||||
); // prevent that that robots file can stop our indexer completely
|
||||
return (delta < genericDelta) ? genericDelta - delta : 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* calculates how long should be waited until the domain can be accessed again
|
||||
* this follows from:
|
||||
* - given minimum access times
|
||||
* - the fact that an url is a CGI url or not
|
||||
* - the times that the domain was accessed (flux factor)
|
||||
* - the response latency of the domain
|
||||
* - and a given minimum access time as given in robots.txt
|
||||
* @param minimumLocalDelta
|
||||
* @param minimumGlobalDelta
|
||||
* @return the remaining waiting time in milliseconds
|
||||
*/
|
||||
public static long waitingRemaining(yacyURL url, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
||||
|
||||
// first check if the domain was _ever_ accessed before
|
||||
String hosthash = url.hash().substring(6);
|
||||
Host host = host(hosthash);
|
||||
if (host == null) return 0; // no delay
|
||||
|
||||
// the time since last access to the domain is the basis of the remaining calculation
|
||||
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
|
||||
|
||||
// find the minimum waiting time based on the network domain (local or global)
|
||||
final boolean local = url.isLocal();
|
||||
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
||||
|
||||
// for CGI accesses, we double the minimum time
|
||||
// mostly there is a database access in the background
|
||||
// which creates a lot of unwanted IO on target site
|
||||
if (url.isCGI()) waiting = waiting * 2;
|
||||
|
||||
// if we have accessed the domain many times, get slower (the flux factor)
|
||||
if (!local) waiting += host.flux(waiting);
|
||||
|
||||
// find the delay as given by robots.txt on target site
|
||||
long robotsDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(url);
|
||||
waiting = Math.max(waiting, robotsDelay);
|
||||
|
||||
// use the access latency as rule how fast we can access the server
|
||||
// this applies also to localhost, but differently, because it is not necessary to
|
||||
// consider so many external accesses
|
||||
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
|
||||
|
||||
// prevent that that a robots file can stop our indexer completely
|
||||
waiting = Math.min(60000, waiting);
|
||||
|
||||
// return time that is remaining
|
||||
System.out.println("Latency: " + (waiting - timeSinceLastAccess));
|
||||
return Math.max(0, waiting - timeSinceLastAccess);
|
||||
}
|
||||
|
||||
public static final class Host {
|
||||
private long timeacc;
|
||||
private long lastacc;
|
||||
private int count;
|
||||
private String host;
|
||||
private long robotsMinDelay;
|
||||
public Host(String host, long time) {
|
||||
this.host = host;
|
||||
this.timeacc = time;
|
||||
this.count = 1;
|
||||
this.lastacc = System.currentTimeMillis();
|
||||
this.robotsMinDelay = 0;
|
||||
}
|
||||
public void update(long time) {
|
||||
this.lastacc = System.currentTimeMillis();
|
||||
this.timeacc += time;
|
||||
this.count++;
|
||||
}
|
||||
public int count() {
|
||||
return this.count;
|
||||
}
|
||||
public int average() {
|
||||
return (int) (this.timeacc / this.count);
|
||||
}
|
||||
public long lastacc() {
|
||||
return this.lastacc;
|
||||
}
|
||||
public String host() {
|
||||
return this.host;
|
||||
}
|
||||
public void robotsDelay(long ur) {
|
||||
this.robotsMinDelay = ur;
|
||||
}
|
||||
public long robotsDelay() {
|
||||
return this.robotsMinDelay;
|
||||
}
|
||||
public long flux(long range) {
|
||||
return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue