You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
311 lines
13 KiB
311 lines
13 KiB
// Latency.java
|
|
// ------------
|
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net
|
|
// first published 19.03.2009 on http://yacy.net
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package de.anomic.crawler;
|
|
|
|
import java.io.IOException;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
import net.yacy.cora.document.MultiProtocolURI;
|
|
import net.yacy.cora.protocol.Domains;
|
|
import net.yacy.kelondro.util.MemoryControl;
|
|
|
|
import de.anomic.search.Switchboard;
|
|
|
|
public class Latency {
|
|
|
|
// the map is a mapping from host names to host configurations
|
|
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
|
|
|
|
public static void update(MultiProtocolURI url, long time) {
|
|
String host = url.getHost();
|
|
if (host == null) return;
|
|
Host h = map.get(host);
|
|
if (h == null) {
|
|
h = new Host(host, time);
|
|
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
|
|
map.put(host, h);
|
|
} else {
|
|
h.update(time);
|
|
}
|
|
}
|
|
|
|
public static void update(MultiProtocolURI url) {
|
|
String host = url.getHost();
|
|
if (host == null) return;
|
|
Host h = map.get(host);
|
|
if (h == null) {
|
|
h = new Host(host, 3000);
|
|
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
|
|
map.put(host, h);
|
|
} else {
|
|
h.update();
|
|
}
|
|
}
|
|
|
|
public static void slowdown(MultiProtocolURI url) {
|
|
String host = url.getHost();
|
|
if (host == null) return;
|
|
Host h = map.get(host);
|
|
if (h == null) {
|
|
h = new Host(host, 3000);
|
|
if (map.size() > 1000 || MemoryControl.shortStatus()) map.clear();
|
|
map.put(host, h);
|
|
} else {
|
|
h.slowdown();
|
|
}
|
|
}
|
|
|
|
public static Host host(MultiProtocolURI url) {
|
|
String host = url.getHost();
|
|
if (host == null) return null;
|
|
return map.get(host);
|
|
}
|
|
|
|
public static int average(MultiProtocolURI url) {
|
|
String host = url.getHost();
|
|
if (host == null) return 0;
|
|
Host h = map.get(host);
|
|
if (h == null) return 0;
|
|
return h.average();
|
|
}
|
|
|
|
public static Iterator<Map.Entry<String, Host>> iterator() {
|
|
return map.entrySet().iterator();
|
|
}
|
|
|
|
|
|
/**
|
|
* calculate the time since the last access of the domain as referenced by the url hash
|
|
* @param urlhash
|
|
* @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
|
|
*/
|
|
public static long lastAccessDelta(MultiProtocolURI url) {
|
|
final Latency.Host host = Latency.host(url);
|
|
if (host == null) return Long.MAX_VALUE; // never accessed
|
|
return System.currentTimeMillis() - host.lastacc();
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* guess a minimum waiting time
|
|
* the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
|
|
* also the 'isCGI' property is missing, because the full text of the domain is unknown here
|
|
* @param hostname
|
|
* @param minimumLocalDelta
|
|
* @param minimumGlobalDelta
|
|
* @return the remaining waiting time in milliseconds. The return value may be negative
|
|
* which expresses how long the time is over the minimum waiting time.
|
|
*/
|
|
public static long waitingRemainingGuessed(String hostname, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
|
if (hostname == null) return 0;
|
|
Host host = map.get(hostname);
|
|
if (host == null) return 0;
|
|
|
|
// the time since last access to the domain is the basis of the remaining calculation
|
|
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
|
|
|
|
// find the minimum waiting time based on the network domain (local or global)
|
|
final boolean local = Domains.isLocal(hostname);
|
|
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
|
|
|
// if we have accessed the domain many times, get slower (the flux factor)
|
|
if (!local) waiting += host.flux(waiting);
|
|
|
|
// use the access latency as rule how fast we can access the server
|
|
// this applies also to localhost, but differently, because it is not necessary to
|
|
// consider so many external accesses
|
|
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
|
|
|
|
// prevent that that a robots file can stop our indexer completely
|
|
waiting = Math.min(60000, waiting);
|
|
|
|
// return time that is remaining
|
|
//System.out.println("Latency: " + (waiting - timeSinceLastAccess));
|
|
return waiting - timeSinceLastAccess;
|
|
}
|
|
|
|
/**
|
|
* calculates how long should be waited until the domain can be accessed again
|
|
* this follows from:
|
|
* - given minimum access times
|
|
* - the fact that an url is a CGI url or not
|
|
* - the times that the domain was accessed (flux factor)
|
|
* - the response latency of the domain
|
|
* - and a given minimum access time as given in robots.txt
|
|
* @param minimumLocalDelta
|
|
* @param minimumGlobalDelta
|
|
* @return the remaining waiting time in milliseconds
|
|
*/
|
|
public static long waitingRemaining(MultiProtocolURI url, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
|
|
|
// first check if the domain was _ever_ accessed before
|
|
Host host = host(url);
|
|
if (host == null) return Long.MIN_VALUE; // no delay if host is new
|
|
|
|
// find the minimum waiting time based on the network domain (local or global)
|
|
final boolean local = url.isLocal();
|
|
if (local) return minimumLocalDelta;
|
|
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
|
|
|
// the time since last access to the domain is the basis of the remaining calculation
|
|
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
|
|
|
|
// for CGI accesses, we double the minimum time
|
|
// mostly there is a database access in the background
|
|
// which creates a lot of unwanted IO on target site
|
|
if (url.isCGI()) waiting = waiting * 2;
|
|
|
|
// if we have accessed the domain many times, get slower (the flux factor)
|
|
if (!local && host != null) waiting += host.flux(waiting);
|
|
|
|
// find the delay as given by robots.txt on target site
|
|
long robotsDelay = 0;
|
|
if (!local) {
|
|
RobotsTxtEntry robotsEntry;
|
|
try {
|
|
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
|
|
} catch (IOException e) {
|
|
robotsEntry = null;
|
|
}
|
|
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
|
|
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return 0; // no limits if granted exclusively for this peer
|
|
}
|
|
waiting = Math.max(waiting, robotsDelay);
|
|
|
|
// use the access latency as rule how fast we can access the server
|
|
// this applies also to localhost, but differently, because it is not necessary to
|
|
// consider so many external accesses
|
|
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
|
|
|
|
// prevent that that a robots file can stop our indexer completely
|
|
waiting = Math.min(60000, waiting);
|
|
|
|
// return time that is remaining
|
|
//System.out.println("Latency: " + (waiting - timeSinceLastAccess));
|
|
return Math.max(0, waiting - timeSinceLastAccess);
|
|
}
|
|
|
|
|
|
public static String waitingRemainingExplain(MultiProtocolURI url, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
|
|
|
|
// first check if the domain was _ever_ accessed before
|
|
Host host = host(url);
|
|
if (host == null) return "host " + host + " never accessed before -> 0"; // no delay if host is new
|
|
|
|
StringBuilder s = new StringBuilder(50);
|
|
|
|
// find the minimum waiting time based on the network domain (local or global)
|
|
final boolean local = url.isLocal();
|
|
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
|
|
s.append("minimumDelta = ").append(waiting);
|
|
|
|
// the time since last access to the domain is the basis of the remaining calculation
|
|
final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
|
|
s.append(", timeSinceLastAccess = ").append(timeSinceLastAccess);
|
|
|
|
// for CGI accesses, we double the minimum time
|
|
// mostly there is a database access in the background
|
|
// which creates a lot of unwanted IO on target site
|
|
if (url.isCGI()) s.append(", isCGI = true -> double");
|
|
|
|
// if we have accessed the domain many times, get slower (the flux factor)
|
|
if (!local && host != null) s.append(", flux = ").append(host.flux(waiting));
|
|
|
|
// find the delay as given by robots.txt on target site
|
|
long robotsDelay = 0;
|
|
if (!local) {
|
|
RobotsTxtEntry robotsEntry;
|
|
try {
|
|
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
|
|
} catch (IOException e) {
|
|
robotsEntry = null;
|
|
}
|
|
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
|
|
if (robotsEntry != null && robotsDelay == 0 && robotsEntry.getAgentName() != null) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
|
|
}
|
|
s.append(", robots.delay = ").append(robotsDelay);
|
|
|
|
// use the access latency as rule how fast we can access the server
|
|
// this applies also to localhost, but differently, because it is not necessary to
|
|
// consider so many external accesses
|
|
if (host != null) s.append(", host.average = ").append(host.average());
|
|
|
|
return s.toString();
|
|
}
|
|
|
|
public static final class Host {
|
|
private long timeacc;
|
|
private long lastacc;
|
|
private int count;
|
|
private final String host;
|
|
private long robotsMinDelay;
|
|
public Host(String host, long time) {
|
|
this.host = host;
|
|
this.timeacc = time;
|
|
this.count = 1;
|
|
this.lastacc = System.currentTimeMillis();
|
|
this.robotsMinDelay = 0;
|
|
}
|
|
public void update(long time) {
|
|
this.lastacc = System.currentTimeMillis();
|
|
this.timeacc += Math.min(30000, time);
|
|
this.count++;
|
|
}
|
|
public void update() {
|
|
this.lastacc = System.currentTimeMillis();
|
|
}
|
|
public void slowdown() {
|
|
this.lastacc = System.currentTimeMillis();
|
|
this.timeacc = Math.min(60000, average() * 2);
|
|
this.count = 1;
|
|
}
|
|
public int count() {
|
|
return this.count;
|
|
}
|
|
public int average() {
|
|
return (int) (this.timeacc / this.count);
|
|
}
|
|
public long lastacc() {
|
|
return this.lastacc;
|
|
}
|
|
public String host() {
|
|
return this.host;
|
|
}
|
|
public void robotsDelay(long ur) {
|
|
this.robotsMinDelay = ur;
|
|
}
|
|
public long robotsDelay() {
|
|
return this.robotsMinDelay;
|
|
}
|
|
public long flux(long range) {
|
|
return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
|
|
}
|
|
}
|
|
|
|
}
|