- added Latency control to the crawler:

Because of the strongly enhanced indexing speed with the new IndexCell RWI data structures (> 2000 PPM on my notebook), it is now necessary to control the crawling speed depending on the response time of the target server (which may itself be a YaCy peer in some intranet indexing use cases).
The latency factor in crawl delay times is derived from the time that a target host takes to answer HTTP requests. For internet domains, the crawl delay is at least twice the response time; for intranet hosts, the delay is now half of the response time.
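A minimal sketch of this rule as it is implemented further down in Latency.waitingRemaining (names shortened; at this point 'waiting' already holds the configured minimum delta, the flux factor, and any robots.txt delay):

long avg = host.average();                              // mean measured response time of the host in milliseconds
waiting = Math.max(waiting, local ? avg / 2 : avg * 2); // intranet: half the response time, internet: at least twice
waiting = Math.min(60000, waiting);                     // even a very slow host cannot stall the crawler beyond one minute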

- added API to monitor the latency times of the crawler:
A new API at /api/latency_p.xml returns the current response times of domains, the time when the crawler last accessed each domain, and several more attributes.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5733 6c8d7289-2bf4-0310-a012-ef5d649a1542

@@ -0,0 +1,64 @@
// latency_p.java
// ------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 19.03.2009 on http://yacy.net
//
// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
// $LastChangedRevision: 5723 $
// $LastChangedBy: borg-0300 $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import de.anomic.crawler.Latency;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.Latency.Host;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class latency_p {
public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
final serverObjects prop = new serverObjects();
//final plasmaSwitchboard sb = (plasmaSwitchboard) env;
final Iterator<Map.Entry<String, Host>> i = Latency.iterator();
Map.Entry<String, Host> e;
int c = 0;
Latency.Host host;
while (i.hasNext()) {
e = i.next();
host = e.getValue();
prop.putXML("domains_" + c + "_hosthash", e.getKey());
prop.putXML("domains_" + c + "_host", host.host());
prop.putXML("domains_" + c + "_lastaccess", DateFormatter.formatShortSecond(new Date(host.lastacc())));
prop.put("domains_" + c + "_count", host.count());
prop.put("domains_" + c + "_average", host.average());
prop.put("domains_" + c + "_robots", host.robotsDelay());
prop.put("domains_" + c + "_flux", host.flux(NoticedURL.minimumGlobalDeltaInit));
c++;
}
prop.put("domains", c);
// return rewrite properties
return prop;
}
}

@@ -0,0 +1,12 @@
<?xml version="1.0"?>
<latency>
#{domains}#
<domain host="#[host]#" id="#[hosthash]#">
<lastaccess>#[lastaccess]#</lastaccess>
<count>#[count]#</count>
<average>#[average]#</average>
<robots>#[robots]#</robots>
<flux>#[flux]#</flux>
</domain>
#{/domains}#
</latency>
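For illustration, a response with a single tracked host might look as follows (host name, id and all values are invented for this example; lastaccess is rendered by DateFormatter.formatShortSecond):

<?xml version="1.0"?>
<latency>
<domain host="www.example.org" id="a1b2c3">
<lastaccess>20090319140511</lastaccess>
<count>42</count>
<average>350</average>
<robots>0</robots>
<flux>0</flux>
</domain>
</latency>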

@@ -1,3 +1,25 @@
// webstructure.java
// ------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 01.05.2008 on http://yacy.net
//
// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
// $LastChangedRevision: 5723 $
// $LastChangedBy: borg-0300 $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Iterator;

@@ -288,7 +288,7 @@ public class Balancer {
while (i.hasNext() && c < max) {
entry = i.next();
list = entry.getValue();
if (onlyReadyForAccess && CrawlEntry.waitingRemainingGuessed(list.getFirst(), minimumLocalDelta, minimumGlobalDelta) > 0) continue;
if (onlyReadyForAccess && Latency.waitingRemainingGuessed(list.getFirst(), minimumLocalDelta, minimumGlobalDelta) > 0) continue;
if (ram) {
urlRAMStack.add(list.removeFirst());
} else try {
@@ -415,7 +415,7 @@ public class Balancer {
String besthash = null;
while (i.hasNext()) {
urlhash = i.next();
waitingtime = CrawlEntry.waitingRemainingGuessed(urlhash, minimumLocalDelta, minimumGlobalDelta);
waitingtime = Latency.waitingRemainingGuessed(urlhash, minimumLocalDelta, minimumGlobalDelta);
if (waitingtime == 0) {
// zero waiting is a good one
result = urlhash;
@@ -474,7 +474,7 @@ public class Balancer {
while (hitlist.size() > 0) {
domhash = hitlist.remove(hitlist.lastKey());
if (maxhash == null) maxhash = domhash; // remember first entry
waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
waitingtime = Latency.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
if (waitingtime < 100) {
domlist = domainStacks.get(domhash);
result = domlist.removeFirst();
@@ -498,7 +498,7 @@ public class Balancer {
while (i.hasNext()) {
entry = i.next();
domhash = entry.getKey();
waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
waitingtime = Latency.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
if (waitingtime == 0) {
// zero waiting is a good one
domlist = entry.getValue();
@@ -542,7 +542,7 @@ public class Balancer {
// check if the time after retrieval of last hash from same
// domain is not shorter than the minimumDelta
long waitingtime = CrawlEntry.waitingRemainingGuessed(nexthash, minimumLocalDelta, minimumGlobalDelta);
long waitingtime = Latency.waitingRemainingGuessed(nexthash, minimumLocalDelta, minimumGlobalDelta);
if (waitingtime == 0) {
// the entry is fine
result = new String((top) ? urlFileStack.pop().getColBytes(0) : urlFileStack.pot().getColBytes(0));
@@ -571,7 +571,7 @@ public class Balancer {
// at this point we must check if the crawlEntry has relevancy because the crawl profile still exists
// if not: return null. A calling method must handle the null value and try again
if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) return null;
long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
long sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
if (delay && sleeptime > 0) {
// force a busy waiting here
@@ -586,11 +586,7 @@ public class Balancer {
sleeptime -= this.lastPrepare - t;
}
if (sleeptime > 0) try {synchronized(this) { this.wait(sleeptime); }} catch (final InterruptedException e) {}
}
// update statistical data
crawlEntry.updateAccess();
}
return crawlEntry;
}

@@ -29,13 +29,11 @@ package de.anomic.crawler;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
@@ -62,9 +60,6 @@ public class CrawlEntry extends serverProcessorJob {
Base64Order.enhancedCoder
);
// a shared domainAccess map for all balancers. the key is a domain-hash (6 bytes)
public static final ConcurrentHashMap<String, domaccess> domainAccess = new ConcurrentHashMap<String, domaccess>();
private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
// if this is generated by a crawl, the own peer hash in entered
private String refhash; // the url's referrer hash
@@ -83,26 +78,6 @@ public class CrawlEntry extends serverProcessorJob {
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
public static class domaccess {
public long time;
public long robotsMinDelay;
public int count;
public String host;
public domaccess(String host) {
this.host = host;
this.time = System.currentTimeMillis();
this.robotsMinDelay = 0;
this.count = 0;
}
public void update() {
this.time = System.currentTimeMillis();
this.count++;
}
public long flux(long range) {
return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
}
}
/**
* @param initiator the hash of the initiator peer
@@ -292,79 +267,4 @@ public class CrawlEntry extends serverProcessorJob {
return this.profileHandle;
}
/**
* check a domain flag so it can be calculated when a domain was accessed the last time
*/
public void updateAccess() {
String domhash = url.hash().substring(6);
domaccess lastAccess = domainAccess.get(domhash);
if (lastAccess == null) {
lastAccess = new domaccess(url.getHost());
domainAccess.put(domhash, lastAccess);
} else {
lastAccess.update();
}
}
/**
* calculates how long should be waited until the domain can be accessed again
* this follows from given minimum access times, the fact that an url is a CGI url or not, the times that the domain was accessed
* and a given minimum access time as given in robots.txt
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public long waitingRemaining(final long minimumLocalDelta, final long minimumGlobalDelta) {
final long delta = lastAccessDelta(this.url.hash());
if (delta == Long.MAX_VALUE) return 0;
final boolean local = this.url.isLocal();
long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
if (this.url.isCGI()) deltaBase = deltaBase * 2; // mostly there is a database access in the background which creates a lot of unwanted IO on target site
domaccess lastAccess = domainAccess.get(this.url.hash().substring(6));
lastAccess.robotsMinDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(this.url);
final long genericDelta = Math.min(
60000,
Math.max(
deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)),
(local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay)
); // prevent that a robots.txt file can stop our indexer completely
return (delta < genericDelta) ? genericDelta - delta : 0;
}
/**
* guess a minimum waiting time
* the time is not exact, because if the domain has not yet been checked against the robots.txt delay value, the estimate is too low
* also the 'isCGI' property is missing, because the full text of the domain is unknown here
* @param urlhash
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) {
final long delta = lastAccessDelta(urlhash);
if (delta == Long.MAX_VALUE) return 0;
final boolean local = yacyURL.isLocal(urlhash);
long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
domaccess lastAccess = domainAccess.get(urlhash.substring(6));
final long genericDelta = Math.min(
60000,
Math.max(
deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)),
(local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay)
); // prevent that a robots.txt file can stop our indexer completely
return (delta < genericDelta) ? genericDelta - delta : 0;
}
/**
* calculates the time since the last access of the domain as referenced by the url hash
* @param urlhash
* @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
*/
private static long lastAccessDelta(final String hash) {
assert hash != null;
assert hash.length() == 6 || hash.length() == 12;
final domaccess lastAccess = domainAccess.get((hash.length() > 6) ? hash.substring(6) : hash);
if (lastAccess == null) return Long.MAX_VALUE; // never accessed
return System.currentTimeMillis() - lastAccess.time;
}
}

@@ -78,6 +78,8 @@ public class FTPLoader {
* @return
*/
public Document load(final CrawlEntry entry) throws IOException {
long start = System.currentTimeMillis();
final yacyURL entryUrl = entry.url();
final String fullPath = getPath(entryUrl);
@@ -146,7 +148,8 @@ public class FTPLoader {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server download" + detail);
throw new IOException("FTPLoader: Unable to download URL " + entry.url().toString() + detail);
}
Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
return htCache;
}

@@ -99,7 +99,10 @@ public final class HTTPLoader {
}
public Document load(final CrawlEntry entry, final String parserMode) throws IOException {
return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
long start = System.currentTimeMillis();
Document doc = load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
return doc;
}
private Document load(final CrawlEntry entry, final String parserMode, final int retryCount) throws IOException {
@@ -242,90 +245,6 @@ public final class HTTPLoader {
}
}*/
return htCache;
/*
} catch (final Exception e) {
final String errorMsg = e.getMessage();
String failreason = null;
if ((e instanceof IOException) &&
(errorMsg != null) &&
(errorMsg.indexOf("socket closed") >= 0) &&
(Thread.currentThread().isInterrupted())
) {
this.log.logInfo("CRAWLER Interruption detected because of server shutdown.");
failreason = ErrorURL.DENIED_SERVER_SHUTDOWN;
} else if (e instanceof httpdLimitExceededException) {
this.log.logWarning("CRAWLER Max file size limit '" + maxFileSize + "' exceeded while downloading URL " + entry.url());
failreason = ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED;
} else if (e instanceof MalformedURLException) {
this.log.logWarning("CRAWLER Malformed URL '" + entry.url().toString() + "' detected. ");
failreason = ErrorURL.DENIED_MALFORMED_URL;
} else if (e instanceof NoRouteToHostException) {
this.log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + entry.url().toString() + "'.");
failreason = ErrorURL.DENIED_NO_ROUTE_TO_HOST;
} else if ((e instanceof UnknownHostException) ||
((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) {
final yacyURL u = (entry.referrerhash() == null) ? null : sb.getURL(entry.referrerhash());
this.log.logWarning("CRAWLER Unknown host in URL '" + entry.url() + "'. " +
"Referer URL: " + ((u == null) ? "Unknown" : u.toNormalform(true, true)));
failreason = ErrorURL.DENIED_UNKNOWN_HOST;
} else if (e instanceof java.net.BindException) {
this.log.logWarning("CRAWLER BindException detected while trying to download content from '" + entry.url().toString() +
"'. Retrying request.");
failreason = ErrorURL.DENIED_CONNECTION_BIND_EXCEPTION;
} else if ((errorMsg != null) && (
(errorMsg.indexOf("Corrupt GZIP trailer") >= 0) ||
(errorMsg.indexOf("Not in GZIP format") >= 0) ||
(errorMsg.indexOf("Unexpected end of ZLIB") >= 0)
)) {
this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + entry.url().toString() +
"'. Retrying request without using gzip content encoding.");
failreason = ErrorURL.DENIED_CONTENT_DECODING_ERROR;
} else if ((errorMsg != null) && (errorMsg.indexOf("The host did not accept the connection within timeout of") >= 0)) {
this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() +
"'. Retrying request.");
failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
} else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) {
this.log.logWarning("CRAWLER Read timeout while receiving content from '" + entry.url().toString() +
"'. Retrying request.");
failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
} else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) {
this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() +
"'. Retrying request.");
failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
} else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) {
this.log.logWarning("CRAWLER Connection timeout while receiving content from '" + entry.url().toString() +
"'. Retrying request.");
failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
} else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) {
this.log.logWarning("CRAWLER Connection refused while trying to connect to '" + entry.url().toString() + "'.");
failreason = ErrorURL.DENIED_CONNECTION_REFUSED;
} else if ((errorMsg != null) && (errorMsg.indexOf("Circular redirect to '")>= 0)) {
this.log.logWarning("CRAWLER Redirect Error with URL '" + entry.url().toString() + "': "+ e.toString());
failreason = ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED;
} else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) {
this.log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + entry.url().toString() + "'. " +
"Pausing crawlers. ");
sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
failreason = ErrorURL.DENIED_OUT_OF_DISK_SPACE;
} else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) {
this.log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + entry.url().toString() + "'. ");
failreason = ErrorURL.DENIED_NETWORK_IS_UNREACHABLE;
} else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) {
this.log.logSevere("CRAWLER No trusted certificate found for URL '" + entry.url().toString() + "'. ");
failreason = ErrorURL.DENIED_SSL_UNTRUSTED_CERT;
} else {
this.log.logSevere("CRAWLER Unexpected Error with URL '" + entry.url().toString() + "': " + e.toString(), e);
failreason = ErrorURL.DENIED_CONNECTION_ERROR;
}
if (failreason != null) {
// add url into error db
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, failreason);
}
return null;
}*/
}
}
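Both loaders above follow the same instrumentation pattern: take a timestamp before the download, then report the elapsed time into the shared latency map. The map key is the 6-byte host hash, which throughout this commit is the tail of the 12-character YaCy URL hash. A condensed sketch of the pattern, where doLoad stands in for the actual FTP or HTTP download:

long start = System.currentTimeMillis();
Document doc = doLoad(entry);                      // perform the actual download
String hosthash = entry.url().hash().substring(6); // the last 6 characters identify the host
Latency.update(hosthash, entry.url().getHost(), System.currentTimeMillis() - start);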

@@ -0,0 +1,198 @@
// Latency.java
// ------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 19.03.2009 on http://yacy.net
//
// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
// $LastChangedRevision: 5723 $
// $LastChangedBy: borg-0300 $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.yacy.yacyURL;
public class Latency {
private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
public static void update(String hosthash, String host, long time) {
assert hosthash.length() == 6;
Host h = map.get(hosthash);
if (h == null) {
h = new Host(host, time);
map.put(hosthash, h);
} else {
h.update(time);
}
}
public static Host host(String hosthash) {
assert hosthash.length() == 6;
return map.get(hosthash);
}
public static int average(String hosthash) {
assert hosthash.length() == 6;
Host h = map.get(hosthash);
if (h == null) return 1000;
return h.average();
}
public static Iterator<Map.Entry<String, Host>> iterator() {
return map.entrySet().iterator();
}
/**
* calculate the time since the last access of the domain as referenced by the url hash
* @param urlhash
* @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
*/
public static long lastAccessDelta(final String urlhash) {
assert urlhash.length() == 12 || urlhash.length() == 6;
final Latency.Host host = Latency.host((urlhash.length() == 6) ? urlhash : urlhash.substring(6));
if (host == null) return Long.MAX_VALUE; // never accessed
return System.currentTimeMillis() - host.lastacc();
}
/**
* guess a minimum waiting time
* the time is not exact, because if the domain has not yet been checked against the robots.txt delay value, the estimate is too low
* also the 'isCGI' property is missing, because the full text of the domain is unknown here
* @param urlhash
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) {
assert urlhash.length() == 12 || urlhash.length() == 6;
Latency.Host latency = Latency.host((urlhash.length() == 6) ? urlhash : urlhash.substring(6));
if (latency == null) return 0;
final long delta = System.currentTimeMillis() - latency.lastacc();
final boolean local = yacyURL.isLocal(urlhash);
long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
final long genericDelta = Math.min(
60000,
Math.max(
deltaBase + ((latency == null || local) ? 0 : latency.flux(deltaBase)),
(local || latency == null) ? 0 : latency.robotsDelay())
); // prevent that a robots.txt file can stop our indexer completely
return (delta < genericDelta) ? genericDelta - delta : 0;
}
/**
* calculates how long should be waited until the domain can be accessed again
* this follows from:
* - given minimum access times
* - the fact that an url is a CGI url or not
* - the times that the domain was accessed (flux factor)
* - the response latency of the domain
* - and a given minimum access time as given in robots.txt
* @param minimumLocalDelta
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public static long waitingRemaining(yacyURL url, final long minimumLocalDelta, final long minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
String hosthash = url.hash().substring(6);
Host host = host(hosthash);
if (host == null) return 0; // no delay
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// find the minimum waiting time based on the network domain (local or global)
final boolean local = url.isLocal();
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if (url.isCGI()) waiting = waiting * 2;
// if we have accessed the domain many times, get slower (the flux factor)
if (!local) waiting += host.flux(waiting);
// find the delay as given by robots.txt on target site
long robotsDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(url);
waiting = Math.max(waiting, robotsDelay);
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
// prevent that a robots.txt file can stop our indexer completely
waiting = Math.min(60000, waiting);
// return time that is remaining
System.out.println("Latency: " + (waiting - timeSinceLastAccess));
return Math.max(0, waiting - timeSinceLastAccess);
}
public static final class Host {
private long timeacc;
private long lastacc;
private int count;
private String host;
private long robotsMinDelay;
public Host(String host, long time) {
this.host = host;
this.timeacc = time;
this.count = 1;
this.lastacc = System.currentTimeMillis();
this.robotsMinDelay = 0;
}
public void update(long time) {
this.lastacc = System.currentTimeMillis();
this.timeacc += time;
this.count++;
}
public int count() {
return this.count;
}
public int average() {
return (int) (this.timeacc / this.count);
}
public long lastacc() {
return this.lastacc;
}
public String host() {
return this.host;
}
public void robotsDelay(long ur) {
this.robotsMinDelay = ur;
}
public long robotsDelay() {
return this.robotsMinDelay;
}
public long flux(long range) {
return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
}
}
}
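To make the flux factor concrete: with range = NoticedURL.minimumGlobalDeltaInit (500 ms), the penalty stays near zero for the first few hundred accesses to a host, reaches one full range around the thousandth access, and is capped at five times the range. A small self-contained demo (a hypothetical class, not part of this commit) reproducing the formula above:

public class FluxDemo {
    static long flux(long range, int count) {
        return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
    }
    public static void main(String[] args) {
        long range = 500; // NoticedURL.minimumGlobalDeltaInit
        for (int count : new int[] {1, 500, 999, 1000, 2000, 5000, 20000}) {
            // prints 0, 1, 500, 500, 1000, 2500, 2500 milliseconds
            System.out.println("count=" + count + " -> flux=" + flux(range, count) + " ms");
        }
    }
}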

@@ -43,8 +43,8 @@ public class NoticedURL {
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack
private static final long minimumLocalDeltaInit = 0; // the minimum time difference between access of the same local domain
private static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
public static final long minimumLocalDeltaInit = 0; // the minimum time difference between access of the same local domain
public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth

@@ -251,30 +251,50 @@ public final class ReferenceContainerArray {
System.out.println("*** DEBUG mergeOldest: vvvvvvvvv array has " + this.array.entries() + " entries vvvvvvvvv");
System.out.println("*** DEBUG mergeOldest: unmounted " + f1.getName());
System.out.println("*** DEBUG mergeOldest: unmounted " + f2.getName());
File newFile = merge(f1, f2);
if (newFile == null) return true;
this.array.mountBLOB(newFile);
System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + this.array.entries() + " entries ^^^^^^^^^^^");
return true;
}
private synchronized File merge(File f1, File f2) throws IOException {
// iterate both files and write a new one
CloneableIterator<ReferenceContainer> i1 = new blobFileEntries(f1, this.payloadrow);
CloneableIterator<ReferenceContainer> i2 = new blobFileEntries(f2, this.payloadrow);
ReferenceContainer c1, c2, c1o, c2o;
c1 = (i1.hasNext()) ? i1.next() : null;
c2 = (i2.hasNext()) ? i2.next() : null;
if (c1 == null && c2 == null) {
if (!f1.delete()) f1.deleteOnExit();
if (!f2.delete()) f2.deleteOnExit();
return true;
}
if (c1 == null) {
if (!f1.delete()) f1.deleteOnExit();
this.array.mountBLOB(f2);
return true;
}
if (c2 == null) {
if (!i1.hasNext()) {
if (i2.hasNext()) {
if (!f1.delete()) f1.deleteOnExit();
return f2;
} else {
if (!f1.delete()) f1.deleteOnExit();
if (!f2.delete()) f2.deleteOnExit();
return null;
}
} else if (!i2.hasNext()) {
if (!f2.delete()) f2.deleteOnExit();
this.array.mountBLOB(f1);
return true;
return f1;
}
assert i1.hasNext();
assert i2.hasNext();
File newFile = newContainerBLOBFile();
HeapWriter writer = new HeapWriter(newFile, this.array.keylength(), this.array.ordering());
merge(i1, i2, writer);
writer.close(true);
// we don't need the old files any more
if (!f1.delete()) f1.deleteOnExit();
if (!f2.delete()) f2.deleteOnExit();
return newFile;
}
private synchronized void merge(CloneableIterator<ReferenceContainer> i1, CloneableIterator<ReferenceContainer> i2, HeapWriter writer) throws IOException {
assert i1.hasNext();
assert i2.hasNext();
ReferenceContainer c1, c2, c1o, c2o;
c1 = i1.next();
c2 = i2.next();
int e;
while (true) {
assert c1 != null;
@@ -338,14 +358,6 @@ public final class ReferenceContainerArray {
break;
}
// finished with writing
writer.close(true);
// we don't need the old files any more
if (!f1.delete()) f1.deleteOnExit();
if (!f2.delete()) f2.deleteOnExit();
this.array.mountBLOB(newFile);
System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + this.array.entries() + " entries ^^^^^^^^^^^");
return true;
}
}
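Stripped of the file handling, what the new merge(i1, i2, writer) method implements is a classic two-way merge of key-sorted container iterators: the smaller key is written first, and equal keys are combined into a single container before writing. A generic sketch of that loop (modern Java, generic names, not the actual YaCy API):

import java.util.Comparator;
import java.util.Iterator;
import java.util.function.BinaryOperator;
import java.util.function.Consumer;

final class MergeSketch {
    // merge two key-sorted inputs; on equal keys, combine both entries into one
    static <T> void merge(Iterator<T> i1, Iterator<T> i2, Comparator<T> cmp,
            BinaryOperator<T> combine, Consumer<T> out) {
        T a = i1.hasNext() ? i1.next() : null;
        T b = i2.hasNext() ? i2.next() : null;
        while (a != null && b != null) {
            int e = cmp.compare(a, b);
            if (e < 0) { out.accept(a); a = i1.hasNext() ? i1.next() : null; }
            else if (e > 0) { out.accept(b); b = i2.hasNext() ? i2.next() : null; }
            else { // same key in both inputs: merge, then advance both
                out.accept(combine.apply(a, b));
                a = i1.hasNext() ? i1.next() : null;
                b = i2.hasNext() ? i2.next() : null;
            }
        }
        while (a != null) { out.accept(a); a = i1.hasNext() ? i1.next() : null; } // drain input 1
        while (b != null) { out.accept(b); b = i2.hasNext() ? i2.next() : null; } // drain input 2
    }
}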
