diff --git a/htroot/api/latency_p.java b/htroot/api/latency_p.java
new file mode 100644
index 000000000..8c5a3bc3e
--- /dev/null
+++ b/htroot/api/latency_p.java
@@ -0,0 +1,64 @@
+// latency_p.java
+// ------------
+// (C) 2009 by Michael Peter Christen; mc@yacy.net
+// first published 19.03.2009 on http://yacy.net
+//
+// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
+// $LastChangedRevision: 5723 $
+// $LastChangedBy: borg-0300 $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Map;
+
+import de.anomic.crawler.Latency;
+import de.anomic.crawler.NoticedURL;
+import de.anomic.crawler.Latency.Host;
+import de.anomic.http.httpRequestHeader;
+import de.anomic.kelondro.util.DateFormatter;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+
+public class latency_p {
+
+    public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch env) {
+
+        final serverObjects prop = new serverObjects();
+        //final plasmaSwitchboard sb = (plasmaSwitchboard) env;
+        final Iterator<Map.Entry<String, Latency.Host>> i = Latency.iterator();
+        Map.Entry<String, Latency.Host> e;
+        int c = 0;
+        Latency.Host host;
+        while (i.hasNext()) {
+            e = i.next();
+            host = e.getValue();
+            prop.putXML("domains_" + c + "_hosthash", e.getKey());
+            prop.putXML("domains_" + c + "_host", host.host());
+            prop.putXML("domains_" + c + "_lastaccess", DateFormatter.formatShortSecond(new Date(host.lastacc())));
+            prop.put("domains_" + c + "_count", host.count());
+            prop.put("domains_" + c + "_average", host.average());
+            prop.put("domains_" + c + "_robots", host.robotsDelay());
+            prop.put("domains_" + c + "_flux", host.flux(NoticedURL.minimumGlobalDeltaInit));
+            c++;
+        }
+        prop.put("domains", c);
+
+        // return rewrite properties
+        return prop;
+    }
+
+}
diff --git a/htroot/api/latency_p.xml b/htroot/api/latency_p.xml
new file mode 100644
index 000000000..df0bc1c78
--- /dev/null
+++ b/htroot/api/latency_p.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<latency>
+#{domains}#
+  <host hosthash="#[hosthash]#" host="#[host]#">
+    <lastaccess>#[lastaccess]#</lastaccess>
+    <count>#[count]#</count>
+    <average>#[average]#</average>
+    <robots>#[robots]#</robots>
+    <flux>#[flux]#</flux>
+  </host>
+#{/domains}#
+</latency>
\ No newline at end of file
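
Note, for orientation: the servlet above fills numbered domains_<n>_<field> keys into the serverObjects, and the YaCy template engine repeats the #{domains}#...#{/domains}# block of the template once per entry, substituting each #[field]# placeholder. A rendered response for a single tracked host might look like this (host name, hash and all values are invented for illustration):

    <?xml version="1.0"?>
    <latency>
      <host hosthash="DlFLzR" host="example.net">
        <lastaccess>20090319161005</lastaccess>
        <count>42</count>
        <average>380</average>
        <robots>2000</robots>
        <flux>0</flux>
      </host>
    </latency>
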
diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java
index 67fe37f65..a41e5a745 100644
--- a/htroot/api/webstructure.java
+++ b/htroot/api/webstructure.java
@@ -1,3 +1,25 @@
+// webstructure.java
+// ------------
+// (C) 2009 by Michael Peter Christen; mc@yacy.net
+// first published 01.05.2008 on http://yacy.net
+//
+// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
+// $LastChangedRevision: 5723 $
+// $LastChangedBy: borg-0300 $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
 import java.util.Iterator;
 
diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java
index 33dfbb115..dee37b2ee 100644
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@@ -288,7 +288,7 @@ public class Balancer {
         while (i.hasNext() && c < max) {
             entry = i.next();
             list = entry.getValue();
-            if (onlyReadyForAccess && CrawlEntry.waitingRemainingGuessed(list.getFirst(), minimumLocalDelta, minimumGlobalDelta) > 0) continue;
+            if (onlyReadyForAccess && Latency.waitingRemainingGuessed(list.getFirst(), minimumLocalDelta, minimumGlobalDelta) > 0) continue;
             if (ram) {
                 urlRAMStack.add(list.removeFirst());
             } else try {
@@ -415,7 +415,7 @@ public class Balancer {
         String besthash = null;
         while (i.hasNext()) {
             urlhash = i.next();
-            waitingtime = CrawlEntry.waitingRemainingGuessed(urlhash, minimumLocalDelta, minimumGlobalDelta);
+            waitingtime = Latency.waitingRemainingGuessed(urlhash, minimumLocalDelta, minimumGlobalDelta);
             if (waitingtime == 0) {
                 // zero waiting is a good one
                 result = urlhash;
@@ -474,7 +474,7 @@ public class Balancer {
         while (hitlist.size() > 0) {
             domhash = hitlist.remove(hitlist.lastKey());
             if (maxhash == null) maxhash = domhash; // remember first entry
-            waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
+            waitingtime = Latency.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
             if (waitingtime < 100) {
                 domlist = domainStacks.get(domhash);
                 result = domlist.removeFirst();
@@ -498,7 +498,7 @@ public class Balancer {
         while (i.hasNext()) {
             entry = i.next();
             domhash = entry.getKey();
-            waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
+            waitingtime = Latency.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta);
             if (waitingtime == 0) {
                 // zero waiting is a good one
                 domlist = entry.getValue();
@@ -542,7 +542,7 @@ public class Balancer {
 
         // check if the time after retrieval of last hash from same
         // domain is not shorter than the minimumDelta
-        long waitingtime = CrawlEntry.waitingRemainingGuessed(nexthash, minimumLocalDelta, minimumGlobalDelta);
+        long waitingtime = Latency.waitingRemainingGuessed(nexthash, minimumLocalDelta, minimumGlobalDelta);
         if (waitingtime == 0) {
             // the entry is fine
             result = new String((top) ? urlFileStack.pop().getColBytes(0) : urlFileStack.pot().getColBytes(0));
@@ -571,7 +571,7 @@ public class Balancer {
         // at this point we must check if the crawlEntry has relevancy because the crawl profile still exists
         // if not: return null. A calling method must handle the null value and try again
         if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) return null;
-        long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+        long sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
 
         if (delay && sleeptime > 0) {
             // force a busy waiting here
@@ -586,11 +586,7 @@ public class Balancer {
                 sleeptime -= this.lastPrepare - t;
             }
             if (sleeptime > 0) try {synchronized(this) { this.wait(sleeptime); }} catch (final InterruptedException e) {}
-        }
-
-        // update statistical data
-        crawlEntry.updateAccess();
-
+        }
         return crawlEntry;
     }
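
The division of labor in these Balancer changes: waitingRemainingGuessed works on the url hash alone and only touches the in-memory Latency map, so it is cheap enough for the selection loops above, while waitingRemaining needs the full yacyURL and may trigger a robots.txt fetch, so it is only called once a candidate entry has been chosen. A hypothetical caller, to illustrate the contract (popReadyEntry and fetchEntry are stand-ins, not YaCy methods):

    private CrawlEntry popReadyEntry(final Iterator<String> candidates) throws InterruptedException {
        while (candidates.hasNext()) {
            final String urlhash = candidates.next();
            // cheap guess from the shared Latency map; no robots.txt access
            if (Latency.waitingRemainingGuessed(urlhash, NoticedURL.minimumLocalDeltaInit, NoticedURL.minimumGlobalDeltaInit) > 0) continue;
            final CrawlEntry entry = fetchEntry(urlhash); // stand-in for the stack lookup
            // exact remaining time; this may load robots.txt from the target server
            final long sleeptime = Latency.waitingRemaining(entry.url(), NoticedURL.minimumLocalDeltaInit, NoticedURL.minimumGlobalDeltaInit);
            if (sleeptime > 0) synchronized (this) { this.wait(sleeptime); } // busy waiting, as in Balancer
            return entry;
        }
        return null;
    }
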
diff --git a/source/de/anomic/crawler/CrawlEntry.java b/source/de/anomic/crawler/CrawlEntry.java
index afea2bb60..beebc8b9c 100755
--- a/source/de/anomic/crawler/CrawlEntry.java
+++ b/source/de/anomic/crawler/CrawlEntry.java
@@ -29,13 +29,11 @@ package de.anomic.crawler;
 
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.util.Date;
-import java.util.concurrent.ConcurrentHashMap;
 
 import de.anomic.kelondro.index.Row;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.order.Bitfield;
 import de.anomic.kelondro.order.NaturalOrder;
-import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverProcessorJob;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.yacyURL;
@@ -62,9 +60,6 @@ public class CrawlEntry extends serverProcessorJob {
             Base64Order.enhancedCoder
     );
 
-    // a shared domainAccess map for all balancers. the key is a domain-hash (6 bytes)
-    public static final ConcurrentHashMap<String, domaccess> domainAccess = new ConcurrentHashMap<String, domaccess>();
-
     private String   initiator;     // the initiator hash, is NULL or "" if it is the own proxy;
                                     // if this is generated by a crawl, the own peer hash in entered
     private String   refhash;       // the url's referrer hash
@@ -83,26 +78,6 @@ public class CrawlEntry extends serverProcessorJob {
     private String statusMessage;
     private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
 
-    public static class domaccess {
-        public long time;
-        public long robotsMinDelay;
-        public int count;
-        public String host;
-        public domaccess(String host) {
-            this.host = host;
-            this.time = System.currentTimeMillis();
-            this.robotsMinDelay = 0;
-            this.count = 0;
-        }
-        public void update() {
-            this.time = System.currentTimeMillis();
-            this.count++;
-        }
-        public long flux(long range) {
-            return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
-        }
-    }
-
     /**
      * @param initiator the hash of the initiator peer
@@ -292,79 +267,4 @@ public class CrawlEntry extends serverProcessorJob {
         return this.profileHandle;
     }
 
-    /**
-     * check a domain flag so it can be calculated when a domain was accessed the last time
-     */
-    public void updateAccess() {
-        String domhash = url.hash().substring(6);
-        domaccess lastAccess = domainAccess.get(domhash);
-        if (lastAccess == null) {
-            lastAccess = new domaccess(url.getHost());
-            domainAccess.put(domhash, lastAccess);
-        } else {
-            lastAccess.update();
-        }
-    }
-
-    /**
-     * calculates how long should be waited until the domain can be accessed again
-     * this follows from given minimum access times, the fact that an url is a CGI url or now, the times that the domain was accessed
-     * and a given minimum access time as given in robots.txt
-     * @param minimumLocalDelta
-     * @param minimumGlobalDelta
-     * @return the remaining waiting time in milliseconds
-     */
-    public long waitingRemaining(final long minimumLocalDelta, final long minimumGlobalDelta) {
-        final long delta = lastAccessDelta(this.url.hash());
-        if (delta == Long.MAX_VALUE) return 0;
-        final boolean local = this.url.isLocal();
-        long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
-        if (this.url.isCGI()) deltaBase = deltaBase * 2; // mostly there is a database access in the background which creates a lot of unwanted IO on target site
-        domaccess lastAccess = domainAccess.get(this.url.hash().substring(6));
-        lastAccess.robotsMinDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(this.url);
-        final long genericDelta = Math.min(
-                60000,
-                Math.max(
-                    deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)),
-                    (local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay)
-                ); // prevent that that robots file can stop our indexer completely
-        return (delta < genericDelta) ? genericDelta - delta : 0;
-    }
-
-    /**
-     * guess a minimum waiting time
-     * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low
-     * also the 'isCGI' property is missing, because the full text of the domain is unknown here
-     * @param urlhash
-     * @param minimumLocalDelta
-     * @param minimumGlobalDelta
-     * @return the remaining waiting time in milliseconds
-     */
-    public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) {
-        final long delta = lastAccessDelta(urlhash);
-        if (delta == Long.MAX_VALUE) return 0;
-        final boolean local = yacyURL.isLocal(urlhash);
-        long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
-        domaccess lastAccess = domainAccess.get(urlhash.substring(6));
-        final long genericDelta = Math.min(
-                60000,
-                Math.max(
-                    deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)),
-                    (local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay)
-                ); // prevent that that robots file can stop our indexer completely
-        return (delta < genericDelta) ? genericDelta - delta : 0;
-    }
-
-    /**
-     * calculates the time since the last access of the domain as referenced by the url hash
-     * @param urlhash
-     * @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
-     */
-    private static long lastAccessDelta(final String hash) {
-        assert hash != null;
-        assert hash.length() == 6 || hash.length() == 12;
-        final domaccess lastAccess = domainAccess.get((hash.length() > 6) ? hash.substring(6) : hash);
-        if (lastAccess == null) return Long.MAX_VALUE; // never accessed
-        return System.currentTimeMillis() - lastAccess.time;
-    }
 }
\ No newline at end of file
diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java
index 1ae35b30d..5f1e415ed 100644
--- a/source/de/anomic/crawler/FTPLoader.java
+++ b/source/de/anomic/crawler/FTPLoader.java
@@ -78,6 +78,8 @@ public class FTPLoader {
      * @return
      */
     public Document load(final CrawlEntry entry) throws IOException {
+
+        long start = System.currentTimeMillis();
         final yacyURL entryUrl = entry.url();
         final String fullPath = getPath(entryUrl);
@@ -146,7 +148,8 @@ public class FTPLoader {
             sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server download" + detail);
             throw new IOException("FTPLoader: Unable to download URL " + entry.url().toString() + detail);
         }
-
+
+        Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
         return htCache;
     }
diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java
index 425ddf374..f8c12495b 100644
--- a/source/de/anomic/crawler/HTTPLoader.java
+++ b/source/de/anomic/crawler/HTTPLoader.java
@@ -99,7 +99,10 @@
     }
 
     public Document load(final CrawlEntry entry, final String parserMode) throws IOException {
-        return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
+        long start = System.currentTimeMillis();
+        Document doc = load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT);
+        Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
+        return doc;
     }
 
     private Document load(final CrawlEntry entry, final String parserMode, final int retryCount) throws IOException {
@@ -242,90 +245,6 @@
             }
         }*/
         return htCache;
 
-        /*
-        } catch (final Exception e) {
-            final String errorMsg = e.getMessage();
-            String failreason = null;
-
-            if ((e instanceof IOException) &&
-                (errorMsg != null) &&
-                (errorMsg.indexOf("socket closed") >= 0) &&
-                (Thread.currentThread().isInterrupted())
-            ) {
-                this.log.logInfo("CRAWLER Interruption detected because of server shutdown.");
-                failreason = ErrorURL.DENIED_SERVER_SHUTDOWN;
-            } else if (e instanceof httpdLimitExceededException) {
-                this.log.logWarning("CRAWLER Max file size limit '" + maxFileSize + "' exceeded while downloading URL " + entry.url());
-                failreason = ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED;
-            } else if (e instanceof MalformedURLException) {
-                this.log.logWarning("CRAWLER Malformed URL '" + entry.url().toString() + "' detected. ");
-                failreason = ErrorURL.DENIED_MALFORMED_URL;
-            } else if (e instanceof NoRouteToHostException) {
-                this.log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + entry.url().toString() + "'.");
-                failreason = ErrorURL.DENIED_NO_ROUTE_TO_HOST;
-            } else if ((e instanceof UnknownHostException) ||
-                       ((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) {
-                final yacyURL u = (entry.referrerhash() == null) ? null : sb.getURL(entry.referrerhash());
-                this.log.logWarning("CRAWLER Unknown host in URL '" + entry.url() + "'. " +
-                                    "Referer URL: " + ((u == null) ? "Unknown" : u.toNormalform(true, true)));
-                failreason = ErrorURL.DENIED_UNKNOWN_HOST;
-            } else if (e instanceof java.net.BindException) {
-                this.log.logWarning("CRAWLER BindException detected while trying to download content from '" + entry.url().toString() +
-                                    "'. Retrying request.");
-                failreason = ErrorURL.DENIED_CONNECTION_BIND_EXCEPTION;
-            } else if ((errorMsg != null) && (
-                    (errorMsg.indexOf("Corrupt GZIP trailer") >= 0) ||
-                    (errorMsg.indexOf("Not in GZIP format") >= 0) ||
-                    (errorMsg.indexOf("Unexpected end of ZLIB") >= 0)
-            )) {
-                this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + entry.url().toString() +
-                                    "'. Retrying request without using gzip content encoding.");
-                failreason = ErrorURL.DENIED_CONTENT_DECODING_ERROR;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("The host did not accept the connection within timeout of") >= 0)) {
-                this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() +
-                                    "'. Retrying request.");
-                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) {
-                this.log.logWarning("CRAWLER Read timeout while receiving content from '" + entry.url().toString() +
-                                    "'. Retrying request.");
-                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) {
-                this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() +
-                                    "'. Retrying request.");
-                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) {
-                this.log.logWarning("CRAWLER Connection timeout while receiving content from '" + entry.url().toString() +
-                                    "'. Retrying request.");
-                failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) {
-                this.log.logWarning("CRAWLER Connection refused while trying to connect to '" + entry.url().toString() + "'.");
-                failreason = ErrorURL.DENIED_CONNECTION_REFUSED;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("Circular redirect to '")>= 0)) {
-                this.log.logWarning("CRAWLER Redirect Error with URL '" + entry.url().toString() + "': "+ e.toString());
-                failreason = ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) {
-                this.log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + entry.url().toString() + "'. " +
-                                   "Pausing crawlers.");
-                sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-                sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
-                failreason = ErrorURL.DENIED_OUT_OF_DISK_SPACE;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) {
-                this.log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + entry.url().toString() + "'. ");
-                failreason = ErrorURL.DENIED_NETWORK_IS_UNREACHABLE;
-            } else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) {
-                this.log.logSevere("CRAWLER No trusted certificate found for URL '" + entry.url().toString() + "'. ");
-                failreason = ErrorURL.DENIED_SSL_UNTRUSTED_CERT;
-            } else {
-                this.log.logSevere("CRAWLER Unexpected Error with URL '" + entry.url().toString() + "': " + e.toString(), e);
-                failreason = ErrorURL.DENIED_CONNECTION_ERROR;
-            }
-
-            if (failreason != null) {
-                // add url into error db
-                sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, failreason);
-            }
-            return null;
-        }*/
     }
 }
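
Both loaders now follow the same measuring pattern: take a timestamp, run the complete load (for HTTP this includes any internal redirect retries, because the measurement wraps the retry-counting overload), and record the elapsed time under the 6-byte domain hash, which is the tail of the 12-byte url hash. A minimal sketch of the pattern (doLoad is a stand-in for the actual transfer):

    long start = System.currentTimeMillis();
    Document doc = doLoad(entry); // stand-in for the real FTP/HTTP download
    Latency.update(entry.url().hash().substring(6), // 6-byte domain part of the 12-byte url hash
                   entry.url().getHost(),
                   System.currentTimeMillis() - start); // measured wall-clock load time
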
"); - failreason = ErrorURL.DENIED_MALFORMED_URL; - } else if (e instanceof NoRouteToHostException) { - this.log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + entry.url().toString() + "'."); - failreason = ErrorURL.DENIED_NO_ROUTE_TO_HOST; - } else if ((e instanceof UnknownHostException) || - ((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) { - final yacyURL u = (entry.referrerhash() == null) ? null : sb.getURL(entry.referrerhash()); - this.log.logWarning("CRAWLER Unknown host in URL '" + entry.url() + "'. " + - "Referer URL: " + ((u == null) ? "Unknown" : u.toNormalform(true, true))); - failreason = ErrorURL.DENIED_UNKNOWN_HOST; - } else if (e instanceof java.net.BindException) { - this.log.logWarning("CRAWLER BindException detected while trying to download content from '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_BIND_EXCEPTION; - } else if ((errorMsg != null) && ( - (errorMsg.indexOf("Corrupt GZIP trailer") >= 0) || - (errorMsg.indexOf("Not in GZIP format") >= 0) || - (errorMsg.indexOf("Unexpected end of ZLIB") >= 0) - )) { - this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + entry.url().toString() + - "'. Retrying request without using gzip content encoding."); - failreason = ErrorURL.DENIED_CONTENT_DECODING_ERROR; - } else if ((errorMsg != null) && (errorMsg.indexOf("The host did not accept the connection within timeout of") >= 0)) { - this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) { - this.log.logWarning("CRAWLER Read timeout while receiving content from '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) { - this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) { - this.log.logWarning("CRAWLER Connection timeout while receiving content from '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) { - this.log.logWarning("CRAWLER Connection refused while trying to connect to '" + entry.url().toString() + "'."); - failreason = ErrorURL.DENIED_CONNECTION_REFUSED; - } else if ((errorMsg != null) && (errorMsg.indexOf("Circular redirect to '")>= 0)) { - this.log.logWarning("CRAWLER Redirect Error with URL '" + entry.url().toString() + "': "+ e.toString()); - failreason = ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED; - } else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) { - this.log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + entry.url().toString() + "'. " + - "Pausing crawlers. 
"); - sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - failreason = ErrorURL.DENIED_OUT_OF_DISK_SPACE; - } else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) { - this.log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + entry.url().toString() + "'. "); - failreason = ErrorURL.DENIED_NETWORK_IS_UNREACHABLE; - } else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) { - this.log.logSevere("CRAWLER No trusted certificate found for URL '" + entry.url().toString() + "'. "); - failreason = ErrorURL.DENIED_SSL_UNTRUSTED_CERT; - } else { - this.log.logSevere("CRAWLER Unexpected Error with URL '" + entry.url().toString() + "': " + e.toString(), e); - failreason = ErrorURL.DENIED_CONNECTION_ERROR; - } - - if (failreason != null) { - // add url into error db - sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, failreason); - } - return null; - }*/ } } diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java new file mode 100644 index 000000000..e8c31fe27 --- /dev/null +++ b/source/de/anomic/crawler/Latency.java @@ -0,0 +1,198 @@ +// Latency.java +// ------------ +// (C) 2009 by Michael Peter Christen; mc@yacy.net +// first published 19.03.2009 on http://yacy.net +// +// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $ +// $LastChangedRevision: 5723 $ +// $LastChangedBy: borg-0300 $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+    public static final class Host {
+        private long timeacc;
+        private long lastacc;
+        private int count;
+        private String host;
+        private long robotsMinDelay;
+        public Host(String host, long time) {
+            this.host = host;
+            this.timeacc = time;
+            this.count = 1;
+            this.lastacc = System.currentTimeMillis();
+            this.robotsMinDelay = 0;
+        }
+        public void update(long time) {
+            this.lastacc = System.currentTimeMillis();
+            this.timeacc += time;
+            this.count++;
+        }
+        public int count() {
+            return this.count;
+        }
+        public int average() {
+            return (int) (this.timeacc / this.count);
+        }
+        public long lastacc() {
+            return this.lastacc;
+        }
+        public String host() {
+            return this.host;
+        }
+        public void robotsDelay(long ur) {
+            this.robotsMinDelay = ur;
+        }
+        public long robotsDelay() {
+            return this.robotsMinDelay;
+        }
+        public long flux(long range) {
+            return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
+        }
+    }
+
+}
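
To see how the rules in waitingRemaining compose, here is a hypothetical walkthrough for a remote, non-CGI url (all numbers invented):

    // minimumGlobalDelta = 500 ms, host.count() = 900, host.average() = 800 ms,
    // robots.txt Crawl-delay = 2 s, last access was 1500 ms ago
    long waiting = 500;                           // global minimum delta
    waiting += 500 / (1000 - 900);                // flux(500) = 5      -> 505 ms
    waiting = Math.max(waiting, 2000);            // robots.txt delay   -> 2000 ms
    waiting = Math.max(waiting, 2 * 800);         // 2x average latency -> still 2000 ms
    waiting = Math.min(60000, waiting);           // cap                -> 2000 ms
    long remaining = Math.max(0, waiting - 1500); // 500 ms left to wait
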
diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java
index ee744f295..aca55fb94 100755
--- a/source/de/anomic/crawler/NoticedURL.java
+++ b/source/de/anomic/crawler/NoticedURL.java
@@ -43,8 +43,8 @@ public class NoticedURL {
     public static final int STACK_TYPE_MOVIE    = 12; // put on movie stack
     public static final int STACK_TYPE_MUSIC    = 13; // put on music stack
 
-    private static final long minimumLocalDeltaInit  =   0; // the minimum time difference between access of the same local domain
-    private static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
+    public static final long minimumLocalDeltaInit  =   0; // the minimum time difference between access of the same local domain
+    public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
 
     private Balancer coreStack;      // links found by crawling to depth-1
     private Balancer limitStack;     // links found by crawling at target depth
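
The two delta constants change from private to public because the new latency_p servlet reads NoticedURL.minimumGlobalDeltaInit to render the flux column of its XML output.
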
diff --git a/source/de/anomic/kelondro/text/ReferenceContainerArray.java b/source/de/anomic/kelondro/text/ReferenceContainerArray.java
index 3cb01717e..7b71a9ddc 100644
--- a/source/de/anomic/kelondro/text/ReferenceContainerArray.java
+++ b/source/de/anomic/kelondro/text/ReferenceContainerArray.java
@@ -251,30 +251,50 @@
         System.out.println("*** DEBUG mergeOldest: vvvvvvvvv array has " + this.array.entries() + " entries vvvvvvvvv");
         System.out.println("*** DEBUG mergeOldest: unmounted " + f1.getName());
         System.out.println("*** DEBUG mergeOldest: unmounted " + f2.getName());
+        File newFile = merge(f1, f2);
+        if (newFile == null) return true;
+        this.array.mountBLOB(newFile);
+        System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
+        System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + this.array.entries() + " entries ^^^^^^^^^^^");
+        return true;
+    }
+
+    private synchronized File merge(File f1, File f2) throws IOException {
         // iterate both files and write a new one
         CloneableIterator<ReferenceContainer> i1 = new blobFileEntries(f1, this.payloadrow);
         CloneableIterator<ReferenceContainer> i2 = new blobFileEntries(f2, this.payloadrow);
-        ReferenceContainer c1, c2, c1o, c2o;
-        c1 = (i1.hasNext()) ? i1.next() : null;
-        c2 = (i2.hasNext()) ? i2.next() : null;
-        if (c1 == null && c2 == null) {
-            if (!f1.delete()) f1.deleteOnExit();
-            if (!f2.delete()) f2.deleteOnExit();
-            return true;
-        }
-        if (c1 == null) {
-            if (!f1.delete()) f1.deleteOnExit();
-            this.array.mountBLOB(f2);
-            return true;
-        }
-        if (c2 == null) {
+        if (!i1.hasNext()) {
+            if (i2.hasNext()) {
+                if (!f1.delete()) f1.deleteOnExit();
+                return f2;
+            } else {
+                if (!f1.delete()) f1.deleteOnExit();
+                if (!f2.delete()) f2.deleteOnExit();
+                return null;
+            }
+        } else if (!i2.hasNext()) {
             if (!f2.delete()) f2.deleteOnExit();
-            this.array.mountBLOB(f1);
-            return true;
+            return f1;
         }
+        assert i1.hasNext();
+        assert i2.hasNext();
         File newFile = newContainerBLOBFile();
         HeapWriter writer = new HeapWriter(newFile, this.array.keylength(), this.array.ordering());
+        merge(i1, i2, writer);
+        writer.close(true);
+        // we don't need the old files any more
+        if (!f1.delete()) f1.deleteOnExit();
+        if (!f2.delete()) f2.deleteOnExit();
+        return newFile;
+    }
+
+    private synchronized void merge(CloneableIterator<ReferenceContainer> i1, CloneableIterator<ReferenceContainer> i2, HeapWriter writer) throws IOException {
+        assert i1.hasNext();
+        assert i2.hasNext();
+        ReferenceContainer c1, c2, c1o, c2o;
+        c1 = i1.next();
+        c2 = i2.next();
         int e;
         while (true) {
             assert c1 != null;
@@ -338,14 +358,6 @@ public final class ReferenceContainerArray {
             break;
         }
         // finished with writing
-        writer.close(true);
-        // we don't need the old files any more
-        if (!f1.delete()) f1.deleteOnExit();
-        if (!f2.delete()) f2.deleteOnExit();
-        this.array.mountBLOB(newFile);
-        System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
-        System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + this.array.entries() + " entries ^^^^^^^^^^^");
-        return true;
     }
 }
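
The loop body between the two hunks is unchanged by this refactoring and therefore not shown, but for orientation: it performs a classic two-way merge over the two key-ordered container iterators, which is why the extracted merge(i1, i2, writer) method insists on both iterators being non-empty. A generic sketch of that shape (illustrative only; the real code additionally merges two containers that share the same key into one before writing, using java.util.Iterator and java.util.function.Consumer here for brevity):

    static <T extends Comparable<T>> void mergeSorted(Iterator<T> i1, Iterator<T> i2, Consumer<T> out) {
        T a = i1.next(), b = i2.next(); // both iterators are known to be non-empty
        while (a != null && b != null) {
            final int e = a.compareTo(b);
            if (e < 0)      { out.accept(a); a = i1.hasNext() ? i1.next() : null; }
            else if (e > 0) { out.accept(b); b = i2.hasNext() ? i2.next() : null; }
            else {          // equal keys: the real code writes one merged container here
                out.accept(a);
                a = i1.hasNext() ? i1.next() : null;
                b = i2.hasNext() ? i2.next() : null;
            }
        }
        // drain whichever side still has entries
        while (a != null) { out.accept(a); a = i1.hasNext() ? i1.next() : null; }
        while (b != null) { out.accept(b); b = i2.hasNext() ? i2.next() : null; }
    }
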