From 67aaffc0a2cb85fad3027e62da4d2dc067f5cd95 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 20 Mar 2009 10:21:23 +0000
Subject: [PATCH] - added Latency control to the crawler: because of the
 strongly enhanced indexing speed when using the new IndexCell RWI data
 structures (> 2000 PPM on my notebook), it is now necessary to control the
 crawling speed depending on the response time of the target server (which is
 also YaCy in some intranet indexing use cases). The latency factor in the
 crawl delay is derived from the time that a target host takes to answer HTTP
 requests. For internet domains, the crawl delay is at least twice the
 response time; in intranet cases, the delay is now half of the response time.
- added an API to monitor the latency times of the crawler: a new API at
 /api/latency_p.xml returns the current response times of all domains, the
 time when the crawler last accessed each domain, and several more attributes.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5733 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/api/latency_p.java                |  64 ++++++
 htroot/api/latency_p.xml                 |  12 ++
 htroot/api/webstructure.java             |  22 ++
 source/de/anomic/crawler/Balancer.java   |  18 +-
 source/de/anomic/crawler/CrawlEntry.java | 100 ---------
 source/de/anomic/crawler/FTPLoader.java  |   5 +-
 source/de/anomic/crawler/HTTPLoader.java |  89 +-------
 source/de/anomic/crawler/Latency.java    | 198 ++++++++++++++++++
 source/de/anomic/crawler/NoticedURL.java |   4 +-
 .../text/ReferenceContainerArray.java    |  60 +++---
 10 files changed, 349 insertions(+), 223 deletions(-)
 create mode 100644 htroot/api/latency_p.java
 create mode 100644 htroot/api/latency_p.xml
 create mode 100644 source/de/anomic/crawler/Latency.java

diff --git a/htroot/api/latency_p.java b/htroot/api/latency_p.java
new file mode 100644
index 000000000..8c5a3bc3e
--- /dev/null
+++ b/htroot/api/latency_p.java
@@ -0,0 +1,64 @@
+// latency_p.java
+// ------------
+// (C) 2009 by Michael Peter Christen; mc@yacy.net
+// first published 19.03.2009 on http://yacy.net
+//
+// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
+// $LastChangedRevision: 5723 $
+// $LastChangedBy: borg-0300 $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import java.util.Date;
+import java.util.Iterator;
+import java.util.Map;
+
+import de.anomic.crawler.Latency;
+import de.anomic.crawler.NoticedURL;
+import de.anomic.crawler.Latency.Host;
+import de.anomic.http.httpRequestHeader;
+import de.anomic.kelondro.util.DateFormatter;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+
+public class latency_p {
+    
+    public static serverObjects respond(final httpRequestHeader header, final serverObjects post, final serverSwitch<?> env) {
+        
+        final serverObjects prop = new serverObjects();
+        //final plasmaSwitchboard sb = (plasmaSwitchboard) env;
+        final Iterator<Map.Entry<String, Latency.Host>> i = Latency.iterator();
+        Map.Entry<String, Latency.Host> e;
+        int c = 0;
+        Latency.Host host;
+        while (i.hasNext()) {
+            e = i.next();
+            host = e.getValue();
+            prop.putXML("domains_" + c + "_hosthash", e.getKey());
+            prop.putXML("domains_" + c + "_host", host.host());
+            prop.putXML("domains_" + c + "_lastaccess", DateFormatter.formatShortSecond(new Date(host.lastacc())));
+            prop.put("domains_" + c + "_count", host.count());
+            prop.put("domains_" + c + "_average", host.average());
+            prop.put("domains_" + c + "_robots", host.robotsDelay());
+            prop.put("domains_" + c + "_flux", host.flux(NoticedURL.minimumGlobalDeltaInit));
+            c++;
+        }
+        prop.put("domains", c);
+        
+        // return rewrite properties
+        return prop;
+    }
+    
+}
diff --git a/htroot/api/latency_p.xml b/htroot/api/latency_p.xml
new file mode 100644
index 000000000..df0bc1c78
--- /dev/null
+++ b/htroot/api/latency_p.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<latencies>
+#{domains}#
+<latency hosthash="#[hosthash]#" host="#[host]#">
+  <lastaccess>#[lastaccess]#</lastaccess>
+  <count>#[count]#</count>
+  <average>#[average]#</average>
+  <robots>#[robots]#</robots>
+  <flux>#[flux]#</flux>
+</latency>
+#{/domains}#
+</latencies>
\ No newline at end of file
diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java
index 67fe37f65..a41e5a745 100644
--- a/htroot/api/webstructure.java
+++ b/htroot/api/webstructure.java
@@ -1,3 +1,25 @@
+// webstructure.java
+// ------------
+// (C) 2009 by Michael Peter Christen; mc@yacy.net
+// first published 01.05.2008 on http://yacy.net
+//
+// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $
+// $LastChangedRevision: 5723 $
+// $LastChangedBy: borg-0300 $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.util.Iterator; diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 33dfbb115..dee37b2ee 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -288,7 +288,7 @@ public class Balancer { while (i.hasNext() && c < max) { entry = i.next(); list = entry.getValue(); - if (onlyReadyForAccess && CrawlEntry.waitingRemainingGuessed(list.getFirst(), minimumLocalDelta, minimumGlobalDelta) > 0) continue; + if (onlyReadyForAccess && Latency.waitingRemainingGuessed(list.getFirst(), minimumLocalDelta, minimumGlobalDelta) > 0) continue; if (ram) { urlRAMStack.add(list.removeFirst()); } else try { @@ -415,7 +415,7 @@ public class Balancer { String besthash = null; while (i.hasNext()) { urlhash = i.next(); - waitingtime = CrawlEntry.waitingRemainingGuessed(urlhash, minimumLocalDelta, minimumGlobalDelta); + waitingtime = Latency.waitingRemainingGuessed(urlhash, minimumLocalDelta, minimumGlobalDelta); if (waitingtime == 0) { // zero waiting is a good one result = urlhash; @@ -474,7 +474,7 @@ public class Balancer { while (hitlist.size() > 0) { domhash = hitlist.remove(hitlist.lastKey()); if (maxhash == null) maxhash = domhash; // remember first entry - waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta); + waitingtime = Latency.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta); if (waitingtime < 100) { domlist = domainStacks.get(domhash); result = domlist.removeFirst(); @@ -498,7 +498,7 @@ public class Balancer { while (i.hasNext()) { entry = i.next(); domhash = entry.getKey(); - waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta); + waitingtime = Latency.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta); if (waitingtime == 0) { // zero waiting is a good one domlist = entry.getValue(); @@ -542,7 +542,7 @@ public class Balancer { // check if the time after retrieval of last hash from same // domain is not shorter than the minimumDelta - long waitingtime = CrawlEntry.waitingRemainingGuessed(nexthash, minimumLocalDelta, minimumGlobalDelta); + long waitingtime = Latency.waitingRemainingGuessed(nexthash, minimumLocalDelta, minimumGlobalDelta); if (waitingtime == 0) { // the entry is fine result = new String((top) ? urlFileStack.pop().getColBytes(0) : urlFileStack.pot().getColBytes(0)); @@ -571,7 +571,7 @@ public class Balancer { // at this point we must check if the crawlEntry has relevancy because the crawl profile still exists // if not: return null. 
A calling method must handle the null value and try again if (profile != null && !profile.hasEntry(crawlEntry.profileHandle())) return null; - long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server + long sleeptime = Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server if (delay && sleeptime > 0) { // force a busy waiting here @@ -586,11 +586,7 @@ public class Balancer { sleeptime -= this.lastPrepare - t; } if (sleeptime > 0) try {synchronized(this) { this.wait(sleeptime); }} catch (final InterruptedException e) {} - } - - // update statistical data - crawlEntry.updateAccess(); - + } return crawlEntry; } diff --git a/source/de/anomic/crawler/CrawlEntry.java b/source/de/anomic/crawler/CrawlEntry.java index afea2bb60..beebc8b9c 100755 --- a/source/de/anomic/crawler/CrawlEntry.java +++ b/source/de/anomic/crawler/CrawlEntry.java @@ -29,13 +29,11 @@ package de.anomic.crawler; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Date; -import java.util.concurrent.ConcurrentHashMap; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.NaturalOrder; -import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverProcessorJob; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacyURL; @@ -62,9 +60,6 @@ public class CrawlEntry extends serverProcessorJob { Base64Order.enhancedCoder ); - // a shared domainAccess map for all balancers. the key is a domain-hash (6 bytes) - public static final ConcurrentHashMap domainAccess = new ConcurrentHashMap(); - private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; // if this is generated by a crawl, the own peer hash in entered private String refhash; // the url's referrer hash @@ -83,26 +78,6 @@ public class CrawlEntry extends serverProcessorJob { private String statusMessage; private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection - public static class domaccess { - public long time; - public long robotsMinDelay; - public int count; - public String host; - public domaccess(String host) { - this.host = host; - this.time = System.currentTimeMillis(); - this.robotsMinDelay = 0; - this.count = 0; - } - public void update() { - this.time = System.currentTimeMillis(); - this.count++; - } - public long flux(long range) { - return count >= 1000 ? 
range * Math.min(5000, count) / 1000 : range / (1000 - count); - } - } - /** * @param initiator the hash of the initiator peer @@ -292,79 +267,4 @@ public class CrawlEntry extends serverProcessorJob { return this.profileHandle; } - /** - * check a domain flag so it can be calculated when a domain was accessed the last time - */ - public void updateAccess() { - String domhash = url.hash().substring(6); - domaccess lastAccess = domainAccess.get(domhash); - if (lastAccess == null) { - lastAccess = new domaccess(url.getHost()); - domainAccess.put(domhash, lastAccess); - } else { - lastAccess.update(); - } - } - - /** - * calculates how long should be waited until the domain can be accessed again - * this follows from given minimum access times, the fact that an url is a CGI url or now, the times that the domain was accessed - * and a given minimum access time as given in robots.txt - * @param minimumLocalDelta - * @param minimumGlobalDelta - * @return the remaining waiting time in milliseconds - */ - public long waitingRemaining(final long minimumLocalDelta, final long minimumGlobalDelta) { - final long delta = lastAccessDelta(this.url.hash()); - if (delta == Long.MAX_VALUE) return 0; - final boolean local = this.url.isLocal(); - long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta; - if (this.url.isCGI()) deltaBase = deltaBase * 2; // mostly there is a database access in the background which creates a lot of unwanted IO on target site - domaccess lastAccess = domainAccess.get(this.url.hash().substring(6)); - lastAccess.robotsMinDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(this.url); - final long genericDelta = Math.min( - 60000, - Math.max( - deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)), - (local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay) - ); // prevent that that robots file can stop our indexer completely - return (delta < genericDelta) ? genericDelta - delta : 0; - } - - /** - * guess a minimum waiting time - * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low - * also the 'isCGI' property is missing, because the full text of the domain is unknown here - * @param urlhash - * @param minimumLocalDelta - * @param minimumGlobalDelta - * @return the remaining waiting time in milliseconds - */ - public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) { - final long delta = lastAccessDelta(urlhash); - if (delta == Long.MAX_VALUE) return 0; - final boolean local = yacyURL.isLocal(urlhash); - long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta; - domaccess lastAccess = domainAccess.get(urlhash.substring(6)); - final long genericDelta = Math.min( - 60000, - Math.max( - deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)), - (local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay) - ); // prevent that that robots file can stop our indexer completely - return (delta < genericDelta) ? 
genericDelta - delta : 0; - } - - /** - * calculates the time since the last access of the domain as referenced by the url hash - * @param urlhash - * @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before - */ - private static long lastAccessDelta(final String hash) { - assert hash != null; - assert hash.length() == 6 || hash.length() == 12; - final domaccess lastAccess = domainAccess.get((hash.length() > 6) ? hash.substring(6) : hash); - if (lastAccess == null) return Long.MAX_VALUE; // never accessed - return System.currentTimeMillis() - lastAccess.time; - } } \ No newline at end of file diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index 1ae35b30d..5f1e415ed 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -78,6 +78,8 @@ public class FTPLoader { * @return */ public Document load(final CrawlEntry entry) throws IOException { + + long start = System.currentTimeMillis(); final yacyURL entryUrl = entry.url(); final String fullPath = getPath(entryUrl); @@ -146,7 +148,8 @@ public class FTPLoader { sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server download" + detail); throw new IOException("FTPLoader: Unable to download URL " + entry.url().toString() + detail); } - + + Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start); return htCache; } diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index 425ddf374..f8c12495b 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -99,7 +99,10 @@ public final class HTTPLoader { } public Document load(final CrawlEntry entry, final String parserMode) throws IOException { - return load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT); + long start = System.currentTimeMillis(); + Document doc = load(entry, parserMode, DEFAULT_CRAWLING_RETRY_COUNT); + Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start); + return doc; } private Document load(final CrawlEntry entry, final String parserMode, final int retryCount) throws IOException { @@ -242,90 +245,6 @@ public final class HTTPLoader { } }*/ return htCache; - /* - } catch (final Exception e) { - final String errorMsg = e.getMessage(); - String failreason = null; - - if ((e instanceof IOException) && - (errorMsg != null) && - (errorMsg.indexOf("socket closed") >= 0) && - (Thread.currentThread().isInterrupted()) - ) { - this.log.logInfo("CRAWLER Interruption detected because of server shutdown."); - failreason = ErrorURL.DENIED_SERVER_SHUTDOWN; - } else if (e instanceof httpdLimitExceededException) { - this.log.logWarning("CRAWLER Max file size limit '" + maxFileSize + "' exceeded while downloading URL " + entry.url()); - failreason = ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED; - } else if (e instanceof MalformedURLException) { - this.log.logWarning("CRAWLER Malformed URL '" + entry.url().toString() + "' detected. 
"); - failreason = ErrorURL.DENIED_MALFORMED_URL; - } else if (e instanceof NoRouteToHostException) { - this.log.logWarning("CRAWLER No route to host found while trying to crawl URL '" + entry.url().toString() + "'."); - failreason = ErrorURL.DENIED_NO_ROUTE_TO_HOST; - } else if ((e instanceof UnknownHostException) || - ((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) { - final yacyURL u = (entry.referrerhash() == null) ? null : sb.getURL(entry.referrerhash()); - this.log.logWarning("CRAWLER Unknown host in URL '" + entry.url() + "'. " + - "Referer URL: " + ((u == null) ? "Unknown" : u.toNormalform(true, true))); - failreason = ErrorURL.DENIED_UNKNOWN_HOST; - } else if (e instanceof java.net.BindException) { - this.log.logWarning("CRAWLER BindException detected while trying to download content from '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_BIND_EXCEPTION; - } else if ((errorMsg != null) && ( - (errorMsg.indexOf("Corrupt GZIP trailer") >= 0) || - (errorMsg.indexOf("Not in GZIP format") >= 0) || - (errorMsg.indexOf("Unexpected end of ZLIB") >= 0) - )) { - this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + entry.url().toString() + - "'. Retrying request without using gzip content encoding."); - failreason = ErrorURL.DENIED_CONTENT_DECODING_ERROR; - } else if ((errorMsg != null) && (errorMsg.indexOf("The host did not accept the connection within timeout of") >= 0)) { - this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) { - this.log.logWarning("CRAWLER Read timeout while receiving content from '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) { - this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) { - this.log.logWarning("CRAWLER Connection timeout while receiving content from '" + entry.url().toString() + - "'. Retrying request."); - failreason = ErrorURL.DENIED_CONNECTION_TIMEOUT; - } else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) { - this.log.logWarning("CRAWLER Connection refused while trying to connect to '" + entry.url().toString() + "'."); - failreason = ErrorURL.DENIED_CONNECTION_REFUSED; - } else if ((errorMsg != null) && (errorMsg.indexOf("Circular redirect to '")>= 0)) { - this.log.logWarning("CRAWLER Redirect Error with URL '" + entry.url().toString() + "': "+ e.toString()); - failreason = ErrorURL.DENIED_REDIRECTION_COUNTER_EXCEEDED; - } else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) { - this.log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + entry.url().toString() + "'. " + - "Pausing crawlers. 
"); - sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - failreason = ErrorURL.DENIED_OUT_OF_DISK_SPACE; - } else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) { - this.log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + entry.url().toString() + "'. "); - failreason = ErrorURL.DENIED_NETWORK_IS_UNREACHABLE; - } else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) { - this.log.logSevere("CRAWLER No trusted certificate found for URL '" + entry.url().toString() + "'. "); - failreason = ErrorURL.DENIED_SSL_UNTRUSTED_CERT; - } else { - this.log.logSevere("CRAWLER Unexpected Error with URL '" + entry.url().toString() + "': " + e.toString(), e); - failreason = ErrorURL.DENIED_CONNECTION_ERROR; - } - - if (failreason != null) { - // add url into error db - sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, failreason); - } - return null; - }*/ } } diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java new file mode 100644 index 000000000..e8c31fe27 --- /dev/null +++ b/source/de/anomic/crawler/Latency.java @@ -0,0 +1,198 @@ +// Latency.java +// ------------ +// (C) 2009 by Michael Peter Christen; mc@yacy.net +// first published 19.03.2009 on http://yacy.net +// +// $LastChangedDate: 2009-03-16 19:08:43 +0100 (Mo, 16 Mrz 2009) $ +// $LastChangedRevision: 5723 $ +// $LastChangedBy: borg-0300 $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package de.anomic.crawler;
+
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.yacy.yacyURL;
+
+public class Latency {
+
+    private static final ConcurrentHashMap<String, Host> map = new ConcurrentHashMap<String, Host>();
+    
+    public static void update(String hosthash, String host, long time) {
+        assert hosthash.length() == 6;
+        Host h = map.get(hosthash);
+        if (h == null) {
+            h = new Host(host, time);
+            map.put(hosthash, h);
+        } else {
+            h.update(time);
+        }
+    }
+    
+    public static Host host(String hosthash) {
+        assert hosthash.length() == 6;
+        return map.get(hosthash);
+    }
+    
+    public static int average(String hosthash) {
+        assert hosthash.length() == 6;
+        Host h = map.get(hosthash);
+        if (h == null) return 1000;
+        return h.average();
+    }
+    
+    public static Iterator<Map.Entry<String, Host>> iterator() {
+        return map.entrySet().iterator();
+    }
+    
+    
+    /**
+     * calculate the time since the last access of the domain as referenced by the url hash
+     * @param urlhash
+     * @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before
+     */
+    public static long lastAccessDelta(final String urlhash) {
+        assert urlhash.length() == 12 || urlhash.length() == 6;
+        final Latency.Host host = Latency.host((urlhash.length() == 6) ? urlhash : urlhash.substring(6));
+        if (host == null) return Long.MAX_VALUE; // never accessed
+        return System.currentTimeMillis() - host.lastacc();
+    }
+    
+    
+    /**
+     * guess a minimum waiting time
+     * the time is only a guess and may be too low, because the domain may not yet have
+     * been checked against the robots.txt crawl-delay value; the 'isCGI' property is
+     * also missing, because the full URL is unknown here
+     * @param urlhash
+     * @param minimumLocalDelta
+     * @param minimumGlobalDelta
+     * @return the remaining waiting time in milliseconds
+     */
+    public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) {
+        assert urlhash.length() == 12 || urlhash.length() == 6;
+        Latency.Host latency = Latency.host((urlhash.length() == 6) ? urlhash : urlhash.substring(6));
+        if (latency == null) return 0;
+        
+        final long delta = System.currentTimeMillis() - latency.lastacc();
+        final boolean local = yacyURL.isLocal(urlhash);
+        long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta;
+        final long genericDelta = Math.min(
+                60000,
+                Math.max(
+                    deltaBase + ((latency == null || local) ? 0 : latency.flux(deltaBase)),
+                    (local || latency == null) ? 0 : latency.robotsDelay())
+                ); // prevent that a robots.txt file can stop our indexer completely
+        return (delta < genericDelta) ? genericDelta - delta : 0;
+    }
+    
+    
+    /**
+     * calculates how long should be waited until the domain can be accessed again
+     * this follows from:
+     * - given minimum access times
+     * - the fact that a URL is a CGI URL or not
+     * - the times that the domain was accessed (flux factor)
+     * - the response latency of the domain
+     * - and a given minimum access time as given in robots.txt
+     * @param minimumLocalDelta
+     * @param minimumGlobalDelta
+     * @return the remaining waiting time in milliseconds
+     */
+    public static long waitingRemaining(yacyURL url, final long minimumLocalDelta, final long minimumGlobalDelta) {
+        
+        // first check if the domain was _ever_ accessed before
+        String hosthash = url.hash().substring(6);
+        Host host = host(hosthash);
+        if (host == null) return 0; // no delay
+        
+        // the time since last access to the domain is the basis of the remaining calculation
+        final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
+        
+        // find the minimum waiting time based on the network domain (local or global)
+        final boolean local = url.isLocal();
+        long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
+        
+        // for CGI accesses, we double the minimum time
+        // mostly there is a database access in the background
+        // which creates a lot of unwanted IO on the target site
+        if (url.isCGI()) waiting = waiting * 2;
+        
+        // if we have accessed the domain many times, get slower (the flux factor)
+        if (!local) waiting += host.flux(waiting);
+        
+        // find the delay as given by robots.txt on the target site
+        long robotsDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(url);
+        waiting = Math.max(waiting, robotsDelay);
+        
+        // use the access latency as a rule for how fast we can access the server
+        // this also applies to localhost, but differently, because it is not necessary to
+        // consider so many external accesses
+        waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
+        
+        // prevent that a robots.txt file can stop our indexer completely
+        waiting = Math.min(60000, waiting);
+        
+        // return the time that is remaining
+        System.out.println("Latency: " + (waiting - timeSinceLastAccess));
+        return Math.max(0, waiting - timeSinceLastAccess);
+    }
+    
+    public static final class Host {
+        private long timeacc;
+        private long lastacc;
+        private int count;
+        private String host;
+        private long robotsMinDelay;
+        public Host(String host, long time) {
+            this.host = host;
+            this.timeacc = time;
+            this.count = 1;
+            this.lastacc = System.currentTimeMillis();
+            this.robotsMinDelay = 0;
+        }
+        public void update(long time) {
+            this.lastacc = System.currentTimeMillis();
+            this.timeacc += time;
+            this.count++;
+        }
+        public int count() {
+            return this.count;
+        }
+        public int average() {
+            return (int) (this.timeacc / this.count);
+        }
+        public long lastacc() {
+            return this.lastacc;
+        }
+        public String host() {
+            return this.host;
+        }
+        public void robotsDelay(long ur) {
+            this.robotsMinDelay = ur;
+        }
+        public long robotsDelay() {
+            return this.robotsMinDelay;
+        }
+        public long flux(long range) {
+            return count >= 1000 ? range * Math.min(5000, count) / 1000 : range / (1000 - count);
+        }
+    }
+    
+}
diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java
index ee744f295..aca55fb94 100755
--- a/source/de/anomic/crawler/NoticedURL.java
+++ b/source/de/anomic/crawler/NoticedURL.java
@@ -43,8 +43,8 @@ public class NoticedURL {
     public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
     public static final int STACK_TYPE_MUSIC = 13; // put on music stack
     
-    private static final long minimumLocalDeltaInit  =   0; // the minimum time difference between access of the same local domain
-    private static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
+    public static final long minimumLocalDeltaInit  =   0; // the minimum time difference between access of the same local domain
+    public static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
     
     private Balancer coreStack;  // links found by crawling to depth-1
     private Balancer limitStack; // links found by crawling at target depth
diff --git a/source/de/anomic/kelondro/text/ReferenceContainerArray.java b/source/de/anomic/kelondro/text/ReferenceContainerArray.java
index 3cb01717e..7b71a9ddc 100644
--- a/source/de/anomic/kelondro/text/ReferenceContainerArray.java
+++ b/source/de/anomic/kelondro/text/ReferenceContainerArray.java
@@ -251,30 +251,50 @@ public final class ReferenceContainerArray {
         System.out.println("*** DEBUG mergeOldest: vvvvvvvvv array has " + this.array.entries() + " entries vvvvvvvvv");
         System.out.println("*** DEBUG mergeOldest: unmounted " + f1.getName());
         System.out.println("*** DEBUG mergeOldest: unmounted " + f2.getName());
+        File newFile = merge(f1, f2);
+        if (newFile == null) return true;
+        this.array.mountBLOB(newFile);
+        System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
+        System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + this.array.entries() + " entries ^^^^^^^^^^^");
+        return true;
+    }
+    
+    private synchronized File merge(File f1, File f2) throws IOException {
         // iterate both files and write a new one
         CloneableIterator<ReferenceContainer> i1 = new blobFileEntries(f1, this.payloadrow);
         CloneableIterator<ReferenceContainer> i2 = new blobFileEntries(f2, this.payloadrow);
-        ReferenceContainer c1, c2, c1o, c2o;
-        c1 = (i1.hasNext()) ? i1.next() : null;
-        c2 = (i2.hasNext()) ? i2.next() : null;
-        if (c1 == null && c2 == null) {
-            if (!f1.delete()) f1.deleteOnExit();
-            if (!f2.delete()) f2.deleteOnExit();
-            return true;
-        }
-        if (c1 == null) {
-            if (!f1.delete()) f1.deleteOnExit();
-            this.array.mountBLOB(f2);
-            return true;
-        }
-        if (c2 == null) {
+        if (!i1.hasNext()) {
+            if (i2.hasNext()) {
+                if (!f1.delete()) f1.deleteOnExit();
+                return f2;
+            } else {
+                if (!f1.delete()) f1.deleteOnExit();
+                if (!f2.delete()) f2.deleteOnExit();
+                return null;
+            }
+        } else if (!i2.hasNext()) {
             if (!f2.delete()) f2.deleteOnExit();
-            this.array.mountBLOB(f1);
-            return true;
+            return f1;
         }
+        assert i1.hasNext();
+        assert i2.hasNext();
         File newFile = newContainerBLOBFile();
         HeapWriter writer = new HeapWriter(newFile, this.array.keylength(), this.array.ordering());
+        merge(i1, i2, writer);
+        writer.close(true);
+        // we don't need the old files any more
+        if (!f1.delete()) f1.deleteOnExit();
+        if (!f2.delete()) f2.deleteOnExit();
+        return newFile;
+    }
+    
+    private synchronized void merge(CloneableIterator<ReferenceContainer> i1, CloneableIterator<ReferenceContainer> i2, HeapWriter writer) throws IOException {
+        assert i1.hasNext();
+        assert i2.hasNext();
+        ReferenceContainer c1, c2, c1o, c2o;
+        c1 = i1.next();
+        c2 = i2.next();
         int e;
         while (true) {
             assert c1 != null;
@@ -338,14 +358,6 @@ public final class ReferenceContainerArray {
             break;
         }
         // finished with writing
-        writer.close(true);
-        // we don't need the old files any more
-        if (!f1.delete()) f1.deleteOnExit();
-        if (!f2.delete()) f2.deleteOnExit();
-        this.array.mountBLOB(newFile);
-        System.out.println("*** DEBUG mergeOldest: mounted " + newFile.getName());
-        System.out.println("*** DEBUG mergeOldest: ^^^^^^^^^^^ array has " + this.array.entries() + " entries ^^^^^^^^^^^");
-        return true;
     }
 }
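
The delay policy described in the commit message can be condensed into a small,
self-contained sketch. This is an illustration only, not code contained in the
patch: the class, method, and parameter names below are hypothetical, and the
flux factor and time-since-last-access bookkeeping of Latency.waitingRemaining()
are omitted, so that only the response-time rule remains (at least twice the
measured response time for internet hosts, half of it for intranet hosts,
bounded below by robots.txt and capped at 60 seconds):

    // Illustrative sketch only; mirrors the policy of Latency.waitingRemaining().
    // All names here are hypothetical and not part of the patch.
    public final class CrawlDelaySketch {

        static long crawlDelay(final long minimumDelta,          // minimumLocalDelta or minimumGlobalDelta
                               final long averageResponseMillis, // measured response time, see Host.average()
                               final boolean local,              // intranet (true) or internet (false)
                               final boolean cgi,                // the URL is a CGI URL
                               final long robotsDelayMillis) {   // Crawl-delay from robots.txt, 0 if none
            long waiting = minimumDelta;
            if (cgi) waiting *= 2;                          // CGI pages usually hit a database
            waiting = Math.max(waiting, robotsDelayMillis); // honor robots.txt
            waiting = Math.max(waiting, local ? averageResponseMillis / 2
                                              : averageResponseMillis * 2);
            return Math.min(60000, waiting);                // a robots.txt must not stall the crawler
        }

        public static void main(final String[] args) {
            // an internet host answering in 400 ms, with minimumGlobalDeltaInit = 500: prints 800
            System.out.println(crawlDelay(500, 400, false, false, 0));
            // the same host on an intranet, with minimumLocalDeltaInit = 0: prints 200
            System.out.println(crawlDelay(0, 400, true, false, 0));
        }
    }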