From 1bbf362cef0e45b6dd70f0d13c49cf08442b72f1 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 17 Sep 2008 21:45:21 +0000 Subject: [PATCH] update to the crawl balancer: better organization and better crawl delay prediction git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5176 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- htroot/PerformanceQueues_p.java | 3 +- source/de/anomic/crawler/Balancer.java | 271 +++++++++++------- source/de/anomic/crawler/CrawlEntry.java | 102 +++++++ source/de/anomic/crawler/NoticedURL.java | 31 +- .../de/anomic/plasma/plasmaSwitchboard.java | 5 +- 6 files changed, 280 insertions(+), 134 deletions(-) diff --git a/build.properties b/build.properties index 6dc3cdde8..4ed50b995 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.60 +releaseVersion=0.601 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index 7f26a99bc..5e831dd1c 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -267,8 +267,7 @@ public class PerformanceQueues_p { final long minimumGlobalDelta = post.getLong("minimumGlobalDelta", switchboard.crawlQueues.noticeURL.getMinimumGlobalDelta()); switchboard.setConfig("minimumLocalDelta", minimumLocalDelta); switchboard.setConfig("minimumGlobalDelta", minimumGlobalDelta); - switchboard.crawlQueues.noticeURL.setMinimumLocalDelta(minimumLocalDelta); - switchboard.crawlQueues.noticeURL.setMinimumGlobalDelta(minimumGlobalDelta); + switchboard.crawlQueues.noticeURL.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); } // delta settings diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 
68b232d02..e22ca861c 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -37,10 +37,8 @@ import de.anomic.kelondro.kelondroEcoTable; import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroStack; -import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; -import de.anomic.yacy.yacyURL; public class Balancer { @@ -48,9 +46,6 @@ public class Balancer { private static final String indexSuffix = "9.db"; private static final int EcoFSBufferSize = 200; - // a shared domainAccess map for all balancers. the key is a domain-hash (6 bytes) - public static final ConcurrentHashMap domainAccess = new ConcurrentHashMap(); - // definition of payload for fileStack private static final kelondroRow stackrow = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0); @@ -64,26 +59,11 @@ public class Balancer { private final String stackname; private boolean top; // to alternate between top and bottom of the file stack private final boolean fullram; - - public static class domaccess { - public long time; - public int count; - public String host; - public domaccess(String host) { - this.host = host; - this.time = System.currentTimeMillis(); - this.count = 0; - } - public void update() { - this.time = System.currentTimeMillis(); - this.count++; - } - public long flux(long range) { - return count >= 1000 ? 
range * Math.min(5000, count) / 1000 : range / (1000 - count); - } - } + private long minimumLocalDelta; + private long minimumGlobalDelta; - public Balancer(final File cachePath, final String stackname, final boolean fullram) { + public Balancer(final File cachePath, final String stackname, final boolean fullram, + final long minimumLocalDelta, final long minimumGlobalDelta) { this.cacheStacksPath = cachePath; this.stackname = stackname; final File stackFile = new File(cachePath, stackname + stackSuffix); @@ -92,6 +72,8 @@ public class Balancer { this.urlRAMStack = new ArrayList(); this.top = true; this.fullram = fullram; + this.minimumLocalDelta = minimumLocalDelta; + this.minimumGlobalDelta = minimumGlobalDelta; // create a stack for newly entered entries if (!(cachePath.exists())) cachePath.mkdir(); // make the path @@ -105,7 +87,7 @@ public class Balancer { byte[] hash; while (i.hasNext()) { hash = i.next(); - pushHash(new String(hash)); + pushHashToDomainStacks(new String(hash), true); } } catch (final IOException e) { e.printStackTrace(); @@ -113,8 +95,21 @@ public class Balancer { } } + public long getMinimumLocalDelta() { + return this.minimumLocalDelta; + } + + public long getMinimumGlobalDelta() { + return this.minimumGlobalDelta; + } + + public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) { + this.minimumLocalDelta = minimumLocalDelta; + this.minimumGlobalDelta = minimumGlobalDelta; + } + public synchronized void close() { - while (domainStacksNotEmpty()) flushOnceDomStacks(0, true); // flush to ram, because the ram flush is optimized + while (domainStacksNotEmpty()) flushOnceDomStacks(0, true, false); // flush to ram, because the ram flush is optimized size(); try { flushAllRamStack(); } catch (final IOException e) {} if (urlFileIndex != null) { @@ -284,7 +279,13 @@ public class Balancer { return sum; } - private void flushOnceDomStacks(final int minimumleft, final boolean ram) { + /** + * removes the head element 
of all domain stacks and moves the element in either the ram stack or the file stack + * @param minimumleft + * @param ram + * @param onlyReadyForAccess + */ + private void flushOnceDomStacks(final int minimumleft, final boolean ram, final boolean onlyReadyForAccess) { // takes one entry from every domain stack and puts it on the ram or file stack // the minimumleft value is a limit for the number of entries that should be left if (domainStacks.size() == 0) return; @@ -296,6 +297,7 @@ public class Balancer { entry = i.next(); list = entry.getValue(); if (list.size() > minimumleft) { + if (onlyReadyForAccess && CrawlEntry.waitingRemainingGuessed(list.getFirst(), minimumLocalDelta, minimumGlobalDelta) > 0) continue; if (ram) { urlRAMStack.add(list.removeFirst()); } else try { @@ -319,6 +321,47 @@ public class Balancer { urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{(urlRAMStack.get(urlRAMStack.size() / 2)).getBytes()})); } + private void shiftFileToDomStacks(final int wantedsize) { + int count = sizeDomainStacks() - wantedsize; + while ((urlFileStack != null) && (count > 0) && (urlFileStack.size() > 0)) { + // flush some entries from disc to ram stack + try { + // one from the top: + kelondroRow.Entry t = urlFileStack.pop(); + if (t == null) break; + pushHashToDomainStacks(new String(t.getColBytes(0)), false); + count--; + if (urlFileStack.size() == 0) break; + // one from the bottom: + t = urlFileStack.pot(); + if (t == null) break; + pushHashToDomainStacks(new String(t.getColBytes(0)), false); + count--; + } catch (final IOException e) { + break; + } + } + } + + private void shiftFileToRAM(final int wantedsize) { + while ((urlFileStack != null) && (urlRAMStack.size() <= wantedsize) && (urlFileStack.size() > 0)) { + // flush some entries from disc to ram stack + try { + // one from the top: + kelondroRow.Entry t = urlFileStack.pop(); + if (t == null) break; + urlRAMStack.add(new String(t.getColBytes(0))); + if (urlFileStack.size() == 0) break; + // one 
from the bottom: + t = urlFileStack.pot(); + if (t == null) break; + urlRAMStack.add(new String(t.getColBytes(0))); + } catch (final IOException e) { + break; + } + } + } + public synchronized void push(final CrawlEntry entry) throws IOException { assert entry != null; if (urlFileIndex.has(entry.url().hash().getBytes())) { @@ -330,10 +373,10 @@ public class Balancer { urlFileIndex.put(entry.toRow()); // add the hash to a queue - pushHash(entry.url().hash()); + pushHashToDomainStacks(entry.url().hash(), true); } - private void pushHash(final String hash) { + private void pushHashToDomainStacks(final String hash, boolean flush) { // extend domain stack final String dom = hash.substring(6); LinkedList domainList = domainStacks.get(dom); @@ -350,54 +393,91 @@ public class Balancer { } // check size of domainStacks and flush - if ((domainStacks.size() > 100) || (sizeDomainStacks() > 1000)) { - flushOnceDomStacks(1, urlRAMStack.size() < 100); // when the ram stack is small, flush it there + if (flush && (domainStacks.size() > 100) || (sizeDomainStacks() > 1000)) { + flushOnceDomStacks(1, urlRAMStack.size() < 100, true); // when the ram stack is small, flush it there } } - public synchronized CrawlEntry pop(final long minimumLocalDelta, final long minimumGlobalDelta, final long maximumAge) throws IOException { - // returns an url-hash from the stack and ensures minimum delta times + public synchronized CrawlEntry pop(boolean delay) throws IOException { + // returns a crawl entry from the stack and ensures minimum delta times // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack String result = null; // the result // 1st: check ramStack if (urlRAMStack.size() > 0) { - result = urlRAMStack.remove(0); + //result = urlRAMStack.remove(0); + Iterator i = urlRAMStack.iterator(); + String urlhash; + long waitingtime, min = Long.MAX_VALUE; + String besthash = null; + while (i.hasNext()) { + urlhash = i.next(); + waitingtime = 
CrawlEntry.waitingRemainingGuessed(urlhash, minimumLocalDelta, minimumGlobalDelta); + if (waitingtime == 0) { + // zero waiting is a good one + result = urlhash; + i.remove(); + min = Long.MAX_VALUE; // that causes that the if at the end of this loop is not used + besthash = null; + break; + } + if (waitingtime < min) { + min = waitingtime; + besthash = urlhash; + } + } + if (min <= 500 && besthash != null) { + // find that entry that was best and remove it + i = urlRAMStack.iterator(); + while (i.hasNext()) { + urlhash = i.next(); + if (urlhash.equals(besthash)) { + // this is the entry with the smallest waiting time + result = urlhash; + i.remove(); + break; + } + } + } } + // the next options use the domain stack. If this is not filled enough, they don't work at all + // so just fill them up with some stuff + if (result == null) shiftFileToDomStacks(1000); + // 2nd-a: check domainStacks for latest arrivals if ((result == null) && (domainStacks.size() > 0)) synchronized (domainStacks) { // we select specific domains that have not been used for a long time - // i.e. 60 seconds. 
Latest arrivals that have not yet been crawled - // fit also in that scheme + // Latest arrivals that have not yet been crawled fit also in that scheme final Iterator>> i = domainStacks.entrySet().iterator(); Map.Entry> entry; String domhash; - long delta, maxdelta = 0; - String maxhash = null; + long waitingtime, min = Long.MAX_VALUE; + String besthash = null; LinkedList domlist; while (i.hasNext()) { entry = i.next(); domhash = entry.getKey(); - delta = lastAccessDelta(domhash); - if (delta == Integer.MAX_VALUE) { - // a brand new domain - we take it + waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta); + if (waitingtime == 0) { + // zero waiting is a good one domlist = entry.getValue(); result = domlist.removeFirst(); if (domlist.size() == 0) i.remove(); + min = Long.MAX_VALUE; // that causes that the if at the end of this loop is not used + besthash = null; break; } - if (delta > maxdelta) { - maxdelta = delta; - maxhash = domhash; + if (waitingtime < min) { + min = waitingtime; + besthash = domhash; } } - if (maxdelta > maximumAge) { - // success - we found an entry from a domain that has not been used for a long time - domlist = domainStacks.get(maxhash); + if (min <= 500 && besthash != null) { + domlist = domainStacks.get(besthash); result = domlist.removeFirst(); - if (domlist.size() == 0) domainStacks.remove(maxhash); + if (domlist.size() == 0) domainStacks.remove(besthash); } } @@ -405,7 +485,7 @@ public class Balancer { if ((result == null) && (domainStacks.size() > 0)) synchronized (domainStacks) { // we order all domains by the number of entries per domain // then we iterate through these domains in descending entry order - // and that that one, that has a delta > minimumDelta + // and take that one, that has a zero waiting time final Iterator>> i = domainStacks.entrySet().iterator(); Map.Entry> entry; String domhash; @@ -422,13 +502,13 @@ public class Balancer { // now iterate in descending order and fetch 
that one, // that is acceptable by the minimumDelta constraint - long delta; + long waitingtime; String maxhash = null; while (hitlist.size() > 0) { domhash = hitlist.remove(hitlist.lastKey()); if (maxhash == null) maxhash = domhash; // remember first entry - delta = lastAccessDelta(domhash); - if (delta > ((yacyURL.isLocal(domhash)) ? minimumLocalDelta : minimumGlobalDelta)) { + waitingtime = CrawlEntry.waitingRemainingGuessed(domhash, minimumLocalDelta, minimumGlobalDelta); + if (waitingtime == 0) { domlist = domainStacks.get(domhash); result = domlist.removeFirst(); if (domlist.size() == 0) domainStacks.remove(domhash); @@ -457,14 +537,13 @@ public class Balancer { // check if the time after retrieval of last hash from same // domain is not shorter than the minimumDelta - long delta = lastAccessDelta(nexthash); - if (delta > ((yacyURL.isLocal(nexthash)) ? minimumLocalDelta : minimumGlobalDelta)) { + long waitingtime = CrawlEntry.waitingRemainingGuessed(nexthash, minimumLocalDelta, minimumGlobalDelta); + if (waitingtime == 0) { // the entry is fine result = new String((top) ? urlFileStack.pop().getColBytes(0) : urlFileStack.pot().getColBytes(0)); } else { // try other entry result = new String((top) ? 
urlFileStack.pot().getColBytes(0) : urlFileStack.pop().getColBytes(0)); - delta = lastAccessDelta(result); } } top = !top; // alternate top/bottom @@ -477,83 +556,59 @@ public class Balancer { } // finally: check minimumDelta and if necessary force a sleep - final long delta = lastAccessDelta(result); - assert delta >= 0: "delta = " + delta; final int s = urlFileIndex.size(); - final kelondroRow.Entry rowEntry = urlFileIndex.remove(result.getBytes()); - assert (rowEntry == null) || (urlFileIndex.size() + 1 == s) : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result; + kelondroRow.Entry rowEntry = urlFileIndex.remove(result.getBytes()); if (rowEntry == null) { - serverLog.logSevere("PLASMA BALANCER", "get() found a valid urlhash, but failed to fetch the corresponding url entry - total size = " + size() + ", fileStack.size() = " + urlFileStack.size() + ", ramStack.size() = " + urlRAMStack.size() + ", domainStacks.size() = " + domainStacks.size()); - return null; + throw new IOException("get() found a valid urlhash, but failed to fetch the corresponding url entry - total size = " + size() + ", fileStack.size() = " + urlFileStack.size() + ", ramStack.size() = " + urlRAMStack.size() + ", domainStacks.size() = " + domainStacks.size()); } assert urlFileIndex.size() + 1 == s : "urlFileIndex.size() = " + urlFileIndex.size() + ", s = " + s + ", result = " + result; - final CrawlEntry crawlEntry = new CrawlEntry(rowEntry); - final long genericDelta = ensureDelta(result.substring(6), crawlEntry, minimumLocalDelta, minimumGlobalDelta); - if (delta < genericDelta) { + final CrawlEntry crawlEntry = new CrawlEntry(rowEntry); + long sleeptime = crawlEntry.waitingRemaining(minimumLocalDelta, minimumGlobalDelta); + + if (delay && sleeptime > 0) { // force a busy waiting here // in best case, this should never happen if the balancer works propertly // this is only to protection against the worst case, where the crawler could // behave in a DoS-manner 
- final long sleeptime = genericDelta - delta; + serverLog.logInfo("PLASMA BALANCER", "forcing fetch delay of " + sleeptime + " millisecond for " + crawlEntry.url().getHost()); try {synchronized(this) { this.wait(sleeptime); }} catch (final InterruptedException e) {} } // update statistical data - domaccess lastAccess = domainAccess.get(result.substring(6)); - if (lastAccess == null) { - lastAccess = new domaccess(crawlEntry.url().getHost()); - domainAccess.put(result.substring(6), lastAccess); - } else { - lastAccess.update(); - } + crawlEntry.updateAccess(); return crawlEntry; } - - private long ensureDelta(String hosthash, CrawlEntry crawlEntry, final long minimumLocalDelta, final long minimumGlobalDelta) { - long deltaBase = (yacyURL.isLocal(hosthash)) ? minimumLocalDelta : minimumGlobalDelta; - if (crawlEntry.url().isCGI()) deltaBase = deltaBase * 2; // mostly there is a database access in the background which creates a lot of unwanted IO on target site - domaccess lastAccess = domainAccess.get(hosthash); - return Math.min( - 60000, - Math.max( - deltaBase + ((lastAccess == null) ? 0 : lastAccess.flux(deltaBase)), - plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(crawlEntry.url())) - ); // prevent that that robots file can stop our indexer completely - } - - private long lastAccessDelta(final String hash) { - assert hash != null; - final domaccess lastAccess = domainAccess.get((hash.length() > 6) ? 
hash.substring(6) : hash); - if (lastAccess == null) return Long.MAX_VALUE; // never accessed - return System.currentTimeMillis() - lastAccess.time; - } + /** + * return top-elements from the crawl stack + * we do not produce here more entries than exist on the stack + * because otherwise the balancing does not work properly + * @param count + * @return + * @throws IOException + */ public synchronized ArrayList top(int count) throws IOException { // if we need to flush anything, then flush the domain stack first, // to avoid that new urls get hidden by old entries from the file stack if (urlRAMStack == null) return null; + + // ensure that the domain stacks are filled enough + shiftFileToDomStacks(count); + + // flush from the domain stacks first until they are empty + if ((domainStacksNotEmpty()) && (urlRAMStack.size() <= count)) { + flushOnceDomStacks(0, true, true); + } while ((domainStacksNotEmpty()) && (urlRAMStack.size() <= count)) { // flush only that much as we need to display - flushOnceDomStacks(0, true); - } - while ((urlFileStack != null) && (urlRAMStack.size() <= count) && (urlFileStack.size() > 0)) { - // flush some entries from disc to ram stack - try { - // one from the top: - kelondroRow.Entry t = urlFileStack.pop(); - if (t == null) break; - urlRAMStack.add(new String(t.getColBytes(0))); - if (urlFileStack.size() == 0) break; - // one from the bottom: - t = urlFileStack.pot(); - if (t == null) break; - urlRAMStack.add(new String(t.getColBytes(0))); - } catch (final IOException e) { - break; - } + flushOnceDomStacks(0, true, false); } + // if the ram is still not full enough, use the file stack + shiftFileToRAM(count); + + // finally, construct a list using the urlRAMStack which was filled with this procedure count = Math.min(count, urlRAMStack.size()); final ArrayList list = new ArrayList(); for (int i = 0; i < count; i++) { diff --git a/source/de/anomic/crawler/CrawlEntry.java b/source/de/anomic/crawler/CrawlEntry.java index d8756a873..7cbae7968 
100755 --- a/source/de/anomic/crawler/CrawlEntry.java +++ b/source/de/anomic/crawler/CrawlEntry.java @@ -29,11 +29,13 @@ package de.anomic.crawler; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Date; +import java.util.concurrent.ConcurrentHashMap; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroRow; +import de.anomic.plasma.plasmaSwitchboard; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacyURL; @@ -60,6 +62,9 @@ public class CrawlEntry { 0 ); + // a shared domainAccess map for all balancers. the key is a domain-hash (6 bytes) + public static final ConcurrentHashMap domainAccess = new ConcurrentHashMap(); + private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; // if this is generated by a crawl, the own peer hash in entered private String refhash; // the url's referrer hash @@ -78,6 +83,27 @@ public class CrawlEntry { private String status; private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection + public static class domaccess { + public long time; + public long robotsMinDelay; + public int count; + public String host; + public domaccess(String host) { + this.host = host; + this.time = System.currentTimeMillis(); + this.robotsMinDelay = 0; + this.count = 0; + } + public void update() { + this.time = System.currentTimeMillis(); + this.count++; + } + public long flux(long range) { + return count >= 1000 ? 
range * Math.min(5000, count) / 1000 : range / (1000 - count); + } + } + + /** * @param initiator the hash of the initiator peer * @param url the {@link URL} to crawl @@ -261,4 +287,80 @@ public class CrawlEntry { // the handle of the crawl profile return this.profileHandle; } + + /** + * check a domain flag so it can be calculated when a domain was accessed the last time + */ + public void updateAccess() { + String domhash = url.hash().substring(6); + domaccess lastAccess = domainAccess.get(domhash); + if (lastAccess == null) { + lastAccess = new domaccess(url.getHost()); + domainAccess.put(domhash, lastAccess); + } else { + lastAccess.update(); + } + } + + /** + * calculates how long to wait until the domain can be accessed again + * this follows from given minimum access times, the fact that an url is a CGI url or not, the times that the domain was accessed + * and a given minimum access time as given in robots.txt + * @param minimumLocalDelta + * @param minimumGlobalDelta + * @return the remaining waiting time in milliseconds + */ + public long waitingRemaining(final long minimumLocalDelta, final long minimumGlobalDelta) { + final long delta = lastAccessDelta(this.url.hash()); + if (delta == Long.MAX_VALUE) return 0; + final boolean local = this.url.isLocal(); + long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta; + if (this.url.isCGI()) deltaBase = deltaBase * 2; // mostly there is a database access in the background which creates a lot of unwanted IO on target site + domaccess lastAccess = domainAccess.get(this.url.hash().substring(6)); + lastAccess.robotsMinDelay = (local) ? 0 : plasmaSwitchboard.getSwitchboard().robots.crawlDelayMillis(this.url); + final long genericDelta = Math.min( + 60000, + Math.max( + deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)), + (local || lastAccess == null) ? 
0 : lastAccess.robotsMinDelay) + ); // prevent the robots file from stopping our indexer completely + return (delta < genericDelta) ? genericDelta - delta : 0; + } + + /** + * guess a minimum waiting time + * the time is not correct, because if the domain was not checked yet by the robots.txt delay value, it is too low + * also the 'isCGI' property is missing, because the full text of the domain is unknown here + * @param urlhash + * @param minimumLocalDelta + * @param minimumGlobalDelta + * @return the remaining waiting time in milliseconds + */ + public static long waitingRemainingGuessed(String urlhash, final long minimumLocalDelta, final long minimumGlobalDelta) { + final long delta = lastAccessDelta(urlhash); + if (delta == Long.MAX_VALUE) return 0; + final boolean local = yacyURL.isLocal(urlhash); + long deltaBase = (local) ? minimumLocalDelta : minimumGlobalDelta; + domaccess lastAccess = domainAccess.get(urlhash.substring(6)); + final long genericDelta = Math.min( + 60000, + Math.max( + deltaBase + ((lastAccess == null || local) ? 0 : lastAccess.flux(deltaBase)), + (local || lastAccess == null) ? 0 : lastAccess.robotsMinDelay) + ); // prevent the robots file from stopping our indexer completely + return (delta < genericDelta) ? genericDelta - delta : 0; + } + + /** + * calculates the time since the last access of the domain as referenced by the url hash + * @param hash a url hash (12 characters) or a domain hash (6 characters) + * @return a time in milliseconds since last access of the domain or Long.MAX_VALUE if the domain was not accessed before + */ + private static long lastAccessDelta(final String hash) { + assert hash != null; + assert hash.length() == 6 || hash.length() == 12; + final domaccess lastAccess = domainAccess.get((hash.length() > 6) ? 
hash.substring(6) : hash); + if (lastAccess == null) return Long.MAX_VALUE; // never accessed + return System.currentTimeMillis() - lastAccess.time; + } } \ No newline at end of file diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 893855050..26e690fec 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -45,41 +45,30 @@ public class NoticedURL { private static final long minimumLocalDeltaInit = 0; // the minimum time difference between access of the same local domain private static final long minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain - private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt private Balancer coreStack; // links found by crawling to depth-1 private Balancer limitStack; // links found by crawling at target depth private Balancer remoteStack; // links from remote crawl orders - //private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1 - //private kelondroStack imageStack; // links pointing to image resources - //private kelondroStack movieStack; // links pointing to movie resources - //private kelondroStack musicStack; // links pointing to music resources - private long minimumLocalDelta; - private long minimumGlobalDelta; public NoticedURL(final File cachePath) { - this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false); - this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false); + this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", false, minimumLocalDeltaInit, minimumGlobalDeltaInit); + this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", false, minimumLocalDeltaInit, minimumGlobalDeltaInit); //overhangStack = new plasmaCrawlBalancer(overhangStackFile); - this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", false); - 
this.minimumLocalDelta = minimumLocalDeltaInit; - this.minimumGlobalDelta = minimumGlobalDeltaInit; + this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", false, minimumLocalDeltaInit, minimumGlobalDeltaInit); } public long getMinimumLocalDelta() { - return this.minimumLocalDelta; + return this.coreStack.getMinimumLocalDelta(); } public long getMinimumGlobalDelta() { - return this.minimumGlobalDelta; + return this.coreStack.getMinimumGlobalDelta(); } - public void setMinimumLocalDelta(final long newDelta) { - this.minimumLocalDelta = Math.max(minimumLocalDeltaInit, newDelta); - } - - public void setMinimumGlobalDelta(final long newDelta) { - this.minimumGlobalDelta = Math.max(minimumGlobalDeltaInit, newDelta); + public void setMinimumDelta(final long minimumLocalDelta, final long minimumGlobalDelta) { + this.coreStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); + this.limitStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); + this.remoteStack.setMinimumDelta(minimumLocalDelta, minimumGlobalDelta); } public void clear() { @@ -228,7 +217,7 @@ public class NoticedURL { CrawlEntry entry; synchronized (balancer) { while ((s = balancer.size()) > 0) { - entry = balancer.pop((delay) ? minimumLocalDelta : 0, (delay) ? minimumGlobalDelta : 0, maximumDomAge); + entry = balancer.pop(delay); if (entry == null) { if (s > balancer.size()) continue; final int aftersize = balancer.size(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 0c23ad359..8987f230c 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -461,8 +461,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch