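Use shorter minimum delay values for intranet crawl targets (minimumLocalDelta = 100 for local domains vs. minimumGlobalDelta = 500 for global ones); loopback addresses now also count as local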

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4047 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent f890cc86aa
commit 344911bfaa

@ -58,6 +58,7 @@ import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack; import de.anomic.kelondro.kelondroStack;
import de.anomic.kelondro.kelondroAbstractRecords; import de.anomic.kelondro.kelondroAbstractRecords;
import de.anomic.server.serverDomains;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
@ -297,7 +298,7 @@ public class plasmaCrawlBalancer {
} }
} }
public synchronized plasmaCrawlEntry pop(long minimumDelta, long maximumAge) throws IOException { public synchronized plasmaCrawlEntry pop(long minimumLocalDelta, long minimumGlobalDelta, long maximumAge) throws IOException {
// returns an url-hash from the stack and ensures minimum delta times // returns an url-hash from the stack and ensures minimum delta times
// we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack
@ -370,7 +371,7 @@ public class plasmaCrawlBalancer {
domhash = (String) hitlist.remove(hitlist.lastKey()); domhash = (String) hitlist.remove(hitlist.lastKey());
if (maxhash == null) maxhash = domhash; // remember first entry if (maxhash == null) maxhash = domhash; // remember first entry
delta = lastAccessDelta(domhash); delta = lastAccessDelta(domhash);
if (delta > minimumDelta) { if (delta > minimumGlobalDelta) {
domlist = (LinkedList) domainStacks.get(domhash); domlist = (LinkedList) domainStacks.get(domhash);
result = (String) domlist.removeFirst(); result = (String) domlist.removeFirst();
if (domlist.size() == 0) domainStacks.remove(domhash); if (domlist.size() == 0) domainStacks.remove(domhash);
@ -400,7 +401,7 @@ public class plasmaCrawlBalancer {
// check if the time after retrieval of last hash from same // check if the time after retrieval of last hash from same
// domain is not shorter than the minimumDelta // domain is not shorter than the minimumDelta
long delta = lastAccessDelta(nexthash); long delta = lastAccessDelta(nexthash);
if (delta > minimumDelta) { if (delta > minimumGlobalDelta) {
// the entry is fine // the entry is fine
result = new String((top) ? urlFileStack.pop().getColBytes(0) : urlFileStack.pot().getColBytes(0)); result = new String((top) ? urlFileStack.pop().getColBytes(0) : urlFileStack.pot().getColBytes(0));
} else { } else {
@ -429,6 +430,7 @@ public class plasmaCrawlBalancer {
return null; return null;
} }
plasmaCrawlEntry crawlEntry = new plasmaCrawlEntry(rowEntry); plasmaCrawlEntry crawlEntry = new plasmaCrawlEntry(rowEntry);
long minimumDelta = (serverDomains.isLocal(crawlEntry.url())) ? minimumLocalDelta : minimumGlobalDelta;
plasmaCrawlRobotsTxt.Entry robotsEntry = plasmaSwitchboard.robots.getEntry(crawlEntry.url().getHost()); plasmaCrawlRobotsTxt.Entry robotsEntry = plasmaSwitchboard.robots.getEntry(crawlEntry.url().getHost());
Integer hostDelay = (robotsEntry == null) ? null : robotsEntry.getCrawlDelay(); Integer hostDelay = (robotsEntry == null) ? null : robotsEntry.getCrawlDelay();
long genericDelta = ((robotsEntry == null) || (hostDelay == null)) ? minimumDelta : Math.max(minimumDelta, hostDelay.intValue() * 1000); long genericDelta = ((robotsEntry == null) || (hostDelay == null)) ? minimumDelta : Math.max(minimumDelta, hostDelay.intValue() * 1000);

@ -60,7 +60,8 @@ public class plasmaCrawlNURL {
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack public static final int STACK_TYPE_MUSIC = 13; // put on music stack
private static final long minimumDelta = 500; // the minimum time difference between access of the same domain private static final long minimumLocalDelta = 100; // the minimum time difference between access of the same local domain
private static final long minimumGlobalDelta = 500; // the minimum time difference between access of the same global domain
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
@ -188,7 +189,7 @@ public class plasmaCrawlNURL {
plasmaCrawlEntry entry; plasmaCrawlEntry entry;
synchronized (balancer) { synchronized (balancer) {
while ((s = balancer.size()) > 0) { while ((s = balancer.size()) > 0) {
entry = balancer.pop(minimumDelta, maximumDomAge); entry = balancer.pop(minimumLocalDelta, minimumGlobalDelta, maximumDomAge);
if (entry == null) { if (entry == null) {
if (s > balancer.size()) continue; if (s > balancer.size()) continue;
int aftersize = balancer.size(); int aftersize = balancer.size();

@ -197,9 +197,9 @@ public class serverDomains {
} }
// checks for local/global IP range and local IP // checks for local/global IP range and local IP
public static boolean isLocal(URL url) { public static boolean isLocal(URL url) {
return dnsResolve(url.getHost()).isSiteLocalAddress(); InetAddress hostAddress = dnsResolve(url.getHost());
return hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
} }
private static InetAddress[] localAddresses = null; private static InetAddress[] localAddresses = null;

Loading…
Cancel
Save