refactoring

pull/1/head
orbiter 12 years ago
parent 8ac2e8c8c9
commit 0e8d752462

@ -42,8 +42,6 @@ import org.openjena.atlas.logging.Log;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
@ -51,7 +49,6 @@ import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.retrieval.Request;
@ -293,37 +290,6 @@ public class Balancer {
return map;
}
/**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* load a robots.txt to get the robots time.
* ATTENTION: this method causes that a robots.txt is loaded from the web which may cause a longer delay in execution.
* This shall therefore not be called in synchronized environments.
* @param robots
* @param profileEntry
* @param crawlURL
* @return
*/
private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
/**
* get lists of crawl request entries for a specific host
* @param host
@ -434,7 +400,7 @@ public class Balancer {
continue;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
@ -445,7 +411,7 @@ public class Balancer {
}
if (crawlEntry == null) return null;
ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here

@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
@ -262,6 +263,37 @@ public class Latency {
return s.toString();
}
/**
* Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access
* The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
* @param robots
* @param profileEntry
* @param crawlURL
* @return the sleep time in milliseconds; may be negative for no sleep time
*/
public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
if (profileEntry == null) return 0;
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
) ? Integer.MIN_VALUE : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* load a robots.txt to get the robots time.
* ATTENTION: this method causes that a robots.txt is loaded from the web which may cause a longer delay in execution.
* This shall therefore not be called in synchronized environments.
* @param robots
* @param profileEntry
* @param crawlURL
* @return
*/
public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
long sleeptime = waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime < 0 ? 0 : sleeptime;
}
public static final class Host {
private AtomicLong timeacc;
private AtomicLong lastacc;

Loading…
Cancel
Save