@@ -42,8 +42,6 @@ import org.openjena.atlas.logging.Log;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
-import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
@@ -51,7 +49,6 @@ import net.yacy.cora.sorting.OrderedScoreMap;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
 import net.yacy.crawler.retrieval.Request;
@@ -293,37 +290,6 @@ public class Balancer {
         return map;
     }
 
-    /**
-     * Get the minimum sleep time for a given url. The result can also be negative to reflect the time since the last access.
-     * The time can be as low as Integer.MIN_VALUE to show that there should not be any limitation at all.
-     * @param robots
-     * @param profileEntry
-     * @param crawlURL
-     * @return the sleep time in milliseconds; may be negative for no sleep time
-     */
-    private long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
-        if (profileEntry == null) return 0;
-        long sleeptime = (
-                profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
-                (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
-                ) ? Integer.MIN_VALUE : Latency.waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-        return sleeptime;
-    }
-
-    /**
-     * load a robots.txt to get the robots time.
-     * ATTENTION: this method causes that a robots.txt is loaded from the web which may cause a longer delay in execution.
-     * This shall therefore not be called in synchronized environments.
-     * @param robots
-     * @param profileEntry
-     * @param crawlURL
-     * @return
-     */
-    private long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, ClientIdentification.Agent agent) {
-        long sleeptime = Latency.waitingRobots(crawlURL, robots, agent); // this uses the robots.txt database and may cause a loading of robots.txt from the server
-        return sleeptime < 0 ? 0 : sleeptime;
-    }
-
     /**
      * get lists of crawl request entries for a specific host
      * @param host
@@ -434,7 +400,7 @@ public class Balancer {
                 continue;
             }
             // depending on the caching policy we need sleep time to avoid DoS-like situations
-            sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
+            sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
             assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
             assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
@@ -445,7 +411,7 @@ public class Balancer {
         }
         if (crawlEntry == null) return null;
         ClientIdentification.Agent agent = profileEntry == null ? ClientIdentification.yacyInternetCrawlerAgent : profileEntry.getAgent();
-        long robotsTime = getRobotsTime(robots, crawlEntry.url(), agent);
+        long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
         Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
         if (delay && sleeptime > 0) {
             // force a busy waiting here
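
For reference: this change touches only the Balancer side, deleting the two private
helpers and repointing their call sites at Latency. The Latency.java half of the move
is not part of this diff; what follows is a minimal sketch of what the relocated
methods presumably look like there, assuming the bodies were transplanted verbatim as
public static methods (visibility and placement inside Latency are assumptions, not
shown by these hunks). Note that inside Latency the formerly qualified calls
Latency.waitingRemaining and Latency.waitingRobots become plain waitingRemaining and
waitingRobots, and Latency would need the CacheStrategy and Cache imports that the
first two hunks drop from Balancer.

    // Latency.java (sketch, not part of this diff)
    public static long getDomainSleepTime(final RobotsTxt robots, final CrawlProfile profileEntry, final DigestURL crawlURL) {
        if (profileEntry == null) return 0;
        // CACHEONLY requests, and IFEXIST requests whose target is already cached,
        // never touch the remote host, so no politeness delay applies
        // (Integer.MIN_VALUE signals "no limitation at all")
        return (profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
                (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash())))
                ? Integer.MIN_VALUE
                : waitingRemaining(crawlURL, robots, profileEntry.getAgent()); // may load robots.txt from the server
    }

    public static long getRobotsTime(final RobotsTxt robots, final DigestURL crawlURL, final ClientIdentification.Agent agent) {
        // waitingRobots may fetch robots.txt from the web; clamp negative results to 0
        final long sleeptime = waitingRobots(crawlURL, robots, agent);
        return sleeptime < 0 ? 0 : sleeptime;
    }

Keeping this sleep-time policy next to waitingRemaining and waitingRobots concentrates
all crawl-delay logic in one class, which is exactly what lets Balancer shed its
dependencies on CacheStrategy and Cache in the import hunks above.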