Use the supplied URL port to get robots.txt in the crawler's HostQueue

pull/46/head
reger 9 years ago
parent ed765de29b
commit 379e9b330d

@ -297,7 +297,7 @@ public class HostBalancer implements Balancer {
String s = i.next();
HostQueue hq = this.queues.get(s);
if (hq == null) {i.remove(); continue smallstacks;}
int delta = Latency.waitingRemainingGuessed(hq.getHost(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things
// to protect all small stacks which have a fast throughput, remove all with long waiting time
if (delta >= 1000) {i.remove(); continue smallstacks;}
@ -332,7 +332,7 @@ public class HostBalancer implements Balancer {
mixedstrategy: for (String h: this.roundRobinHostHashes) {
HostQueue hq = this.queues.get(h);
if (hq != null) {
int delta = Latency.waitingRemainingGuessed(hq.getHost(), h, robots, ClientIdentification.yacyInternetCrawlerAgent) / 200;
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), h, robots, ClientIdentification.yacyInternetCrawlerAgent) / 200;
if (delta < 0) delta = 0;
List<String> queueHashes = fastTree.get(delta);
if (queueHashes == null) {
@ -427,7 +427,7 @@ public class HostBalancer implements Balancer {
String s = i.next();
HostQueue hq = this.queues.get(s);
if (hq == null) {i.remove(); continue protectcheck;}
int delta = Latency.waitingRemainingGuessed(hq.getHost(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
if (delta >= 0) {i.remove();}
}
}
@ -488,7 +488,7 @@ public class HostBalancer implements Balancer {
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
for (HostQueue hq: this.queues.values()) try {
int delta = Latency.waitingRemainingGuessed(hq.getHost(), DigestURL.hosthash(hq.getHost(), hq.getPort()), robots, ClientIdentification.yacyInternetCrawlerAgent);
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), DigestURL.hosthash(hq.getHost(), hq.getPort()), robots, ClientIdentification.yacyInternetCrawlerAgent);
map.put(hq.getHost() + ":" + hq.getPort(), new Integer[]{hq.size(), delta});
} catch (MalformedURLException e) {
ConcurrentLog.logException(e);

@ -518,7 +518,7 @@ public class HostQueue implements Balancer {
@Override
public Map<String, Integer[]> getDomainStackHosts(RobotsTxt robots) {
Map<String, Integer[]> map = new TreeMap<String, Integer[]>();
int delta = Latency.waitingRemainingGuessed(this.hostName, this.hostHash, robots, ClientIdentification.yacyInternetCrawlerAgent);
int delta = Latency.waitingRemainingGuessed(this.hostName, this.port, this.hostHash, robots, ClientIdentification.yacyInternetCrawlerAgent);
map.put(this.hostName, new Integer[]{this.size(), delta});
return map;
}

@ -283,7 +283,7 @@ public class LegacyBalancer implements Balancer {
final String hostname = entry.getKey();
final HostHandles hosthandles = entry.getValue();
int size = hosthandles.handleSet.size();
int delta = Latency.waitingRemainingGuessed(hostname, hosthandles.hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
int delta = Latency.waitingRemainingGuessed(hostname, 80, hosthandles.hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
map.put(hostname, new Integer[]{size, delta});
}
return map;

@ -146,7 +146,7 @@ public class Latency {
* @return the remaining waiting time in milliseconds. The return value may be negative
* which expresses how long the time is over the minimum waiting time.
*/
public static int waitingRemainingGuessed(final String hostname, final String hosthash, final RobotsTxt robots, final ClientIdentification.Agent agent) {
public static int waitingRemainingGuessed(final String hostname, final int port, final String hosthash, final RobotsTxt robots, final ClientIdentification.Agent agent) {
// first check if the domain was _ever_ accessed before
final Host host = map.get(hosthash);
@ -171,7 +171,7 @@ public class Latency {
// find the delay as given by robots.txt on target site
if (robots != null) {
int robotsDelay = waitingRobots(hostname + ":80", robots, agent, false);
int robotsDelay = waitingRobots(hostname + ":" + port, robots, agent, false);
if (robotsDelay < 0) return -timeSinceLastAccess; // no limits if granted exclusively for this peer
waiting = Math.max(waiting, robotsDelay);
}

Loading…
Cancel
Save