Michael Peter Christen 4 years ago
commit 3078b74e1d

@ -133,7 +133,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window.
#{list}# #{list}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#"> <tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td >#[hash]#</td> <td >#[hash]#</td>
<td #(special)#::class="TableCellActive"#(/special)#>#[shortname]##(ssl)#::<a href="https://#[ip]#:#[portssl]#/Network.html?page=1" class="forceNoExternalIcon" target="_blank"><img src="env/grafics/lock.gif" width="11" height="15" title="https supported" alt="https supported" /></a>#(/ssl)#</td> <td #(special)#::class="TableCellActive"#(/special)#>#[shortname]##(ssl)#::<a href="https://#[ip]#:#[portssl]#/Network.html?page=1" class="forceNoExternalIcon" target="_blank"><img src="env/grafics/lockclose.png" width="11" height="15" title="https supported" alt="https supported" /></a>#(/ssl)#</td>
<td nowrap> <td nowrap>
#(type)##(direct)#<img src="env/grafics/JuniorPassive.gif" width="11" height="11" title="Type: Junior | Contact: passive" alt="Junior passive" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior | Contact: direct" alt="Junior direct" />::<img src="env/grafics/JuniorOffline.gif" width="11" height="11" title="Type: Junior | Contact: offline" alt="Junior offline" />#(/direct)#::#(direct)#<img src="env/grafics/SeniorPassive.gif" width="11" height="11" title="Type: Senior | Contact: passive" alt="senior passive" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior | Contact: direct" alt="Senior direct" />::<img src="env/grafics/SeniorOffline.gif" width="11" height="11" title="Type: Senior | Contact: offline" alt="Senior offline" />#(/direct)#::<a href="#[url]#" class="forceNoExternalIcon">#(direct)#<img src="env/grafics/PrincipalPassive.gif" width="11" height="11" title="Type: Principal | Contact: passive | Seed download: possible" alt="Principal passive" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal | Contact: direct | Seed download: possible" alt="Principal active" />::<img src="env/grafics/PrincipalOffline.gif" width="11" height="11" title="Type: Principal | Contact: offline | Seed download: ?" alt="Principal offline" />#(/direct)#</a>#(/type)##(acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />::<img src="env/grafics/CrawlYesOffline.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />#(/acceptcrawl)##(dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no; #[peertags]#" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />::<img src="env/grafics/DHTReceiveYesOffline.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />#(/dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td> #(type)##(direct)#<img src="env/grafics/JuniorPassive.gif" width="11" height="11" title="Type: Junior | Contact: passive" alt="Junior passive" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior | Contact: direct" alt="Junior direct" />::<img src="env/grafics/JuniorOffline.gif" width="11" height="11" title="Type: Junior | Contact: offline" alt="Junior offline" />#(/direct)#::#(direct)#<img src="env/grafics/SeniorPassive.gif" width="11" height="11" title="Type: Senior | Contact: passive" alt="senior passive" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior | Contact: direct" alt="Senior direct" />::<img src="env/grafics/SeniorOffline.gif" width="11" height="11" title="Type: Senior | Contact: offline" alt="Senior offline" />#(/direct)#::<a href="#[url]#" class="forceNoExternalIcon">#(direct)#<img src="env/grafics/PrincipalPassive.gif" width="11" height="11" title="Type: Principal | Contact: passive | Seed download: possible" alt="Principal passive" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal | Contact: direct | Seed download: possible" alt="Principal active" />::<img src="env/grafics/PrincipalOffline.gif" width="11" height="11" title="Type: Principal | Contact: offline | Seed download: ?" alt="Principal offline" />#(/direct)#</a>#(/type)##(acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />::<img src="env/grafics/CrawlYesOffline.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />#(/acceptcrawl)##(dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no; #[peertags]#" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />::<img src="env/grafics/DHTReceiveYesOffline.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />#(/dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td>
<td align="right">#[version]#</td> <td align="right">#[version]#</td>
@ -249,7 +249,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window.
<td>QPH<br/>(remote)</td> <td>QPH<br/>(remote)</td>
</tr> </tr>
<tr class="TableCellLight"> <tr class="TableCellLight">
<td>#[my-name]##(my-ssl)#::<img src="env/grafics/lock.gif" width="11" height="15" title="https supported" alt="https supported" />#(/my-ssl)#</td> <td>#[my-name]##(my-ssl)#::<img src="env/grafics/lockclose.png" width="11" height="15" title="https supported" alt="https supported" />#(/my-ssl)#</td>
<td nowrap>#(my-info)#<img src="env/grafics/Virgin.gif" width="11" height="11" title="Type: Virgin" alt="Virgin" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior" alt="Junior" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior" alt="Senior" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal" alt="Principal" />#(/my-info)##(my-acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="Crawl enabled" />#(/my-acceptcrawl)##(my-dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT Receive enabled" />#(/my-dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td> <td nowrap>#(my-info)#<img src="env/grafics/Virgin.gif" width="11" height="11" title="Type: Virgin" alt="Virgin" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior" alt="Junior" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior" alt="Senior" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal" alt="Principal" />#(/my-info)##(my-acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="Crawl enabled" />#(/my-acceptcrawl)##(my-dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT Receive enabled" />#(/my-dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td>
<td align="right">#[my-version]#</td> <td align="right">#[my-version]#</td>
<td align="right">#[my-utc]#</td> <td align="right">#[my-utc]#</td>

@ -134,7 +134,7 @@
You can download a more recent version of YaCy. Click here to install this update and restart YaCy: You can download a more recent version of YaCy. Click here to install this update and restart YaCy:
<form action="Status.html" method="get" class="PeerControl" accept-charset="UTF-8"> <form action="Status.html" method="get" class="PeerControl" accept-charset="UTF-8">
<button type="submit" name="aquirerelease" class="btn btn-primary" value="Update YaCy"> <button type="submit" name="aquirerelease" class="btn btn-primary" value="Update YaCy">
<img src="env/grafics/lock.gif" alt="lock icon"/> <img src="env/grafics/lockclose.png" alt="lock icon"/>
Install YaCy v#[latestVersion]# Install YaCy v#[latestVersion]#
</button> </button>
</form> </form>

@ -57,12 +57,12 @@
<form action="Surftips.html" method="get" class="PeerControl" accept-charset="UTF-8"><div> <form action="Surftips.html" method="get" class="PeerControl" accept-charset="UTF-8"><div>
#(publicSurftips)# #(publicSurftips)#
<button type="submit" name="publicPage" class="btn btn-primary" value="1"> <button type="submit" name="publicPage" class="btn btn-primary" value="1">
<img src="env/grafics/lock.gif" alt="authentication required" /> <img src="env/grafics/lockclose.png" alt="authentication required" />
Show surftips to everyone Show surftips to everyone
</button> </button>
:: ::
<button type="submit" name="publicPage" class="btn btn-primary" value="0"> <button type="submit" name="publicPage" class="btn btn-primary" value="0">
<img src="env/grafics/lock.gif" alt="authentication required" /> <img src="env/grafics/lockclose.png" alt="authentication required" />
Hide surftips for users without autorization Hide surftips for users without autorization
</button> </button>
#(/publicSurftips)# #(/publicSurftips)#

@ -331,14 +331,21 @@ public class HostBalancer implements Balancer {
if (size <= 10) {smallStacksExist = true; break smallsearch;} if (size <= 10) {smallStacksExist = true; break smallsearch;}
} }
} }
if (singletonStacksExist || smallStacksExist) { Set<String> freshhosts = new HashSet<>();
Iterator<String> i = this.roundRobinHostHashes.iterator(); Iterator<String> i = this.roundRobinHostHashes.iterator();
smallstacks: while (i.hasNext()) { smallstacks: while (i.hasNext()) {
if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left
String s = i.next(); String hosthash = i.next();
HostQueue hq = this.queues.get(s); HostQueue hq = this.queues.get(hosthash);
if (hq == null) {i.remove(); continue smallstacks;} if (hq == null) {i.remove(); continue smallstacks;}
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), s, robots, ClientIdentification.yacyInternetCrawlerAgent); int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
if (delta == Integer.MIN_VALUE) {
// never-crawled hosts; we do not want to have too many of them in here. Loading new hosts means: waiting for robots.txt to load
freshhosts.add(hosthash);
i.remove();
continue smallstacks;
}
if (singletonStacksExist || smallStacksExist) {
if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things
// to protect all small stacks which have a fast throughput, remove all with long waiting time // to protect all small stacks which have a fast throughput, remove all with long waiting time
if (delta >= 1000) {i.remove(); continue smallstacks;} if (delta >= 1000) {i.remove(); continue smallstacks;}
@ -350,6 +357,10 @@ public class HostBalancer implements Balancer {
} }
} }
} }
// put at least one of the fresh hosts back
if (freshhosts.size() > 0) this.roundRobinHostHashes.add(freshhosts.iterator().next());
// result
if (this.roundRobinHostHashes.size() == 1) { if (this.roundRobinHostHashes.size() == 1) {
if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host"); if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host");
} else { } else {

@ -98,14 +98,13 @@ public final class HTTPLoader {
* @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream. * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
* @throws IOException when an error occurred * @throws IOException when an error occurred
*/ */
public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount, public StreamResponse openInputStream(
final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) final Request request, CrawlProfile profile, final int retryCount,
throws IOException { final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
) throws IOException {
if (retryCount < 0) { if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
throw new IOException(
"retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
} }
DigestURL url = request.url(); DigestURL url = request.url();
@ -158,8 +157,7 @@ public final class HTTPLoader {
if (statusCode > 299 && statusCode < 310) { if (statusCode > 299 && statusCode < 310) {
client.finish(); client.finish();
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);
responseHeader, requestURLString);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just // we have two use cases here: loading from a crawl or just
@ -196,15 +194,20 @@ public final class HTTPLoader {
"CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
} }
// check if the redirected URL is the same as the requested URL
// this shortcuts a time-out using retryCount
if (redirectionUrl.equals(url)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
}
// retry crawling with new url // retry crawling with new url
request.redirectURL(redirectionUrl); request.redirectURL(redirectionUrl);
return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
} }
// we don't want to follow redirects // we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$");
} else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
// the transfer is ok // the transfer is ok
@ -397,8 +400,6 @@ public final class HTTPLoader {
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason); throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
} }
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
} }
// if we are already doing a shutdown we don't need to retry crawling // if we are already doing a shutdown we don't need to retry crawling

@ -399,7 +399,7 @@ public final class LoaderDispatcher {
// load resource from the internet // load resource from the internet
StreamResponse response; StreamResponse response;
if (protocol.equals("http") || protocol.equals("https")) { if (protocol.equals("http") || protocol.equals("https")) {
response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent); response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) { } else if (protocol.equals("ftp")) {
response = this.ftpLoader.openInputStream(request, true); response = this.ftpLoader.openInputStream(request, true);
} else if (protocol.equals("smb")) { } else if (protocol.equals("smb")) {

Loading…
Cancel
Save