Michael Peter Christen 4 years ago
commit 3078b74e1d

@ -133,7 +133,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window.
#{list}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td >#[hash]#</td>
<td #(special)#::class="TableCellActive"#(/special)#>#[shortname]##(ssl)#::<a href="https://#[ip]#:#[portssl]#/Network.html?page=1" class="forceNoExternalIcon" target="_blank"><img src="env/grafics/lock.gif" width="11" height="15" title="https supported" alt="https supported" /></a>#(/ssl)#</td>
<td #(special)#::class="TableCellActive"#(/special)#>#[shortname]##(ssl)#::<a href="https://#[ip]#:#[portssl]#/Network.html?page=1" class="forceNoExternalIcon" target="_blank"><img src="env/grafics/lockclose.png" width="11" height="15" title="https supported" alt="https supported" /></a>#(/ssl)#</td>
<td nowrap>
#(type)##(direct)#<img src="env/grafics/JuniorPassive.gif" width="11" height="11" title="Type: Junior | Contact: passive" alt="Junior passive" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior | Contact: direct" alt="Junior direct" />::<img src="env/grafics/JuniorOffline.gif" width="11" height="11" title="Type: Junior | Contact: offline" alt="Junior offline" />#(/direct)#::#(direct)#<img src="env/grafics/SeniorPassive.gif" width="11" height="11" title="Type: Senior | Contact: passive" alt="senior passive" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior | Contact: direct" alt="Senior direct" />::<img src="env/grafics/SeniorOffline.gif" width="11" height="11" title="Type: Senior | Contact: offline" alt="Senior offline" />#(/direct)#::<a href="#[url]#" class="forceNoExternalIcon">#(direct)#<img src="env/grafics/PrincipalPassive.gif" width="11" height="11" title="Type: Principal | Contact: passive | Seed download: possible" alt="Principal passive" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal | Contact: direct | Seed download: possible" alt="Principal active" />::<img src="env/grafics/PrincipalOffline.gif" width="11" height="11" title="Type: Principal | Contact: offline | Seed download: ?" alt="Principal offline" />#(/direct)#</a>#(/type)##(acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />::<img src="env/grafics/CrawlYesOffline.gif" width="11" height="11" title="Accept Crawl: yes" alt="crawl possible" />#(/acceptcrawl)##(dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no; #[peertags]#" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />::<img src="env/grafics/DHTReceiveYesOffline.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT receive enabled" />#(/dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td>
<td align="right">#[version]#</td>
@ -249,7 +249,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window.
<td>QPH<br/>(remote)</td>
</tr>
<tr class="TableCellLight">
<td>#[my-name]##(my-ssl)#::<img src="env/grafics/lock.gif" width="11" height="15" title="https supported" alt="https supported" />#(/my-ssl)#</td>
<td>#[my-name]##(my-ssl)#::<img src="env/grafics/lockclose.png" width="11" height="15" title="https supported" alt="https supported" />#(/my-ssl)#</td>
<td nowrap>#(my-info)#<img src="env/grafics/Virgin.gif" width="11" height="11" title="Type: Virgin" alt="Virgin" />::<img src="env/grafics/JuniorDirect.gif" width="11" height="11" title="Type: Junior" alt="Junior" />::<img src="env/grafics/SeniorDirect.gif" width="11" height="11" title="Type: Senior" alt="Senior" />::<img src="env/grafics/PrincipalDirect.gif" width="11" height="11" title="Type: Principal" alt="Principal" />#(/my-info)##(my-acceptcrawl)#<img src="env/grafics/CrawlNo.gif" width="11" height="11" title="Accept Crawl: no" alt="no crawl" />::<img src="env/grafics/CrawlYes.gif" width="11" height="11" title="Accept Crawl: yes" alt="Crawl enabled" />#(/my-acceptcrawl)##(my-dhtreceive)#<img src="env/grafics/DHTReceiveNo.gif" width="11" height="11" title="DHT Receive: no" alt="no DHT receive" />::<img src="env/grafics/DHTReceiveYes.gif" width="11" height="11" title="DHT Receive: yes" alt="DHT Receive enabled" />#(/my-dhtreceive)##{ips}#<a href="#(c)#http://#(ipv6)#::[#(/ipv6)##[ip]##(ipv6)#::]#(/ipv6)#:#[port]#/Network.html?page=1::Network.html?page=4&amp;peerHash=#[hash]#&amp;peerIP=#[ip]#&amp;peerPort=#[port]#&amp;addPeer=add+Peer#(/c)#"#(c)#:: target="_blank"#(/c)#class="forceNoExternalIcon"><img src="env/grafics/#(nodestate)#NodeDisqualified::NodeQualified#(/nodestate)##(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)#.gif" width="11" height="11" title="#(c)##(ipv6)#IPv4::IPv6#(/ipv6)#::#(ipv6)#IPv4::IPv6#(/ipv6)##(/c)# #(nodestate)#Peer::Node Peer#(/nodestate)#" /></a>#{/ips}#</td>
<td align="right">#[my-version]#</td>
<td align="right">#[my-utc]#</td>

@ -134,7 +134,7 @@
You can download a more recent version of YaCy. Click here to install this update and restart YaCy:
<form action="Status.html" method="get" class="PeerControl" accept-charset="UTF-8">
<button type="submit" name="aquirerelease" class="btn btn-primary" value="Update YaCy">
<img src="env/grafics/lock.gif" alt="lock icon"/>
<img src="env/grafics/lockclose.png" alt="lock icon"/>
Install YaCy v#[latestVersion]#
</button>
</form>

@ -57,12 +57,12 @@
<form action="Surftips.html" method="get" class="PeerControl" accept-charset="UTF-8"><div>
#(publicSurftips)#
<button type="submit" name="publicPage" class="btn btn-primary" value="1">
<img src="env/grafics/lock.gif" alt="authentication required" />
<img src="env/grafics/lockclose.png" alt="authentication required" />
Show surftips to everyone
</button>
::
<button type="submit" name="publicPage" class="btn btn-primary" value="0">
<img src="env/grafics/lock.gif" alt="authentication required" />
<img src="env/grafics/lockclose.png" alt="authentication required" />
Hide surftips for users without authorization
</button>
#(/publicSurftips)#

@ -65,7 +65,7 @@ public class HostBalancer implements Balancer {
private final static ConcurrentLog log = new ConcurrentLog("HostBalancer");
public final static HandleMap depthCache = new RowHandleMap(Word.commonHashLength, Word.commonHashOrder, 2, 8 * 1024 * 1024, "HostBalancer.DepthCache");
private final File hostsPath;
private final boolean exceed134217727;
private final Map<String, HostQueue> queues;
@ -84,7 +84,7 @@ public class HostBalancer implements Balancer {
final boolean exceed134217727) {
this(hostsPath, onDemandLimit, exceed134217727, true);
}
/**
* Create a new instance and fill the queue by scanning the hostsPath directory.
* @param hostsPath
@ -100,7 +100,7 @@ public class HostBalancer implements Balancer {
this.hostsPath = hostsPath;
this.onDemandLimit = onDemandLimit;
this.exceed134217727 = exceed134217727;
// create a stack for newly entered entries
if (!(hostsPath.exists())) hostsPath.mkdirs(); // make the path
this.queues = new ConcurrentHashMap<String, HostQueue>();
@ -114,7 +114,7 @@ public class HostBalancer implements Balancer {
* return immediately (as large unfinished crawls may take longer to load)
*/
private void init(final boolean async) {
if(async) {
Thread t = new Thread("HostBalancer.init") {
@Override
public void run() {
@ -122,10 +122,10 @@ public class HostBalancer implements Balancer {
}
};
t.start();
} else {
runInit();
}
}
/**
@ -185,7 +185,7 @@ public class HostBalancer implements Balancer {
}
return c;
}
/**
* delete all urls which are stored for given host hashes
* @param hosthashes
@ -230,11 +230,11 @@ public class HostBalancer implements Balancer {
return c;
}
/**
* @return true when the URL is queued in this or any other HostBalancer
* instance (as {@link #depthCache} is shared between all HostBalancer
* instances)
*/
@Override
public boolean has(final byte[] urlhashb) {
if (depthCache.has(urlhashb)) return true;
@ -313,7 +313,7 @@ public class HostBalancer implements Balancer {
tryagain: while (true) try {
HostQueue rhq = null;
String rhh = null;
synchronized (this) {
if (this.roundRobinHostHashes.size() == 0) {
// refresh the round-robin cache
@ -331,14 +331,21 @@ public class HostBalancer implements Balancer {
if (size <= 10) {smallStacksExist = true; break smallsearch;}
}
}
if (singletonStacksExist || smallStacksExist) {
Iterator<String> i = this.roundRobinHostHashes.iterator();
smallstacks: while (i.hasNext()) {
if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left
String s = i.next();
HostQueue hq = this.queues.get(s);
if (hq == null) {i.remove(); continue smallstacks;}
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
Set<String> freshhosts = new HashSet<>();
Iterator<String> i = this.roundRobinHostHashes.iterator();
smallstacks: while (i.hasNext()) {
if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left
String hosthash = i.next();
HostQueue hq = this.queues.get(hosthash);
if (hq == null) {i.remove(); continue smallstacks;}
int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent);
if (delta == Integer.MIN_VALUE) {
// never-crawled hosts: we do not want too many of them in here, since loading a new host means waiting for its robots.txt to load
freshhosts.add(hosthash);
i.remove();
continue smallstacks;
}
if (singletonStacksExist || smallStacksExist) {
if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things
// to protect all small stacks which have a fast throughput, remove all with long waiting time
if (delta >= 1000) {i.remove(); continue smallstacks;}
@ -350,6 +357,10 @@ public class HostBalancer implements Balancer {
}
}
}
// put at least one of the fresh hosts back
if (freshhosts.size() > 0) this.roundRobinHostHashes.add(freshhosts.iterator().next());
// result
if (this.roundRobinHostHashes.size() == 1) {
if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host");
} else {
@ -357,13 +368,13 @@ public class HostBalancer implements Balancer {
}
}
if (this.roundRobinHostHashes.size() == 0) return null;
// if the queue size is 1, just take that
if (this.roundRobinHostHashes.size() == 1) {
rhh = this.roundRobinHostHashes.iterator().next();
rhq = this.queues.get(rhh);
}
if (rhq == null) {
// mixed minimum sleep time / largest queue strategy:
// create a map of sleep time / queue relations with a fuzzy sleep time (ms / 500).
@ -449,7 +460,7 @@ public class HostBalancer implements Balancer {
}
*/
}
if (rhq == null) {
this.roundRobinHostHashes.clear(); // force re-initialization
continue tryagain;
@ -458,7 +469,7 @@ public class HostBalancer implements Balancer {
long timestamp = System.currentTimeMillis();
Request request = rhq.pop(delay, cs, robots); // this pop is outside of synchronization to prevent blocking of pushes
long actualwaiting = System.currentTimeMillis() - timestamp;
if (actualwaiting > 1000) {
synchronized (this) {
// to prevent that this occurs again, remove all stacks with positive delay times (which may be less after that waiting)
@ -473,7 +484,7 @@ public class HostBalancer implements Balancer {
}
}
}
if (rhq.isEmpty()) {
synchronized (this) {
this.queues.remove(rhh);
@ -545,7 +556,7 @@ public class HostBalancer implements Balancer {
@Override
public List<Request> getDomainStackReferences(String host, int maxcount, long maxtime) {
if (host == null) {
return Collections.emptyList();
}
try {
HostQueue hq = this.queues.get(DigestURL.hosthash(host, host.startsWith("ftp.") ? 21 : 80));

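The HostBalancer change defers never-crawled hosts during the round-robin refresh: Latency.waitingRemainingGuessed() returns Integer.MIN_VALUE for a host that was never contacted, and loading such a host first means waiting for its robots.txt, so letting fresh hosts crowd the round-robin set would stall dequeuing. Only one fresh host is re-admitted per refresh. A minimal standalone sketch of that policy, with a plain map standing in for Latency and the YaCy queue types:

import java.util.*;

// Sketch of the fresh-host deferral in HostBalancer.get(): hosts with an
// unknown waiting time (never crawled, robots.txt not loaded yet) are parked
// aside during the round-robin refresh, and only one is re-admitted per pass.
public class FreshHostDeferralSketch {

    // stand-in for Latency.waitingRemainingGuessed(); Integer.MIN_VALUE marks
    // a host that has never been contacted before
    static int guessedWaitingMs(Map<String, Integer> latencies, String hosthash) {
        return latencies.getOrDefault(hosthash, Integer.MIN_VALUE);
    }

    static void refreshRoundRobin(Set<String> roundRobinHostHashes, Map<String, Integer> latencies) {
        Set<String> freshhosts = new HashSet<>();
        Iterator<String> i = roundRobinHostHashes.iterator();
        while (i.hasNext()) {
            if (roundRobinHostHashes.size() <= 10) break; // don't shrink the set below a working minimum
            String hosthash = i.next();
            int delta = guessedWaitingMs(latencies, hosthash);
            if (delta == Integer.MIN_VALUE) {
                freshhosts.add(hosthash); // never-crawled host: defer it
                i.remove();
                continue;
            }
            if (delta >= 1000) i.remove(); // long waiting time: drop for this round
        }
        // put at least one of the fresh hosts back so new hosts still get started
        if (!freshhosts.isEmpty()) roundRobinHostHashes.add(freshhosts.iterator().next());
    }

    public static void main(String[] args) {
        Set<String> hosts = new HashSet<>();
        Map<String, Integer> latencies = new HashMap<>();
        for (int k = 0; k < 15; k++) hosts.add("host" + k);
        latencies.put("host0", 0);    // ready now: kept
        latencies.put("host1", 2000); // long wait: dropped
        refreshRoundRobin(hosts, latencies);
        System.out.println(hosts.size() + " hosts kept: " + hosts);
    }
}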
@ -85,8 +85,8 @@ public final class HTTPLoader {
Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
return doc;
}
/**
* Open an input stream on a requested HTTP resource. When the resource content size is small
* (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}), fully load it and use a ByteArrayInputStream instance.
* @param request
@ -98,228 +98,231 @@ public final class HTTPLoader {
* @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
* @throws IOException when an error occurred
*/
public StreamResponse openInputStream(
final Request request, CrawlProfile profile, final int retryCount,
final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
) throws IOException {
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
}
DigestURL url = request.url();
final String host = url.getHost();
if (host == null || host.length() < 2) {
throw new IOException("host is not well-formed: '" + host + "'");
}
final String path = url.getFile();
int port = url.getPort();
final boolean ssl = url.getProtocol().equals("https");
if (port < 0)
port = (ssl) ? 443 : 80;
// check if url is in blacklist
final String hostlow = host.toLowerCase(Locale.ROOT);
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
"url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
// resolve yacy and yacyh domains
final AlternativeDomainNames yacyResolver = this.sb.peers;
if (yacyResolver != null) {
final String yAddress = yacyResolver.resolve(host);
if (yAddress != null) {
url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
}
}
// create a request header
final RequestHeader requestHeader = createRequestheader(request, agent);
// HTTP-Client
final HTTPClient client = new HTTPClient(agent);
client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
client.setTimout(this.socketTimeout);
client.setHeader(requestHeader.entrySet());
// send request
client.GET(url, false);
final StatusLine statusline = client.getHttpResponse().getStatusLine();
final int statusCode = statusline.getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
String requestURLString = request.url().toNormalform(true);
// check redirection
if (statusCode > 299 && statusCode < 310) {
client.finish();
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just
// loading the url. Check this:
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
// put redirect url on the crawler queue to repeat a
// double-check
/* We have to clone the request instance rather than modify its URL directly,
 * otherwise stackCrawl() would reject it, detecting it as already in the activeWorkerEntries */
Request redirectedRequest = new Request(request.initiator(),
redirectionUrl,
request.referrerhash(),
request.name(),
request.appdate(),
request.profileHandle(),
request.depth(),
request.timezoneOffset());
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
if(rejectReason != null) {
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
}
// in the end we must throw an exception (even if this is
// not an error, just to abort the current process)
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
+ redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
}
// if we are already doing a shutdown we don't need to retry
// crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException(
"CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
}
// check if the redirected URL is the same as the requested URL
// this shortcuts a time-out using retryCount
if (redirectionUrl.equals(url)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
}
// retry crawling with new url
request.redirectURL(redirectionUrl);
return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
} else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
// the transfer is ok
/*
* When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we had better cache it if the cache is enabled and the url is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
byte[] content = null;
try {
content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize);
Cache.store(url, responseHeader, content);
} catch (final IOException e) {
this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e);
} finally {
client.finish();
}
contentStream = new ByteArrayInputStream(content);
} else {
/*
* Content length may already be known now: check it before opening a stream
*/
if (maxFileSize >= 0 && contentLength > maxFileSize) {
throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes");
}
/*
* Create a HTTPInputStream delegating to
* client.getContentstream(). Close method will ensure client is
* properly closed.
*/
contentStream = new HTTPInputStream(client);
/* Anticipated content length may not be known yet or may be incorrect: let's apply now the same content size restriction as when loading into a byte array */
if(maxFileSize >= 0) {
contentStream = new StrictLimitInputStream(contentStream, maxFileSize,
"Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize));
}
}
return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
} else {
client.finish();
// if the response does not have the right status type, reject the file
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline
+ "' for URL '" + requestURLString + "'$");
}
}
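The functional addition in this method is the comparison of the redirection target with the requested URL: a server answering a 3xx that points back to the same URL would previously recurse through openInputStream() until the retry counter ran out, with one full HTTP round-trip per recursion. A reduced sketch of the guard, using java.net.URI in place of DigestURL:

import java.io.IOException;
import java.net.URI;

// Sketch of the self-redirect shortcut added to openInputStream(): fail fast
// when the Location target equals the URL that was just requested, instead of
// burning the whole retry budget one recursion at a time.
public class SelfRedirectGuardSketch {

    static URI follow(URI requested, URI location, int retryCount) throws IOException {
        if (retryCount < 0) throw new IOException("retry counter exceeded for URL " + requested);
        if (location.equals(requested)) {
            // the real code also pushes the URL to crawlQueues.errorURL here
            throw new IOException("redirect to same url: " + requested + ". Processing aborted.");
        }
        return location; // the caller retries the load with retryCount - 1
    }

    public static void main(String[] args) throws IOException {
        URI u = URI.create("http://example.org/a");
        System.out.println(follow(u, URI.create("http://example.org/b"), 2)); // ok
        try {
            follow(u, u, 2); // would otherwise recurse until retryCount < 0
        } catch (IOException e) {
            System.out.println("rejected: " + e.getMessage());
        }
    }
}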
/**
* Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null.
* @return redirect URL
* @throws IOException when an error occurred
*/
private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url,
final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString)
throws IOException {
// read redirection URL
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.TEMPORARY_NETWORK_FAILURE,
"no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode());
throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$");
}
// normalize URL
final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);
// restart crawling with new url
this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL "
+ requestURLString);
this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));
this.sb.webStructure.generateCitationReference(url, redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode());
}
return redirectionUrl;
}
/**
* Create request header for loading content.
* @param request search request
* @param agent agent identification information
* @return a request header
* @throws IOException when an error occurred
*/
private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent)
throws IOException {
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
if (request.referrerhash() != null) {
DigestURL refererURL = this.sb.getURL(request.referrerhash());
if (refererURL != null) {
requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
}
}
requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE,
this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET,
this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
requestHeader.put(HeaderFramework.ACCEPT_ENCODING,
this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
return requestHeader;
}
private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@ -347,10 +350,10 @@ public final class HTTPLoader {
// resolve yacy and yacyh domains
final AlternativeDomainNames yacyResolver = this.sb.peers;
if(yacyResolver != null) {
final String yAddress = yacyResolver.resolve(host);
if(yAddress != null) {
url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
}
}
// take a file from the net
@ -366,41 +369,39 @@ public final class HTTPLoader {
client.setHeader(requestHeader.entrySet());
// send request
final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false);
final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
String requestURLString = request.url().toNormalform(true);
// check redirection
if (statusCode > 299 && statusCode < 310) {
final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(),
responseHeader, requestURLString);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// we have two use cases here: loading from a crawl or just loading the url. Check this:
if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
// put redirect url on the crawler queue to repeat a double-check
/* We have to clone the request instance rather than modify its URL directly,
 * otherwise stackCrawl() would reject it, detecting it as already in the activeWorkerEntries */
Request redirectedRequest = new Request(request.initiator(),
redirectionUrl,
request.referrerhash(),
request.name(),
request.appdate(),
request.profileHandle(),
request.depth(),
request.timezoneOffset());
String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
// in the end we must throw an exception (even if this is not an error, just to abort the current process)
if(rejectReason != null) {
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
}
}
throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
}
}
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
@ -410,15 +411,15 @@ public final class HTTPLoader {
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
}
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (responseBody == null) {
// no response, reject file
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
// we write the new cache entry to file system directly
@ -427,8 +428,8 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
}
// create a new cache entry
@ -442,9 +443,9 @@ public final class HTTPLoader {
);
return response;
} else {
// if the response does not have the right status type, reject the file
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
}
}
@ -485,17 +486,17 @@ public final class HTTPLoader {
final HTTPClient client = new HTTPClient(agent);
client.setTimout(20000);
client.setHeader(requestHeader.entrySet());
final byte[] responseBody = client.GETbytes(request.url(), null, null, false);
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
if (responseBody != null && (code == 200 || code == 203)) {
// the transfer is ok
//statistics:
ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);
// we write the new cache entry to file system directly
@ -513,7 +514,7 @@ public final class HTTPLoader {
} else if (code > 299 && code < 310) {
if (header.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.isEmpty()) {
@ -535,7 +536,7 @@ public final class HTTPLoader {
}
} else {
// if the response does not have the right status type, reject the file
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
return response;
}

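openInputStream() above chooses between two delivery paths: a body whose announced Content-Length is positive and below Response.CRAWLER_MAX_SIZE_TO_CACHE is fully buffered (and written to the cache when the profile stores to HTCache and the URL is not local), while anything larger or of unknown size is streamed through a hard limit in the spirit of StrictLimitInputStream. A standalone sketch of that decision using plain java.io types; the size constant, the cache write and the limiting stream are simplified stand-ins:

import java.io.*;

// Size-based delivery from openInputStream(), reduced to plain streams:
// small bodies with a known length are buffered (the point where Cache.store()
// would run), large or unknown-size bodies are streamed under a hard limit.
public class BufferOrStreamSketch {
    static final long MAX_TO_CACHE = 2 * 1024 * 1024; // stand-in for Response.CRAWLER_MAX_SIZE_TO_CACHE

    static InputStream open(InputStream body, long contentLength, long maxFileSize) throws IOException {
        if (contentLength > 0 && contentLength < MAX_TO_CACHE) {
            byte[] content = body.readAllBytes(); // fully load; cacheable as one block
            return new ByteArrayInputStream(content);
        }
        if (maxFileSize >= 0 && contentLength > maxFileSize) {
            throw new IOException("Content to download exceeds maximum value of " + maxFileSize + " bytes");
        }
        // unknown or large size: stream, but enforce the limit while reading
        return maxFileSize >= 0 ? new LimitedStream(body, maxFileSize) : body;
    }

    // minimal stand-in for StrictLimitInputStream
    static class LimitedStream extends FilterInputStream {
        private long remaining;
        LimitedStream(InputStream in, long limit) { super(in); this.remaining = limit; }
        @Override public int read() throws IOException {
            int b = super.read();
            if (b >= 0 && --remaining < 0) throw new IOException("size limit exceeded");
            return b;
        }
        @Override public int read(byte[] b, int off, int len) throws IOException {
            int n = super.read(b, off, len);
            if (n > 0 && (remaining -= n) < 0) throw new IOException("size limit exceeded");
            return n;
        }
    }

    public static void main(String[] args) throws IOException {
        byte[] data = "hello".getBytes();
        InputStream s = open(new ByteArrayInputStream(data), data.length, 10);
        System.out.println(new String(s.readAllBytes()));
    }
}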
@ -399,7 +399,7 @@ public final class LoaderDispatcher {
// load resource from the internet
StreamResponse response;
if (protocol.equals("http") || protocol.equals("https")) {
response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent);
} else if (protocol.equals("ftp")) {
response = this.ftpLoader.openInputStream(request, true);
} else if (protocol.equals("smb")) {

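The LoaderDispatcher change raises the retry budget passed to openInputStream() from 1 to 2. Each followed redirect recurses with retryCount - 1 and a negative counter aborts the load, so the new budget allows a short redirect chain (for example http to https to the final location) while the self-redirect guard above keeps a looping server from consuming it. A toy illustration of the counting:

// Toy illustration of the retryCount budget: every redirect hop costs one
// retry, and retryCount < 0 aborts, so a budget of 2 allows two hops.
public class RetryBudgetSketch {
    static String load(String url, String[] chain, int hop, int retryCount) {
        if (retryCount < 0) throw new IllegalStateException("retry counter exceeded for URL " + url);
        if (hop < chain.length) {
            // mirrors openInputStream(request, profile, retryCount - 1, ...)
            return load(chain[hop], chain, hop + 1, retryCount - 1);
        }
        return "200 OK from " + url;
    }

    public static void main(String[] args) {
        String[] twoHops = { "https://example.org/", "https://example.org/index.html" };
        System.out.println(load("http://example.org/", twoHops, 0, 2)); // succeeds with budget 2
    }
}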