fixed a number of small bugs:

- better crawl start for file paths and SMB paths
- added a time-out wrapper for DNS resolving and reverse resolving to prevent blocking (sketched after this list)
- fixed intranet scanner result list checkboxes
- prevented HTCache usage for file and SMB crawling (not necessary, since the documents are available locally)
- fixed the RSS feed loader
- fixed the sitemap loader, which had not been restricted to single files (crawl depth must be zero)
- clearing of crawl result lists when a network switch is performed
- higher maximum file size for the crawler
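The time-out wrapper mentioned above follows the usual Callable/Future pattern: the potentially blocking lookup runs on its own executor thread and the caller waits a bounded time for the result (the change in net.yacy.cora.protocol.Domains below uses 500 ms). A minimal, self-contained sketch of that pattern, with class and method names that are illustrative only and not part of the YaCy API:

    import java.net.InetAddress;
    import java.net.UnknownHostException;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;

    // Hypothetical helper; the real implementation lives in net.yacy.cora.protocol.Domains.
    public class DnsTimeoutSketch {

        // Resolve a host name but give up after timeoutMillis; returns null on time-out or failure.
        public static InetAddress resolveWithTimeout(final String host, final long timeoutMillis) {
            final ExecutorService service = Executors.newSingleThreadExecutor();
            final Future<InetAddress> task = service.submit(new Callable<InetAddress>() {
                public InetAddress call() throws UnknownHostException {
                    // this call can block for a long time when the DNS server does not answer
                    return InetAddress.getByName(host);
                }
            });
            service.shutdown();
            try {
                return task.get(timeoutMillis, TimeUnit.MILLISECONDS);
            } catch (final Exception e) {
                task.cancel(true); // interrupt the blocked lookup thread
                return null;       // caller treats the host as not resolvable
            }
        }

        public static void main(final String[] args) {
            System.out.println(resolveWithTimeout("example.org", 500));
        }
    }

Returning null (or, in the reverse-lookup case, the raw host address) on time-out lets the crawler continue instead of hanging on an unresponsive DNS server.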

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7214 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent f6eebb6f99
commit 2c549ae341

@ -674,10 +674,10 @@ crawler.clientTimeout=9000
crawler.http.acceptEncoding=gzip
crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.http.maxFileSize=1048576
crawler.http.maxFileSize=10485760
# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=1048576
crawler.ftp.maxFileSize=10485760
# smb crawler specific settings: maximum size
crawler.smb.maxFileSize=100000000

@ -92,7 +92,6 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));

@ -188,15 +188,16 @@ public class CrawlResults {
try {
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
if (urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;
} else {
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
continue;
}
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().initiatorHash));
executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().executorHash));

@ -36,7 +36,7 @@
#(/notintranet)#
#(servertable)#::
<form id="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<form id="servertable" name="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Available Intranet Server</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">

@ -45,8 +45,7 @@
<span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"

@ -138,14 +138,13 @@ public class Crawler_p {
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
// set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = crawlingStartURL.isFile() ? "file:///.*" : crawlingStartURL.isSMB() ? "smb://.*" : ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
@ -203,7 +202,8 @@ public class Crawler_p {
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
if (crawlingStartURL.isFile() || crawlingStartURL.isSMB()) storeHTCache = false;
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
@ -247,15 +247,21 @@ public class Crawler_p {
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
(crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
@ -352,7 +358,8 @@ public class Crawler_p {
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
fileName,
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
@ -362,9 +369,10 @@ public class Crawler_p {
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@ -405,15 +413,21 @@ public class Crawler_p {
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
sitemapURLStr,
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
0,
crawlingIfOlder,
crawlingDomMaxPages,
true,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
@ -431,7 +445,7 @@ public class Crawler_p {
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
String title = scraper.getTitle();
// String title = scraper.getTitle();
// String description = scraper.getDescription();
// get links and generate filter
@ -444,7 +458,7 @@ public class Crawler_p {
// put links onto crawl queue
final CrawlProfile profile = new CrawlProfile(
title == null || title.length() == 0 ? sitelistURL.getHost() : title,
sitelistURL.getHost(),
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@ -455,9 +469,10 @@ public class Crawler_p {
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

@ -157,7 +157,6 @@ public class QuickCrawlLink_p {
indexText,
indexMedia,
storeHTCache,
true,
remoteIndexing,
xsstopw,
xdstopw,

@ -36,6 +36,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Digest;
@ -111,7 +112,7 @@ public class SettingsAck_p {
final serverCore theServerCore = (serverCore) env.getThread("10_httpd");
try {
final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port);
final String hostName = theNewAddress.getHostName();
final String hostName = Domains.getHostName(theNewAddress.getAddress());
prop.put("info_restart", "1");
prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName);
prop.put("info_restart_port", theNewAddress.getPort());

@ -26,11 +26,13 @@ public class getpageinfo_p {
prop.put("robots-allowed", "3"); //unknown
prop.put("sitemap", "");
prop.put("favicon","");
prop.put("sitelist", "");
prop.put("filter", ".*");
// default actions
String actions="title,robots";
if(post!=null && post.containsKey("url")){
if (post != null && post.containsKey("url")) {
if(post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
@ -97,7 +99,7 @@ public class getpageinfo_p {
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
if(actions.indexOf("robots")>=0){
if (actions.indexOf("robots")>=0) {
try {
final DigestURI theURL = new DigestURI(url, null);

@ -48,14 +48,14 @@ function handleResponse(){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;
if (sitemap) document.getElementById("sitemap").disabled=false;
}
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
document.getElementById("sitelist").disabled=false;
if (sitelist) document.getElementById("sitelist").disabled=false;
// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);

@ -53,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
@ -64,17 +63,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern mustmatch = null, mustnotmatch = null;
public CrawlProfile(final String name, final DigestURI startURL,
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean indexText,
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final boolean xsstopw,
final boolean xdstopw,
final boolean xpstopw,
final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
@ -91,7 +95,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(STORE_TXCACHE, storeTXCache);
put(REMOTE_INDEXING, remoteIndexing);
put(XSSTOPW, xsstopw); // exclude static stop-words
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
@ -218,11 +221,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
final String r = get(STORE_TXCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;

@ -170,7 +170,7 @@ public final class CrawlSwitchboard {
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile);
@ -178,38 +178,38 @@ public final class CrawlSwitchboard {
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
-1, -1, true, true, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
}
}

@ -159,6 +159,10 @@ public final class ResultURLs {
return resultDomains.get(stack);
}
public void clearStacks() {
for (EventOrigin origin: EventOrigin.values()) clearStack(origin);
}
public synchronized void clearStack(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear();

@ -118,7 +118,11 @@ public final class HTTPLoader {
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
if (responseBody != null && (code == 200 || code == 203)) {
if (responseBody == null) {
// no response, reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)");
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
} else if (code == 200 || code == 203) {
// the transfer is ok
// we write the new cache entry to file system directly
@ -180,7 +184,7 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
return response;

@ -858,6 +858,9 @@ public final class Switchboard extends serverSwitch {
this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
this.networkRoot.mkdirs();
this.queuesRoot.mkdirs();
// clear statistic data
this.crawlResults.clearStacks();
// relocate
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object

@ -225,22 +225,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// parse file url
String h = url.substring(p + 1);
if (h.startsWith("//")) {
// host may be given, but may be also empty
final int q = h.indexOf('/', 2);
if (q <= 0) {
// no host given
host = null;
path = h.substring(2);
} else {
host = h.substring(2, q);
if (host.length() == 0 || host.equals("localhost")) host = null;
h = h.substring(q);
char c = h.charAt(2);
if (c == ':' || c == '|')
path = h.substring(1);
else
path = h;
}
// no host given
host = null;
path = h.substring(2);
} else {
host = null;
if (h.length() > 0 && h.charAt(0) == '/') {

@ -23,11 +23,20 @@ package net.yacy.cora.protocol;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import net.yacy.cora.storage.ARC;
@ -454,6 +463,80 @@ public class Domains {
return false;
}
public static String getHostName(final InetAddress i) {
Collection<String> hosts = nameCacheHit.getKeys(i);
if (hosts.size() > 0) return hosts.iterator().next();
// call i.getHostName() using concurrency to interrupt execution in case of a time-out
final Callable<String> callable = new Callable<String>() {
public String call() { return i.getHostName(); }
};
ExecutorService service = Executors.newSingleThreadExecutor();
final Future<String> taskFuture = service.submit(callable);
Runnable t = new Runnable() {
public void run() { taskFuture.cancel(true); }
};
service.execute(t);
service.shutdown();
try {
return taskFuture.get(500, TimeUnit.MILLISECONDS);
} catch (CancellationException e) {
// callable was interrupted
return i.getHostAddress();
} catch (InterruptedException e) {
// service was shutdown
return i.getHostAddress();
} catch(ExecutionException e) {
// callable failed unexpectedly
return i.getHostAddress();
} catch (TimeoutException e) {
// time-out
return i.getHostAddress();
}
}
public static InetAddress dnsResolve(final String hostx) {
if ((hostx == null) || (hostx.length() == 0)) return null;
final String host = hostx.toLowerCase().trim();
// try to simply parse the address
InetAddress ip = parseInetAddress(host);
if (ip != null) return ip;
// try to resolve host by doing a name cache lookup
ip = nameCacheHit.get(host);
if (ip != null) return ip;
if (nameCacheMiss.containsKey(host)) return null;
// call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out
final Callable<InetAddress> callable = new Callable<InetAddress>() {
public InetAddress call() { return dnsResolveNetBased(host); }
};
ExecutorService service = Executors.newSingleThreadExecutor();
final Future<InetAddress> taskFuture = service.submit(callable);
Runnable t = new Runnable() {
public void run() { taskFuture.cancel(true); }
};
service.execute(t);
service.shutdown();
try {
return taskFuture.get(500, TimeUnit.MILLISECONDS);
} catch (CancellationException e) {
// callable was interrupted
return null;
} catch (InterruptedException e) {
// service was shutdown
return null;
} catch(ExecutionException e) {
// callable failed unexpectedly
return null;
} catch (TimeoutException e) {
// time-out
return null;
}
}
private static final InetAddress parseInetAddress(final String ip) {
if (ip == null) return null;
if (ip.length() < 8) return null;
@ -474,33 +557,21 @@ public class Domains {
return null;
}
}
public static InetAddress dnsResolve(String host) {
if ((host == null) || (host.length() == 0)) return null;
host = host.toLowerCase().trim();
// try to simply parse the address
InetAddress ip = parseInetAddress(host);
if (ip != null) return ip;
// try to resolve host by doing a name cache lookup
ip = nameCacheHit.get(host);
if (ip != null) return ip;
if (nameCacheMiss.containsKey(host)) return null;
//System.out.println("***DEBUG dnsResolve(" + host + ")");
private static InetAddress dnsResolveNetBased(String host) {
try {
boolean doCaching = true;
ip = InetAddress.getByName(host); // this makes the DNS request to backbone
InetAddress ip = InetAddress.getByName(host); // this makes the DNS request to backbone
if ((ip == null) ||
(ip.isLoopbackAddress()) ||
(nameCacheNoCachingList.containsKey(host))
) {
doCaching = false;
} else {
if (matchesList(host, nameCacheNoCachingPatterns)) {
nameCacheNoCachingList.put(host, PRESENT);
if (matchesList(host, nameCacheNoCachingPatterns)) {
nameCacheNoCachingList.put(host, PRESENT);
doCaching = false;
}
}
}
if (doCaching && ip != null) {
@ -519,6 +590,7 @@ public class Domains {
return null;
}
/**
* Returns the number of entries in the nameCacheHit map
*
@ -565,7 +637,7 @@ public class Domains {
public void run() {
String lhn = localHostName;
try {
lhn = InetAddress.getLocalHost().getHostName();
lhn = getHostName(InetAddress.getLocalHost());
} catch (UnknownHostException e) {}
try {
localHostAddresses = InetAddress.getAllByName(lhn);
@ -656,7 +728,8 @@ public class Domains {
// finally check if there are other local IP addresses that are not in
// the standard IP range
for (int i = 0; i < localHostAddresses.length; i++) {
if (localHostAddresses[i].getHostName().equals(host)) return true;
String hostname = getHostName(localHostAddresses[i]);
if (hostname != null && hostname.equals(host)) return true;
if (localHostAddresses[i].getHostAddress().equals(host)) return true;
}

@ -121,7 +121,8 @@ public class Scanner extends Thread {
private void addProtocol(String protocol, boolean bigrange) {
for (InetAddress i: genlist(bigrange)) {
try {
this.scanqueue.put(new MultiProtocolURI(protocol + "://" + i.getHostName() + "/"));
this.scanqueue.put(new MultiProtocolURI(protocol + "://" + Domains.getHostName(i) + "/"));
} catch (MalformedURLException e) {
Log.logException(e);
} catch (InterruptedException e) {

@ -21,6 +21,7 @@
package net.yacy.cora.storage;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -62,14 +63,21 @@ public interface ARC<K, V> extends Iterable<Map.Entry<K, V>> {
* @return the value
*/
public V get(K s);
/**
* check if the map contains the value
* @param value
* @return the keys that have the given value
*/
public Collection<K> getKeys(V value);
/**
* check if the map contains the key
* @param s
* @return
* @param key
* @return true if the map contains the key
*/
public boolean containsKey(K s);
public boolean containsKey(K key);
/**
* remove an entry from the cache
* @param s

@ -21,6 +21,8 @@
package net.yacy.cora.storage;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
@ -105,6 +107,17 @@ public final class ConcurrentARC<K, V> extends AbstractMap<K, V> implements Map<
return this.arc[getPartition(s)].get((K) s);
}
/**
* check if the map contains the value
* @param value
* @return the keys that have the given value
*/
public Collection<K> getKeys(V value) {
ArrayList<K> keys = new ArrayList<K>();
for (int i = 0; i < this.arc.length; i++) keys.addAll(this.arc[i].getKeys(value));
return keys;
}
/**
* check if the map contains the key
* @param s

@ -22,6 +22,8 @@
package net.yacy.cora.storage;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
@ -98,6 +100,26 @@ abstract class SimpleARC<K, V> extends AbstractMap<K, V> implements Map<K, V>, I
assert (this.levelB.size() <= cacheSize); // the cache should shrink automatically
return v;
}
/**
* check if the map contains the value
* @param value
* @return the keys that have the given value
*/
public Collection<K> getKeys(V value) {
ArrayList<K> keys = new ArrayList<K>();
synchronized (this.levelB) {
for (Map.Entry<K, V> entry: this.levelB.entrySet()) {
if (value.equals(entry.getValue())) keys.add(entry.getKey());
}
}
synchronized (this) {
for (Map.Entry<K, V> entry: this.levelA.entrySet()) {
if (value.equals(entry.getValue())) keys.add(entry.getKey());
}
}
return keys;
}
/**
* check if the map contains the key

@ -305,6 +305,7 @@ public class Table implements Index, Iterable<Row.Entry> {
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
}
final HashMap<String, String> map = new HashMap<String, String>(8);
if (index == null) return map; // possibly closed or being closed
map.put("tableSize", Integer.toString(index.size()));
map.put("tableKeyChunkSize", Integer.toString(index.row().objectsize));
map.put("tableKeyMem", Integer.toString(index.row().objectsize * index.size()));
