From 2c549ae3412a497a6181afa442e8af2f815041dd Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 30 Sep 2010 23:57:58 +0000 Subject: [PATCH] fixed a number of small bugs: - better crawl start for file paths and smb paths - added time-out wrapper for dns resolving and reverse resolving to prevent blocking - fixed intranet scanner result list check boxes - prevented htcache usage in case of file and smb crawling (not necessary, documents are locally available) - fixed rss feed loader - fixed sitemap loader, which had not been restricted to single files (crawl-depth must be zero) - clearing of crawl result lists when a network switch was done - higher maximum file size for crawler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7214 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 4 +- htroot/CrawlProfileEditor_p.java | 1 - htroot/CrawlResults.java | 11 +- htroot/CrawlStartIntranet_p.html | 2 +- htroot/CrawlStartSite_p.html | 3 +- htroot/Crawler_p.java | 57 +++++---- htroot/QuickCrawlLink_p.java | 1 - htroot/SettingsAck_p.java | 3 +- htroot/api/util/getpageinfo_p.java | 6 +- htroot/js/IndexCreate.js | 4 +- source/de/anomic/crawler/CrawlProfile.java | 20 ++-- .../de/anomic/crawler/CrawlSwitchboard.java | 14 +-- source/de/anomic/crawler/ResultURLs.java | 4 + .../anomic/crawler/retrieval/HTTPLoader.java | 8 +- source/de/anomic/search/Switchboard.java | 3 + .../yacy/cora/document/MultiProtocolURI.java | 19 +-- source/net/yacy/cora/protocol/Domains.java | 113 ++++++++++++++---- source/net/yacy/cora/protocol/Scanner.java | 3 +- source/net/yacy/cora/storage/ARC.java | 16 ++- .../net/yacy/cora/storage/ConcurrentARC.java | 13 ++ source/net/yacy/cora/storage/SimpleARC.java | 22 ++++ source/net/yacy/kelondro/table/Table.java | 1 + 22 files changed, 229 insertions(+), 99 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 5ca48584a..7a1f7bbf4 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -674,10 +674,10 @@ crawler.clientTimeout=9000 crawler.http.acceptEncoding=gzip crawler.http.acceptLanguage=en-us,en;q=0.5 crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7 -crawler.http.maxFileSize=1048576 +crawler.http.maxFileSize=10485760 # ftp crawler specific settings; size in bytes -crawler.ftp.maxFileSize=1048576 +crawler.ftp.maxFileSize=10485760 # smb crawler specific settings: maximum size crawler.smb.maxFileSize=100000000 diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 50f0cf8e2..1c3ef1c0b 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -92,7 +92,6 @@ public class CrawlProfileEditor_p { labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN)); - labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index d5e37280a..484f40da9 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -188,15 +188,16 @@ public class CrawlResults { try { urle =
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0); if (urle == null) { - Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey()); + Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey()); urlstr = null; urltxt = null; metadata = null; - } else { - metadata = urle.metadata(); - urlstr = metadata.url().toNormalform(false, true); - urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL + continue; } + metadata = urle.metadata(); + urlstr = metadata.url().toNormalform(false, true); + urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL + initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().initiatorHash)); executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().executorHash)); diff --git a/htroot/CrawlStartIntranet_p.html b/htroot/CrawlStartIntranet_p.html index f1faa9b61..e85dafdad 100644 --- a/htroot/CrawlStartIntranet_p.html +++ b/htroot/CrawlStartIntranet_p.html @@ -36,7 +36,7 @@ #(/notintranet)# #(servertable)#:: -
+
diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index 153f752e8..6871dd735 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -45,8 +45,7 @@ empty - +
Link-List of URLLink-List of URL
0) { newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; @@ -203,7 +202,8 @@ public class Crawler_p { final boolean indexMedia = post.get("indexMedia", "off").equals("on"); env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); - final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); + boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); + if (crawlingStartURL.isFile() || crawlingStartURL.isSMB()) storeHTCache = false; env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); final String cachePolicyString = post.get("cachePolicy", "iffresh"); @@ -247,15 +247,21 @@ public class Crawler_p { // stack url sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it final CrawlProfile pe = new CrawlProfile( - (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), + (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(), crawlingStartURL, newcrawlingMustMatch, newcrawlingMustNotMatch, newcrawlingdepth, - crawlingIfOlder, crawlingDomMaxPages, + crawlingIfOlder, + crawlingDomMaxPages, crawlingQ, indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); + storeHTCache, + crawlOrder, + xsstopw, + xdstopw, + xpstopw, + cachePolicy); sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); final String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), @@ -352,7 +358,8 @@ public class Crawler_p { final Map hyperlinks = scraper.getAnchors(); final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); final CrawlProfile profile = new CrawlProfile( - fileName, crawlURL, + fileName, + crawlURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, newcrawlingdepth, @@ -362,9 +369,10 @@ public class Crawler_p { indexText, indexMedia, storeHTCache, - true, crawlOrder, - xsstopw, xdstopw, xpstopw, + xsstopw, + xdstopw, + xpstopw, cachePolicy); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); @@ -405,15 +413,21 @@ public class Crawler_p { try { final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); final CrawlProfile pe = new CrawlProfile( - sitemapURLStr, sitemapURL, - newcrawlingMustMatch, + sitemapURLStr, + sitemapURL, + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, - newcrawlingdepth, - crawlingIfOlder, crawlingDomMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, - xsstopw, xdstopw, xpstopw, + 0, + crawlingIfOlder, + crawlingDomMaxPages, + true, + indexText, + indexMedia, + storeHTCache, + crawlOrder, + xsstopw, + xdstopw, + xpstopw, cachePolicy); sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe); @@ -431,7 +445,7 @@ public class Crawler_p { // download document ContentScraper scraper = null; scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH); - String title = scraper.getTitle(); + // String title = scraper.getTitle(); // String description = scraper.getDescription(); // get links and generate filter @@ -444,7 +458,7 @@ public class Crawler_p { // put links onto crawl queue final CrawlProfile profile = new CrawlProfile( - title == null || title.length() == 0 ? 
sitelistURL.getHost() : title, + sitelistURL.getHost(), sitelistURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, @@ -455,9 +469,10 @@ public class Crawler_p { indexText, indexMedia, storeHTCache, - true, crawlOrder, - xsstopw, xdstopw, xpstopw, + xsstopw, + xdstopw, + xpstopw, cachePolicy); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index c470db791..043c26159 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -157,7 +157,6 @@ public class QuickCrawlLink_p { indexText, indexMedia, storeHTCache, - true, remoteIndexing, xsstopw, xdstopw, diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 32d974486..f9b3df8d3 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -36,6 +36,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Digest; @@ -111,7 +112,7 @@ public class SettingsAck_p { final serverCore theServerCore = (serverCore) env.getThread("10_httpd"); try { final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port); - final String hostName = theNewAddress.getHostName(); + final String hostName = Domains.getHostName(theNewAddress.getAddress()); prop.put("info_restart", "1"); prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName); prop.put("info_restart_port", theNewAddress.getPort()); diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index 89bc7ad8e..882c6601b 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -26,11 +26,13 @@ public class getpageinfo_p { prop.put("robots-allowed", "3"); //unknown prop.put("sitemap", ""); prop.put("favicon",""); + prop.put("sitelist", ""); + prop.put("filter", ".*"); // default actions String actions="title,robots"; - if(post!=null && post.containsKey("url")){ + if (post != null && post.containsKey("url")) { if(post.containsKey("actions")) actions=post.get("actions"); String url=post.get("url"); @@ -97,7 +99,7 @@ public class getpageinfo_p { prop.putXML("filter", filter.length() > 0 ? 
filter.substring(1) : ".*"); } } - if(actions.indexOf("robots")>=0){ + if (actions.indexOf("robots")>=0) { try { final DigestURI theURL = new DigestURI(url, null); diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js index b411f2261..ab7a72333 100644 --- a/htroot/js/IndexCreate.js +++ b/htroot/js/IndexCreate.js @@ -48,14 +48,14 @@ function handleResponse(){ sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue; } document.getElementsByName("sitemapURL")[0].value=sitemap; - document.getElementById("sitemap").disabled=false; + if (sitemap) document.getElementById("sitemap").disabled=false; } sitelist=""; if (response.getElementsByTagName("sitelist")[0].firstChild!=null){ sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue; } document.getElementById("sitelistURLs").innerHTML = sitelist; - document.getElementById("sitelist").disabled=false; + if (sitelist) document.getElementById("sitelist").disabled=false; // clear the ajax image document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 23e26fa9d..08c028c3b 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -53,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String INDEX_TEXT = "indexText"; public static final String INDEX_MEDIA = "indexMedia"; public static final String STORE_HTCACHE = "storeHTCache"; - public static final String STORE_TXCACHE = "storeTXCache"; public static final String REMOTE_INDEXING = "remoteIndexing"; public static final String XSSTOPW = "xsstopw"; public static final String XDSTOPW = "xdstopw"; @@ -64,17 +63,22 @@ public class CrawlProfile extends ConcurrentHashMap implements M private Pattern mustmatch = null, mustnotmatch = null; - public CrawlProfile(final String name, final DigestURI startURL, + public CrawlProfile( + final String name, + final DigestURI startURL, final String mustmatch, final String mustnotmatch, final int depth, final long recrawlIfOlder /*date*/, final int domMaxPages, final boolean crawlingQ, - final boolean indexText, final boolean indexMedia, - final boolean storeHTCache, final boolean storeTXCache, + final boolean indexText, + final boolean indexMedia, + final boolean storeHTCache, final boolean remoteIndexing, - final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, + final boolean xsstopw, + final boolean xdstopw, + final boolean xpstopw, final CacheStrategy cacheStrategy) { super(40); if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); @@ -91,7 +95,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(INDEX_TEXT, indexText); put(INDEX_MEDIA, indexMedia); put(STORE_HTCACHE, storeHTCache); - put(STORE_TXCACHE, storeTXCache); put(REMOTE_INDEXING, remoteIndexing); put(XSSTOPW, xsstopw); // exclude static stop-words put(XDSTOPW, xdstopw); // exclude dynamic stop-word @@ -218,11 +221,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - public boolean storeTXCache() { - final String r = get(STORE_TXCACHE); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } public boolean remoteIndexing() { final String r = get(REMOTE_INDEXING); if (r == null) return false; diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java 
b/source/de/anomic/crawler/CrawlSwitchboard.java index f90b0f40b..025369ba4 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -170,7 +170,7 @@ public final class CrawlSwitchboard { CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, - true, true, + true, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile); @@ -178,38 +178,38 @@ public final class CrawlSwitchboard { if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + -1, -1, true, true, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); 
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile); } } diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index ca2e9d70e..27bbac4cc 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -159,6 +159,10 @@ public final class ResultURLs { return resultDomains.get(stack); } + public void clearStacks() { + for (EventOrigin origin: EventOrigin.values()) clearStack(origin); + } + public synchronized void clearStack(final EventOrigin stack) { final Map resultStack = getStack(stack); if (resultStack != null) resultStack.clear(); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 9d26b262b..c0bb1ef8f 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -118,7 +118,11 @@ public final class HTTPLoader { // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true - if (responseBody != null && (code == 200 || code == 203)) { + if (responseBody == null) { + // no response, reject file + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)"); + throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + } else if (code == 200 || code == 203) { // the transfer is ok // we write the new cache entry to file system directly @@ -180,7 +184,7 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")"); + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code); throw new IOException("REJECTED WRONG STATUS TYPE '" 
+ client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } return response; diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 09528623d..a8f5f01c8 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -858,6 +858,9 @@ public final class Switchboard extends serverSwitch { this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES"); this.networkRoot.mkdirs(); this.queuesRoot.mkdirs(); + + // clear statistic data + this.crawlResults.clearStacks(); // relocate this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 685097b01..10dbdb5de 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -225,22 +225,9 @@ public class MultiProtocolURI implements Serializable, Comparable 0 && h.charAt(0) == '/') { diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index c42f76371..f415a930b 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -23,11 +23,20 @@ package net.yacy.cora.protocol; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.regex.Pattern; import net.yacy.cora.storage.ARC; @@ -454,6 +463,80 @@ public class Domains { return false; } + public static String getHostName(final InetAddress i) { + Collection hosts = nameCacheHit.getKeys(i); + if (hosts.size() > 0) return hosts.iterator().next(); + + // call i.getHostName() using concurrency to interrupt execution in case of a time-out + final Callable callable = new Callable() { + public String call() { return i.getHostName(); } + }; + ExecutorService service = Executors.newSingleThreadExecutor(); + final Future taskFuture = service.submit(callable); + Runnable t = new Runnable() { + public void run() { taskFuture.cancel(true); } + }; + service.execute(t); + service.shutdown(); + try { + return taskFuture.get(500, TimeUnit.MILLISECONDS); + } catch (CancellationException e) { + // callable was interrupted + return i.getHostAddress(); + } catch (InterruptedException e) { + // service was shutdown + return i.getHostAddress(); + } catch(ExecutionException e) { + // callable failed unexpectedly + return i.getHostAddress(); + } catch (TimeoutException e) { + // time-out + return i.getHostAddress(); + } + } + + public static InetAddress dnsResolve(final String hostx) { + if ((hostx == null) || (hostx.length() == 0)) return null; + final String host = hostx.toLowerCase().trim(); + // try to simply parse the address + InetAddress ip = parseInetAddress(host); + if (ip != null) return ip; + + // try to resolve host by doing a name cache lookup + ip = nameCacheHit.get(host); + 
if (ip != null) return ip; + + if (nameCacheMiss.containsKey(host)) return null; + + // call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out + final Callable callable = new Callable() { + public InetAddress call() { return dnsResolveNetBased(host); } + }; + ExecutorService service = Executors.newSingleThreadExecutor(); + final Future taskFuture = service.submit(callable); + Runnable t = new Runnable() { + public void run() { taskFuture.cancel(true); } + }; + service.execute(t); + service.shutdown(); + try { + return taskFuture.get(500, TimeUnit.MILLISECONDS); + } catch (CancellationException e) { + // callable was interrupted + return null; + } catch (InterruptedException e) { + // service was shutdown + return null; + } catch(ExecutionException e) { + // callable failed unexpectedly + return null; + } catch (TimeoutException e) { + // time-out + return null; + } + } + + private static final InetAddress parseInetAddress(final String ip) { if (ip == null) return null; if (ip.length() < 8) return null; @@ -474,33 +557,21 @@ public class Domains { return null; } } - - public static InetAddress dnsResolve(String host) { - if ((host == null) || (host.length() == 0)) return null; - host = host.toLowerCase().trim(); - // try to simply parse the address - InetAddress ip = parseInetAddress(host); - if (ip != null) return ip; - - // try to resolve host by doing a name cache lookup - ip = nameCacheHit.get(host); - if (ip != null) return ip; - - if (nameCacheMiss.containsKey(host)) return null; - //System.out.println("***DEBUG dnsResolve(" + host + ")"); + + private static InetAddress dnsResolveNetBased(String host) { try { boolean doCaching = true; - ip = InetAddress.getByName(host); // this makes the DNS request to backbone + InetAddress ip = InetAddress.getByName(host); // this makes the DNS request to backbone if ((ip == null) || (ip.isLoopbackAddress()) || (nameCacheNoCachingList.containsKey(host)) ) { doCaching = false; } else { - if (matchesList(host, nameCacheNoCachingPatterns)) { - nameCacheNoCachingList.put(host, PRESENT); + if (matchesList(host, nameCacheNoCachingPatterns)) { + nameCacheNoCachingList.put(host, PRESENT); doCaching = false; - } + } } if (doCaching && ip != null) { @@ -519,6 +590,7 @@ public class Domains { return null; } + /** * Returns the number of entries in the nameCacheHit map * @@ -565,7 +637,7 @@ public class Domains { public void run() { String lhn = localHostName; try { - lhn = InetAddress.getLocalHost().getHostName(); + lhn = getHostName(InetAddress.getLocalHost()); } catch (UnknownHostException e) {} try { localHostAddresses = InetAddress.getAllByName(lhn); @@ -656,7 +728,8 @@ public class Domains { // finally check if there are other local IP addresses that are not in // the standard IP range for (int i = 0; i < localHostAddresses.length; i++) { - if (localHostAddresses[i].getHostName().equals(host)) return true; + String hostname = getHostName(localHostAddresses[i]); + if (hostname != null && hostname.equals(host)) return true; if (localHostAddresses[i].getHostAddress().equals(host)) return true; } diff --git a/source/net/yacy/cora/protocol/Scanner.java b/source/net/yacy/cora/protocol/Scanner.java index d37088f4b..578068044 100644 --- a/source/net/yacy/cora/protocol/Scanner.java +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -121,7 +121,8 @@ public class Scanner extends Thread { private void addProtocol(String protocol, boolean bigrange) { for (InetAddress i: genlist(bigrange)) { try { - this.scanqueue.put(new 
MultiProtocolURI(protocol + "://" + i.getHostName() + "/")); + + this.scanqueue.put(new MultiProtocolURI(protocol + "://" + Domains.getHostName(i) + "/")); } catch (MalformedURLException e) { Log.logException(e); } catch (InterruptedException e) { diff --git a/source/net/yacy/cora/storage/ARC.java b/source/net/yacy/cora/storage/ARC.java index 47aebc939..d0cec8065 100644 --- a/source/net/yacy/cora/storage/ARC.java +++ b/source/net/yacy/cora/storage/ARC.java @@ -21,6 +21,7 @@ package net.yacy.cora.storage; +import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.Set; @@ -62,14 +63,21 @@ public interface ARC extends Iterable> { * @return the value */ public V get(K s); + + /** + * check if the map contains the value + * @param value + * @return the keys that have the given value + */ + public Collection getKeys(V value); /** * check if the map contains the key - * @param s - * @return + * @param key + * @return true if the map contains the key */ - public boolean containsKey(K s); - + public boolean containsKey(K key); + /** * remove an entry from the cache * @param s diff --git a/source/net/yacy/cora/storage/ConcurrentARC.java b/source/net/yacy/cora/storage/ConcurrentARC.java index d339f0d0a..741da71bb 100644 --- a/source/net/yacy/cora/storage/ConcurrentARC.java +++ b/source/net/yacy/cora/storage/ConcurrentARC.java @@ -21,6 +21,8 @@ package net.yacy.cora.storage; import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; @@ -105,6 +107,17 @@ public final class ConcurrentARC extends AbstractMap implements Map< return this.arc[getPartition(s)].get((K) s); } + /** + * check if the map contains the value + * @param value + * @return the keys that have the given value + */ + public Collection getKeys(V value) { + ArrayList keys = new ArrayList(); + for (int i = 0; i < this.arc.length; i++) keys.addAll(this.arc[i].getKeys(value)); + return keys; + } + /** * check if the map contains the key * @param s diff --git a/source/net/yacy/cora/storage/SimpleARC.java b/source/net/yacy/cora/storage/SimpleARC.java index 7e5af9c18..9552d2200 100644 --- a/source/net/yacy/cora/storage/SimpleARC.java +++ b/source/net/yacy/cora/storage/SimpleARC.java @@ -22,6 +22,8 @@ package net.yacy.cora.storage; import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.Map; @@ -98,6 +100,26 @@ abstract class SimpleARC extends AbstractMap implements Map, I assert (this.levelB.size() <= cacheSize); // the cache should shrink automatically return v; } + + /** + * check if the map contains the value + * @param value + * @return the keys that have the given value + */ + public Collection getKeys(V value) { + ArrayList keys = new ArrayList(); + synchronized (this.levelB) { + for (Map.Entry entry: this.levelB.entrySet()) { + if (value.equals(entry.getValue())) keys.add(entry.getKey()); + } + } + synchronized (this) { + for (Map.Entry entry: this.levelA.entrySet()) { + if (value.equals(entry.getValue())) keys.add(entry.getKey()); + } + } + return keys; + } /** * check if the map contains the key diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java index 4a35b2d7f..5794f5ab2 100644 --- a/source/net/yacy/kelondro/table/Table.java +++ b/source/net/yacy/kelondro/table/Table.java @@ -305,6 +305,7 @@ public class Table implements Index, Iterable { 
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size(); } final HashMap map = new HashMap(8); + if (index == null) return map; // possibly closed or being closed map.put("tableSize", Integer.toString(index.size())); map.put("tableKeyChunkSize", Integer.toString(index.row().objectsize)); map.put("tableKeyMem", Integer.toString(index.row().objectsize * index.size()));
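
For reference, the core new technique in this patch is the time-out wrapper around the blocking name-service calls in source/net/yacy/cora/protocol/Domains.java (dnsResolve and getHostName). The sketch below shows the underlying pattern in isolation; the class and method names are illustrative only and are not part of the patch, and the 500 ms deadline simply mirrors the value used in the patch.

import java.net.InetAddress;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class TimeoutResolveSketch {

    // Resolve a host name, but never block the caller for more than 500 ms.
    // The blocking InetAddress.getByName() call runs on a throw-away worker
    // thread; if it does not finish in time the caller gets null and the
    // worker is interrupted and abandoned.
    public static InetAddress resolveWithTimeout(final String host) {
        final ExecutorService service = Executors.newSingleThreadExecutor();
        final Future<InetAddress> task = service.submit(new Callable<InetAddress>() {
            public InetAddress call() throws Exception {
                return InetAddress.getByName(host); // may hang on a slow or dead DNS server
            }
        });
        service.shutdown(); // accept no new tasks; the submitted one still runs
        try {
            return task.get(500, TimeUnit.MILLISECONDS);
        } catch (TimeoutException e) {
            return null; // deadline exceeded
        } catch (InterruptedException e) {
            return null; // caller was interrupted while waiting
        } catch (ExecutionException e) {
            return null; // lookup failed, e.g. UnknownHostException
        } finally {
            task.cancel(true);     // interrupt the lookup if it is still running
            service.shutdownNow(); // release the worker thread
        }
    }

    public static void main(final String[] args) {
        System.out.println(resolveWithTimeout(args.length > 0 ? args[0] : "localhost"));
    }
}

Running the lookup on a throw-away single-thread executor keeps a stalled DNS query from blocking crawler threads; the cost is one short-lived worker thread per uncached lookup.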
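
The patch also extends the ARC cache interface (ARC, ConcurrentARC, SimpleARC) with getKeys(value), which lets Domains.getHostName answer reverse lookups from the existing forward cache (host name to address) before falling back to the network. The following stand-alone sketch illustrates that reverse-lookup idea; it assumes a plain ConcurrentHashMap in place of YaCy's ARC classes, so names and types here are illustrative only.

import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class ReverseLookupSketch {

    // forward cache: host name -> textual IP address (stand-in for YaCy's nameCacheHit ARC)
    private static final Map<String, String> nameCache = new ConcurrentHashMap<String, String>();

    // Return all cached host names that map to the given address (possibly empty).
    public static Collection<String> getKeys(final String address) {
        final ArrayList<String> keys = new ArrayList<String>();
        for (final Map.Entry<String, String> entry : nameCache.entrySet()) {
            if (address.equals(entry.getValue())) keys.add(entry.getKey());
        }
        return keys;
    }

    public static void main(final String[] args) {
        nameCache.put("example.org", "192.0.2.10");
        nameCache.put("www.example.org", "192.0.2.10");
        // a reverse lookup that never touches the network when the forward cache already knows the answer
        System.out.println(getKeys("192.0.2.10")); // both host names, order not guaranteed
    }
}

The scan is linear in the cache size, which is acceptable here because the ARC caches are bounded; for a large or unbounded map a second, inverted index would be the better choice.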