From 2c549ae3412a497a6181afa442e8af2f815041dd Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 30 Sep 2010 23:57:58 +0000 Subject: [PATCH] fixed a number of small bugs: - better crawl start for file paths and smb paths - added time-out wrapper for dns resolving and reverse resolving to prevent blocking - fixed intranet scanner result list check boxes - prevented htcache usage in case of file and smb crawling (not necessary, documents are locally available) - fixed rss feed loader - fixed sitemap loader, which had not been restricted to single files (crawl-depth must be zero) - clearing of crawl result lists when a network switch was done - higher maximum file size for crawler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7214 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 4 +- htroot/CrawlProfileEditor_p.java | 1 - htroot/CrawlResults.java | 11 +- htroot/CrawlStartIntranet_p.html | 2 +- htroot/CrawlStartSite_p.html | 3 +- htroot/Crawler_p.java | 57 +++++---- htroot/QuickCrawlLink_p.java | 1 - htroot/SettingsAck_p.java | 3 +- htroot/api/util/getpageinfo_p.java | 6 +- htroot/js/IndexCreate.js | 4 +- source/de/anomic/crawler/CrawlProfile.java | 20 ++-- .../de/anomic/crawler/CrawlSwitchboard.java | 14 +-- source/de/anomic/crawler/ResultURLs.java | 4 + .../anomic/crawler/retrieval/HTTPLoader.java | 8 +- source/de/anomic/search/Switchboard.java | 3 + .../yacy/cora/document/MultiProtocolURI.java | 19 +-- source/net/yacy/cora/protocol/Domains.java | 113 ++++++++++++++---- source/net/yacy/cora/protocol/Scanner.java | 3 +- source/net/yacy/cora/storage/ARC.java | 16 ++- .../net/yacy/cora/storage/ConcurrentARC.java | 13 ++ source/net/yacy/cora/storage/SimpleARC.java | 22 ++++ source/net/yacy/kelondro/table/Table.java | 1 + 22 files changed, 229 insertions(+), 99 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 5ca48584a..7a1f7bbf4 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -674,10 +674,10 @@ crawler.clientTimeout=9000 crawler.http.acceptEncoding=gzip crawler.http.acceptLanguage=en-us,en;q=0.5 crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7 -crawler.http.maxFileSize=1048576 +crawler.http.maxFileSize=10485760 # ftp crawler specific settings; size in bytes -crawler.ftp.maxFileSize=1048576 +crawler.ftp.maxFileSize=10485760 # smb crawler specific settings: maximum size crawler.smb.maxFileSize=100000000 diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 50f0cf8e2..1c3ef1c0b 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -92,7 +92,6 @@ public class CrawlProfileEditor_p { labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN)); - labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN)); diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index d5e37280a..484f40da9 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -188,15 +188,16 @@ public class CrawlResults { try { urle =
sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0); if (urle == null) { - Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey()); + Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey()); urlstr = null; urltxt = null; metadata = null; - } else { - metadata = urle.metadata(); - urlstr = metadata.url().toNormalform(false, true); - urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL + continue; } + metadata = urle.metadata(); + urlstr = metadata.url().toNormalform(false, true); + urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL + initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().initiatorHash)); executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().executorHash)); diff --git a/htroot/CrawlStartIntranet_p.html b/htroot/CrawlStartIntranet_p.html index f1faa9b61..e85dafdad 100644 --- a/htroot/CrawlStartIntranet_p.html +++ b/htroot/CrawlStartIntranet_p.html @@ -36,7 +36,7 @@ #(/notintranet)# #(servertable)#:: -
+
diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index 153f752e8..6871dd735 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -45,8 +45,7 @@ empty - +
Link-List of URLLink-List of URL
0) { newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; @@ -203,7 +202,8 @@ public class Crawler_p { final boolean indexMedia = post.get("indexMedia", "off").equals("on"); env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); - final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); + boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); + if (crawlingStartURL.isFile() || crawlingStartURL.isSMB()) storeHTCache = false; env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); final String cachePolicyString = post.get("cachePolicy", "iffresh"); @@ -247,15 +247,21 @@ public class Crawler_p { // stack url sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it final CrawlProfile pe = new CrawlProfile( - (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), + (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(), crawlingStartURL, newcrawlingMustMatch, newcrawlingMustNotMatch, newcrawlingdepth, - crawlingIfOlder, crawlingDomMaxPages, + crawlingIfOlder, + crawlingDomMaxPages, crawlingQ, indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); + storeHTCache, + crawlOrder, + xsstopw, + xdstopw, + xpstopw, + cachePolicy); sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); final String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash.getBytes(), @@ -352,7 +358,8 @@ public class Crawler_p { final Map hyperlinks = scraper.getAnchors(); final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); final CrawlProfile profile = new CrawlProfile( - fileName, crawlURL, + fileName, + crawlURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, newcrawlingdepth, @@ -362,9 +369,10 @@ public class Crawler_p { indexText, indexMedia, storeHTCache, - true, crawlOrder, - xsstopw, xdstopw, xpstopw, + xsstopw, + xdstopw, + xpstopw, cachePolicy); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); @@ -405,15 +413,21 @@ public class Crawler_p { try { final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); final CrawlProfile pe = new CrawlProfile( - sitemapURLStr, sitemapURL, - newcrawlingMustMatch, + sitemapURLStr, + sitemapURL, + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, - newcrawlingdepth, - crawlingIfOlder, crawlingDomMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, - xsstopw, xdstopw, xpstopw, + 0, + crawlingIfOlder, + crawlingDomMaxPages, + true, + indexText, + indexMedia, + storeHTCache, + crawlOrder, + xsstopw, + xdstopw, + xpstopw, cachePolicy); sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe); @@ -431,7 +445,7 @@ public class Crawler_p { // download document ContentScraper scraper = null; scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH); - String title = scraper.getTitle(); + // String title = scraper.getTitle(); // String description = scraper.getDescription(); // get links and generate filter @@ -444,7 +458,7 @@ public class Crawler_p { // put links onto crawl queue final CrawlProfile profile = new CrawlProfile( - title == null || title.length() == 0 ? 
sitelistURL.getHost() : title, + sitelistURL.getHost(), sitelistURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, @@ -455,9 +469,10 @@ public class Crawler_p { indexText, indexMedia, storeHTCache, - true, crawlOrder, - xsstopw, xdstopw, xpstopw, + xsstopw, + xdstopw, + xpstopw, cachePolicy); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index c470db791..043c26159 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -157,7 +157,6 @@ public class QuickCrawlLink_p { indexText, indexMedia, storeHTCache, - true, remoteIndexing, xsstopw, xdstopw, diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index 32d974486..f9b3df8d3 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -36,6 +36,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Digest; @@ -111,7 +112,7 @@ public class SettingsAck_p { final serverCore theServerCore = (serverCore) env.getThread("10_httpd"); try { final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port); - final String hostName = theNewAddress.getHostName(); + final String hostName = Domains.getHostName(theNewAddress.getAddress()); prop.put("info_restart", "1"); prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName); prop.put("info_restart_port", theNewAddress.getPort()); diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index 89bc7ad8e..882c6601b 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -26,11 +26,13 @@ public class getpageinfo_p { prop.put("robots-allowed", "3"); //unknown prop.put("sitemap", ""); prop.put("favicon",""); + prop.put("sitelist", ""); + prop.put("filter", ".*"); // default actions String actions="title,robots"; - if(post!=null && post.containsKey("url")){ + if (post != null && post.containsKey("url")) { if(post.containsKey("actions")) actions=post.get("actions"); String url=post.get("url"); @@ -97,7 +99,7 @@ public class getpageinfo_p { prop.putXML("filter", filter.length() > 0 ? 
filter.substring(1) : ".*"); } } - if(actions.indexOf("robots")>=0){ + if (actions.indexOf("robots")>=0) { try { final DigestURI theURL = new DigestURI(url, null); diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js index b411f2261..ab7a72333 100644 --- a/htroot/js/IndexCreate.js +++ b/htroot/js/IndexCreate.js @@ -48,14 +48,14 @@ function handleResponse(){ sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue; } document.getElementsByName("sitemapURL")[0].value=sitemap; - document.getElementById("sitemap").disabled=false; + if (sitemap) document.getElementById("sitemap").disabled=false; } sitelist=""; if (response.getElementsByTagName("sitelist")[0].firstChild!=null){ sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue; } document.getElementById("sitelistURLs").innerHTML = sitelist; - document.getElementById("sitelist").disabled=false; + if (sitelist) document.getElementById("sitelist").disabled=false; // clear the ajax image document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 23e26fa9d..08c028c3b 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -53,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String INDEX_TEXT = "indexText"; public static final String INDEX_MEDIA = "indexMedia"; public static final String STORE_HTCACHE = "storeHTCache"; - public static final String STORE_TXCACHE = "storeTXCache"; public static final String REMOTE_INDEXING = "remoteIndexing"; public static final String XSSTOPW = "xsstopw"; public static final String XDSTOPW = "xdstopw"; @@ -64,17 +63,22 @@ public class CrawlProfile extends ConcurrentHashMap implements M private Pattern mustmatch = null, mustnotmatch = null; - public CrawlProfile(final String name, final DigestURI startURL, + public CrawlProfile( + final String name, + final DigestURI startURL, final String mustmatch, final String mustnotmatch, final int depth, final long recrawlIfOlder /*date*/, final int domMaxPages, final boolean crawlingQ, - final boolean indexText, final boolean indexMedia, - final boolean storeHTCache, final boolean storeTXCache, + final boolean indexText, + final boolean indexMedia, + final boolean storeHTCache, final boolean remoteIndexing, - final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, + final boolean xsstopw, + final boolean xdstopw, + final boolean xpstopw, final CacheStrategy cacheStrategy) { super(40); if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); @@ -91,7 +95,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(INDEX_TEXT, indexText); put(INDEX_MEDIA, indexMedia); put(STORE_HTCACHE, storeHTCache); - put(STORE_TXCACHE, storeTXCache); put(REMOTE_INDEXING, remoteIndexing); put(XSSTOPW, xsstopw); // exclude static stop-words put(XDSTOPW, xdstopw); // exclude dynamic stop-word @@ -218,11 +221,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - public boolean storeTXCache() { - final String r = get(STORE_TXCACHE); - if (r == null) return false; - return (r.equals(Boolean.TRUE.toString())); - } public boolean remoteIndexing() { final String r = get(REMOTE_INDEXING); if (r == null) return false; diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java 
b/source/de/anomic/crawler/CrawlSwitchboard.java index f90b0f40b..025369ba4 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -170,7 +170,7 @@ public final class CrawlSwitchboard { CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, - true, true, + true, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile); @@ -178,38 +178,38 @@ public final class CrawlSwitchboard { if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + -1, -1, true, true, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); 
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile); } } diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index ca2e9d70e..27bbac4cc 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -159,6 +159,10 @@ public final class ResultURLs { return resultDomains.get(stack); } + public void clearStacks() { + for (EventOrigin origin: EventOrigin.values()) clearStack(origin); + } + public synchronized void clearStack(final EventOrigin stack) { final Map resultStack = getStack(stack); if (resultStack != null) resultStack.clear(); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 9d26b262b..c0bb1ef8f 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -118,7 +118,11 @@ public final class HTTPLoader { // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true - if (responseBody != null && (code == 200 || code == 203)) { + if (responseBody == null) { + // no response, reject file + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)"); + throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + } else if (code == 200 || code == 203) { // the transfer is ok // we write the new cache entry to file system directly @@ -180,7 +184,7 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")"); + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code); throw new IOException("REJECTED WRONG STATUS TYPE '" 
+ client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } return response; diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 09528623d..a8f5f01c8 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -858,6 +858,9 @@ public final class Switchboard extends serverSwitch { this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES"); this.networkRoot.mkdirs(); this.queuesRoot.mkdirs(); + + // clear statistic data + this.crawlResults.clearStacks(); // relocate this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 685097b01..10dbdb5de 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -225,22 +225,9 @@ public class MultiProtocolURI implements Serializable, Comparable 0 && h.charAt(0) == '/') { diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java index c42f76371..f415a930b 100644 --- a/source/net/yacy/cora/protocol/Domains.java +++ b/source/net/yacy/cora/protocol/Domains.java @@ -23,11 +23,20 @@ package net.yacy.cora.protocol; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.regex.Pattern; import net.yacy.cora.storage.ARC; @@ -454,6 +463,80 @@ public class Domains { return false; } + public static String getHostName(final InetAddress i) { + Collection hosts = nameCacheHit.getKeys(i); + if (hosts.size() > 0) return hosts.iterator().next(); + + // call i.getHostName() using concurrency to interrupt execution in case of a time-out + final Callable callable = new Callable() { + public String call() { return i.getHostName(); } + }; + ExecutorService service = Executors.newSingleThreadExecutor(); + final Future taskFuture = service.submit(callable); + Runnable t = new Runnable() { + public void run() { taskFuture.cancel(true); } + }; + service.execute(t); + service.shutdown(); + try { + return taskFuture.get(500, TimeUnit.MILLISECONDS); + } catch (CancellationException e) { + // callable was interrupted + return i.getHostAddress(); + } catch (InterruptedException e) { + // service was shutdown + return i.getHostAddress(); + } catch(ExecutionException e) { + // callable failed unexpectedly + return i.getHostAddress(); + } catch (TimeoutException e) { + // time-out + return i.getHostAddress(); + } + } + + public static InetAddress dnsResolve(final String hostx) { + if ((hostx == null) || (hostx.length() == 0)) return null; + final String host = hostx.toLowerCase().trim(); + // try to simply parse the address + InetAddress ip = parseInetAddress(host); + if (ip != null) return ip; + + // try to resolve host by doing a name cache lookup + ip = nameCacheHit.get(host); + 
if (ip != null) return ip; + + if (nameCacheMiss.containsKey(host)) return null; + + // call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out + final Callable callable = new Callable() { + public InetAddress call() { return dnsResolveNetBased(host); } + }; + ExecutorService service = Executors.newSingleThreadExecutor(); + final Future taskFuture = service.submit(callable); + Runnable t = new Runnable() { + public void run() { taskFuture.cancel(true); } + }; + service.execute(t); + service.shutdown(); + try { + return taskFuture.get(500, TimeUnit.MILLISECONDS); + } catch (CancellationException e) { + // callable was interrupted + return null; + } catch (InterruptedException e) { + // service was shutdown + return null; + } catch(ExecutionException e) { + // callable failed unexpectedly + return null; + } catch (TimeoutException e) { + // time-out + return null; + } + } + + private static final InetAddress parseInetAddress(final String ip) { if (ip == null) return null; if (ip.length() < 8) return null; @@ -474,33 +557,21 @@ public class Domains { return null; } } - - public static InetAddress dnsResolve(String host) { - if ((host == null) || (host.length() == 0)) return null; - host = host.toLowerCase().trim(); - // try to simply parse the address - InetAddress ip = parseInetAddress(host); - if (ip != null) return ip; - - // try to resolve host by doing a name cache lookup - ip = nameCacheHit.get(host); - if (ip != null) return ip; - - if (nameCacheMiss.containsKey(host)) return null; - //System.out.println("***DEBUG dnsResolve(" + host + ")"); + + private static InetAddress dnsResolveNetBased(String host) { try { boolean doCaching = true; - ip = InetAddress.getByName(host); // this makes the DNS request to backbone + InetAddress ip = InetAddress.getByName(host); // this makes the DNS request to backbone if ((ip == null) || (ip.isLoopbackAddress()) || (nameCacheNoCachingList.containsKey(host)) ) { doCaching = false; } else { - if (matchesList(host, nameCacheNoCachingPatterns)) { - nameCacheNoCachingList.put(host, PRESENT); + if (matchesList(host, nameCacheNoCachingPatterns)) { + nameCacheNoCachingList.put(host, PRESENT); doCaching = false; - } + } } if (doCaching && ip != null) { @@ -519,6 +590,7 @@ public class Domains { return null; } + /** * Returns the number of entries in the nameCacheHit map * @@ -565,7 +637,7 @@ public class Domains { public void run() { String lhn = localHostName; try { - lhn = InetAddress.getLocalHost().getHostName(); + lhn = getHostName(InetAddress.getLocalHost()); } catch (UnknownHostException e) {} try { localHostAddresses = InetAddress.getAllByName(lhn); @@ -656,7 +728,8 @@ public class Domains { // finally check if there are other local IP addresses that are not in // the standard IP range for (int i = 0; i < localHostAddresses.length; i++) { - if (localHostAddresses[i].getHostName().equals(host)) return true; + String hostname = getHostName(localHostAddresses[i]); + if (hostname != null && hostname.equals(host)) return true; if (localHostAddresses[i].getHostAddress().equals(host)) return true; } diff --git a/source/net/yacy/cora/protocol/Scanner.java b/source/net/yacy/cora/protocol/Scanner.java index d37088f4b..578068044 100644 --- a/source/net/yacy/cora/protocol/Scanner.java +++ b/source/net/yacy/cora/protocol/Scanner.java @@ -121,7 +121,8 @@ public class Scanner extends Thread { private void addProtocol(String protocol, boolean bigrange) { for (InetAddress i: genlist(bigrange)) { try { - this.scanqueue.put(new 
MultiProtocolURI(protocol + "://" + i.getHostName() + "/")); + + this.scanqueue.put(new MultiProtocolURI(protocol + "://" + Domains.getHostName(i) + "/")); } catch (MalformedURLException e) { Log.logException(e); } catch (InterruptedException e) { diff --git a/source/net/yacy/cora/storage/ARC.java b/source/net/yacy/cora/storage/ARC.java index 47aebc939..d0cec8065 100644 --- a/source/net/yacy/cora/storage/ARC.java +++ b/source/net/yacy/cora/storage/ARC.java @@ -21,6 +21,7 @@ package net.yacy.cora.storage; +import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.Set; @@ -62,14 +63,21 @@ public interface ARC extends Iterable> { * @return the value */ public V get(K s); + + /** + * check if the map contains the value + * @param value + * @return the keys that have the given value + */ + public Collection getKeys(V value); /** * check if the map contains the key - * @param s - * @return + * @param key + * @return true if the map contains the key */ - public boolean containsKey(K s); - + public boolean containsKey(K key); + /** * remove an entry from the cache * @param s diff --git a/source/net/yacy/cora/storage/ConcurrentARC.java b/source/net/yacy/cora/storage/ConcurrentARC.java index d339f0d0a..741da71bb 100644 --- a/source/net/yacy/cora/storage/ConcurrentARC.java +++ b/source/net/yacy/cora/storage/ConcurrentARC.java @@ -21,6 +21,8 @@ package net.yacy.cora.storage; import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; @@ -105,6 +107,17 @@ public final class ConcurrentARC extends AbstractMap implements Map< return this.arc[getPartition(s)].get((K) s); } + /** + * check if the map contains the value + * @param value + * @return the keys that have the given value + */ + public Collection getKeys(V value) { + ArrayList keys = new ArrayList(); + for (int i = 0; i < this.arc.length; i++) keys.addAll(this.arc[i].getKeys(value)); + return keys; + } + /** * check if the map contains the key * @param s diff --git a/source/net/yacy/cora/storage/SimpleARC.java b/source/net/yacy/cora/storage/SimpleARC.java index 7e5af9c18..9552d2200 100644 --- a/source/net/yacy/cora/storage/SimpleARC.java +++ b/source/net/yacy/cora/storage/SimpleARC.java @@ -22,6 +22,8 @@ package net.yacy.cora.storage; import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.Map; @@ -98,6 +100,26 @@ abstract class SimpleARC extends AbstractMap implements Map, I assert (this.levelB.size() <= cacheSize); // the cache should shrink automatically return v; } + + /** + * check if the map contains the value + * @param value + * @return the keys that have the given value + */ + public Collection getKeys(V value) { + ArrayList keys = new ArrayList(); + synchronized (this.levelB) { + for (Map.Entry entry: this.levelB.entrySet()) { + if (value.equals(entry.getValue())) keys.add(entry.getKey()); + } + } + synchronized (this) { + for (Map.Entry entry: this.levelA.entrySet()) { + if (value.equals(entry.getValue())) keys.add(entry.getKey()); + } + } + return keys; + } /** * check if the map contains the key diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java index 4a35b2d7f..5794f5ab2 100644 --- a/source/net/yacy/kelondro/table/Table.java +++ b/source/net/yacy/kelondro/table/Table.java @@ -305,6 +305,7 @@ public class Table implements Index, Iterable { 
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size(); } final HashMap map = new HashMap(8); + if (index == null) return map; // possibly closed or being closed map.put("tableSize", Integer.toString(index.size())); map.put("tableKeyChunkSize", Integer.toString(index.row().objectsize)); map.put("tableKeyMem", Integer.toString(index.row().objectsize * index.size()));
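
For reference, the core new technique in this patch is the time-out wrapper around the blocking name-service calls in source/net/yacy/cora/protocol/Domains.java (dnsResolve and getHostName). The sketch below shows the underlying pattern in isolation; the class and method names are illustrative only and are not part of the patch, and the 500 ms deadline simply mirrors the value used in the patch.

import java.net.InetAddress;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class TimeoutResolveSketch {

    // Resolve a host name, but never block the caller for more than 500 ms.
    // The blocking InetAddress.getByName() call runs on a throw-away worker
    // thread; if it does not finish in time the caller gets null and the
    // worker is interrupted and abandoned.
    public static InetAddress resolveWithTimeout(final String host) {
        final ExecutorService service = Executors.newSingleThreadExecutor();
        final Future<InetAddress> task = service.submit(new Callable<InetAddress>() {
            public InetAddress call() throws Exception {
                return InetAddress.getByName(host); // may hang on a slow or dead DNS server
            }
        });
        service.shutdown(); // accept no new tasks; the submitted one still runs
        try {
            return task.get(500, TimeUnit.MILLISECONDS);
        } catch (TimeoutException e) {
            return null; // deadline exceeded
        } catch (InterruptedException e) {
            return null; // caller was interrupted while waiting
        } catch (ExecutionException e) {
            return null; // lookup failed, e.g. UnknownHostException
        } finally {
            task.cancel(true);     // interrupt the lookup if it is still running
            service.shutdownNow(); // release the worker thread
        }
    }

    public static void main(final String[] args) {
        System.out.println(resolveWithTimeout(args.length > 0 ? args[0] : "localhost"));
    }
}

Running the lookup on a throw-away single-thread executor keeps a stalled DNS query from blocking crawler threads; the cost is one short-lived worker thread per uncached lookup.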
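
The patch also extends the ARC cache interface (ARC, ConcurrentARC, SimpleARC) with getKeys(value), which lets Domains.getHostName answer reverse lookups from the existing forward cache (host name to address) before falling back to the network. The following stand-alone sketch illustrates that reverse-lookup idea; it assumes a plain ConcurrentHashMap in place of YaCy's ARC classes, so names and types here are illustrative only.

import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class ReverseLookupSketch {

    // forward cache: host name -> textual IP address (stand-in for YaCy's nameCacheHit ARC)
    private static final Map<String, String> nameCache = new ConcurrentHashMap<String, String>();

    // Return all cached host names that map to the given address (possibly empty).
    public static Collection<String> getKeys(final String address) {
        final ArrayList<String> keys = new ArrayList<String>();
        for (final Map.Entry<String, String> entry : nameCache.entrySet()) {
            if (address.equals(entry.getValue())) keys.add(entry.getKey());
        }
        return keys;
    }

    public static void main(final String[] args) {
        nameCache.put("example.org", "192.0.2.10");
        nameCache.put("www.example.org", "192.0.2.10");
        // a reverse lookup that never touches the network when the forward cache already knows the answer
        System.out.println(getKeys("192.0.2.10")); // both host names, order not guaranteed
    }
}

The scan is linear in the cache size, which is acceptable here because the ARC caches are bounded; for a large or unbounded map a second, inverted index would be the better choice.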