fixed a number of small bugs:

- better crawl start for file paths and SMB paths
- added a time-out wrapper for DNS resolving and reverse resolving to prevent blocking (sketched after this list)
- fixed intranet scanner result list checkboxes
- prevented HTCache usage for file and SMB crawling (not necessary, since the documents are available locally)
- fixed the RSS feed loader
- fixed the sitemap loader, which had not been restricted to single files (crawl depth must be zero)
- clearing of crawl result lists when a network switch is performed
- higher maximum file size for the crawler
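The time-out wrapper mentioned above follows the usual Callable/Future pattern: the potentially blocking lookup runs on its own executor thread and the caller waits a bounded time for the result (the change in net.yacy.cora.protocol.Domains below uses 500 ms). A minimal, self-contained sketch of that pattern, with class and method names that are illustrative only and not part of the YaCy API:

    import java.net.InetAddress;
    import java.net.UnknownHostException;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;

    // Hypothetical helper; the real implementation lives in net.yacy.cora.protocol.Domains.
    public class DnsTimeoutSketch {

        // Resolve a host name but give up after timeoutMillis; returns null on time-out or failure.
        public static InetAddress resolveWithTimeout(final String host, final long timeoutMillis) {
            final ExecutorService service = Executors.newSingleThreadExecutor();
            final Future<InetAddress> task = service.submit(new Callable<InetAddress>() {
                public InetAddress call() throws UnknownHostException {
                    // this call can block for a long time when the DNS server does not answer
                    return InetAddress.getByName(host);
                }
            });
            service.shutdown();
            try {
                return task.get(timeoutMillis, TimeUnit.MILLISECONDS);
            } catch (final Exception e) {
                task.cancel(true); // interrupt the blocked lookup thread
                return null;       // caller treats the host as not resolvable
            }
        }

        public static void main(final String[] args) {
            System.out.println(resolveWithTimeout("example.org", 500));
        }
    }

Returning null (or, in the reverse-lookup case, the raw host address) on time-out lets the crawler continue instead of hanging on an unresponsive DNS server.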

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7214 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent f6eebb6f99
commit 2c549ae341

@ -674,10 +674,10 @@ crawler.clientTimeout=9000
crawler.http.acceptEncoding=gzip
crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.http.maxFileSize=1048576
crawler.http.maxFileSize=10485760
# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=1048576
crawler.ftp.maxFileSize=10485760
# smb crawler specific settings: maximum size
crawler.smb.maxFileSize=100000000

@ -92,7 +92,6 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_MEDIA, "Index Media", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_HTCACHE, "Store in HTCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.STORE_TXCACHE, "Store in TXCache", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.REMOTE_INDEXING, "Remote Indexing", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XSSTOPW, "Static stop-words", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.XDSTOPW, "Dynamic stop-words", false, eentry.BOOLEAN));

@ -188,15 +188,16 @@ public class CrawlResults {
try {
urle = sb.indexSegments.urlMetadata(Segments.Process.LOCALCRAWLING).load(entry.getKey().getBytes(), null, 0);
if (urle == null) {
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash "+ entry.getKey());
Log.logWarning("PLASMA", "CrawlResults: URL not in index with url hash " + entry.getKey());
urlstr = null;
urltxt = null;
metadata = null;
} else {
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
continue;
}
metadata = urle.metadata();
urlstr = metadata.url().toNormalform(false, true);
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
initiatorSeed = entry.getValue() == null || entry.getValue().initiatorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().initiatorHash));
executorSeed = entry.getValue() == null || entry.getValue().executorHash == null ? null : sb.peers.getConnected(new String(entry.getValue().executorHash));

@ -36,7 +36,7 @@
#(/notintranet)#
#(servertable)#::
<form id="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<form id="servertable" name="servertable" action="CrawlStartIntranet_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" ><fieldset>
<legend><label for="servertable">Available Intranet Server</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">

@ -45,8 +45,7 @@
<span id="robotsOK"></span><img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Link-List of URL</td>
<td><input type="radio" name="crawlingMode" id="sitelist" value="sitelist" disabled="disabled" />Link-List of URL</td>
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"

@ -138,14 +138,13 @@ public class Crawler_p {
final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
// set the crawl filter
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = crawlingStartURL.isFile() ? "file:///.*" : crawlingStartURL.isSMB() ? "smb://.*" : ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
@ -203,7 +202,8 @@ public class Crawler_p {
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
if (crawlingStartURL.isFile() || crawlingStartURL.isSMB()) storeHTCache = false;
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
@ -247,15 +247,21 @@ public class Crawler_p {
// stack url
sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile pe = new CrawlProfile(
(crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
(crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(),
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
@ -352,7 +358,8 @@ public class Crawler_p {
final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
final CrawlProfile profile = new CrawlProfile(
fileName, crawlURL,
fileName,
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
@ -362,9 +369,10 @@ public class Crawler_p {
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@ -405,15 +413,21 @@ public class Crawler_p {
try {
final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
final CrawlProfile pe = new CrawlProfile(
sitemapURLStr, sitemapURL,
newcrawlingMustMatch,
sitemapURLStr,
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
0,
crawlingIfOlder,
crawlingDomMaxPages,
true,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
@ -431,7 +445,7 @@ public class Crawler_p {
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
String title = scraper.getTitle();
// String title = scraper.getTitle();
// String description = scraper.getDescription();
// get links and generate filter
@ -444,7 +458,7 @@ public class Crawler_p {
// put links onto crawl queue
final CrawlProfile profile = new CrawlProfile(
title == null || title.length() == 0 ? sitelistURL.getHost() : title,
sitelistURL.getHost(),
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
@ -455,9 +469,10 @@ public class Crawler_p {
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

@ -157,7 +157,6 @@ public class QuickCrawlLink_p {
indexText,
indexMedia,
storeHTCache,
true,
remoteIndexing,
xsstopw,
xdstopw,

@ -36,6 +36,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Digest;
@ -111,7 +112,7 @@ public class SettingsAck_p {
final serverCore theServerCore = (serverCore) env.getThread("10_httpd");
try {
final InetSocketAddress theNewAddress = theServerCore.generateSocketAddress(port);
final String hostName = theNewAddress.getHostName();
final String hostName = Domains.getHostName(theNewAddress.getAddress());
prop.put("info_restart", "1");
prop.put("info_restart_ip",(hostName.equals("0.0.0.0"))? "localhost" : hostName);
prop.put("info_restart_port", theNewAddress.getPort());

@ -26,11 +26,13 @@ public class getpageinfo_p {
prop.put("robots-allowed", "3"); //unknown
prop.put("sitemap", "");
prop.put("favicon","");
prop.put("sitelist", "");
prop.put("filter", ".*");
// default actions
String actions="title,robots";
if(post!=null && post.containsKey("url")){
if (post != null && post.containsKey("url")) {
if(post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
@ -97,7 +99,7 @@ public class getpageinfo_p {
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
if(actions.indexOf("robots")>=0){
if (actions.indexOf("robots")>=0) {
try {
final DigestURI theURL = new DigestURI(url, null);

@ -48,14 +48,14 @@ function handleResponse(){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;
if (sitemap) document.getElementById("sitemap").disabled=false;
}
sitelist="";
if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
}
document.getElementById("sitelistURLs").innerHTML = sitelist;
document.getElementById("sitelist").disabled=false;
if (sitelist) document.getElementById("sitelist").disabled=false;
// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);

@ -53,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
public static final String STORE_TXCACHE = "storeTXCache";
public static final String REMOTE_INDEXING = "remoteIndexing";
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
@ -64,17 +63,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern mustmatch = null, mustnotmatch = null;
public CrawlProfile(final String name, final DigestURI startURL,
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean indexText,
final boolean indexMedia,
final boolean storeHTCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final boolean xsstopw,
final boolean xdstopw,
final boolean xpstopw,
final CacheStrategy cacheStrategy) {
super(40);
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
@ -91,7 +95,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
put(STORE_TXCACHE, storeTXCache);
put(REMOTE_INDEXING, remoteIndexing);
put(XSSTOPW, xsstopw); // exclude static stop-words
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
@ -218,11 +221,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeTXCache() {
final String r = get(STORE_TXCACHE);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean remoteIndexing() {
final String r = get(REMOTE_INDEXING);
if (r == null) return false;

@ -170,7 +170,7 @@ public final class CrawlSwitchboard {
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultProxyProfile.handle().getBytes(), this.defaultProxyProfile);
@ -178,38 +178,38 @@ public final class CrawlSwitchboard {
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
-1, -1, true, true, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
}
}

@ -159,6 +159,10 @@ public final class ResultURLs {
return resultDomains.get(stack);
}
public void clearStacks() {
for (EventOrigin origin: EventOrigin.values()) clearStack(origin);
}
public synchronized void clearStack(final EventOrigin stack) {
final Map<String, InitExecEntry> resultStack = getStack(stack);
if (resultStack != null) resultStack.clear();

@ -118,7 +118,11 @@ public final class HTTPLoader {
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
if (responseBody != null && (code == 200 || code == 203)) {
if (responseBody == null) {
// no response, reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no response body (you may increase the maxmimum file size)");
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
} else if (code == 200 || code == 203) {
// the transfer is ok
// we write the new cache entry to file system directly
@ -180,7 +184,7 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")");
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
return response;

@ -858,6 +858,9 @@ public final class Switchboard extends serverSwitch {
this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
this.networkRoot.mkdirs();
this.queuesRoot.mkdirs();
// clear statistic data
this.crawlResults.clearStacks();
// relocate
this.crawlQueues.relocate(this.queuesRoot); // cannot be closed because the busy threads are working with that object

@ -225,22 +225,9 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// parse file url
String h = url.substring(p + 1);
if (h.startsWith("//")) {
// host may be given, but may be also empty
final int q = h.indexOf('/', 2);
if (q <= 0) {
// no host given
host = null;
path = h.substring(2);
} else {
host = h.substring(2, q);
if (host.length() == 0 || host.equals("localhost")) host = null;
h = h.substring(q);
char c = h.charAt(2);
if (c == ':' || c == '|')
path = h.substring(1);
else
path = h;
}
// no host given
host = null;
path = h.substring(2);
} else {
host = null;
if (h.length() > 0 && h.charAt(0) == '/') {

@ -23,11 +23,20 @@ package net.yacy.cora.protocol;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.regex.Pattern;
import net.yacy.cora.storage.ARC;
@ -454,6 +463,80 @@ public class Domains {
return false;
}
public static String getHostName(final InetAddress i) {
Collection<String> hosts = nameCacheHit.getKeys(i);
if (hosts.size() > 0) return hosts.iterator().next();
// call i.getHostName() using concurrency to interrupt execution in case of a time-out
final Callable<String> callable = new Callable<String>() {
public String call() { return i.getHostName(); }
};
ExecutorService service = Executors.newSingleThreadExecutor();
final Future<String> taskFuture = service.submit(callable);
Runnable t = new Runnable() {
public void run() { taskFuture.cancel(true); }
};
service.execute(t);
service.shutdown();
try {
return taskFuture.get(500, TimeUnit.MILLISECONDS);
} catch (CancellationException e) {
// callable was interrupted
return i.getHostAddress();
} catch (InterruptedException e) {
// service was shutdown
return i.getHostAddress();
} catch(ExecutionException e) {
// callable failed unexpectedly
return i.getHostAddress();
} catch (TimeoutException e) {
// time-out
return i.getHostAddress();
}
}
public static InetAddress dnsResolve(final String hostx) {
if ((hostx == null) || (hostx.length() == 0)) return null;
final String host = hostx.toLowerCase().trim();
// try to simply parse the address
InetAddress ip = parseInetAddress(host);
if (ip != null) return ip;
// try to resolve host by doing a name cache lookup
ip = nameCacheHit.get(host);
if (ip != null) return ip;
if (nameCacheMiss.containsKey(host)) return null;
// call dnsResolveNetBased(host) using concurrency to interrupt execution in case of a time-out
final Callable<InetAddress> callable = new Callable<InetAddress>() {
public InetAddress call() { return dnsResolveNetBased(host); }
};
ExecutorService service = Executors.newSingleThreadExecutor();
final Future<InetAddress> taskFuture = service.submit(callable);
Runnable t = new Runnable() {
public void run() { taskFuture.cancel(true); }
};
service.execute(t);
service.shutdown();
try {
return taskFuture.get(500, TimeUnit.MILLISECONDS);
} catch (CancellationException e) {
// callable was interrupted
return null;
} catch (InterruptedException e) {
// service was shutdown
return null;
} catch(ExecutionException e) {
// callable failed unexpectedly
return null;
} catch (TimeoutException e) {
// time-out
return null;
}
}
private static final InetAddress parseInetAddress(final String ip) {
if (ip == null) return null;
if (ip.length() < 8) return null;
@ -474,33 +557,21 @@ public class Domains {
return null;
}
}
public static InetAddress dnsResolve(String host) {
if ((host == null) || (host.length() == 0)) return null;
host = host.toLowerCase().trim();
// try to simply parse the address
InetAddress ip = parseInetAddress(host);
if (ip != null) return ip;
// try to resolve host by doing a name cache lookup
ip = nameCacheHit.get(host);
if (ip != null) return ip;
if (nameCacheMiss.containsKey(host)) return null;
//System.out.println("***DEBUG dnsResolve(" + host + ")");
private static InetAddress dnsResolveNetBased(String host) {
try {
boolean doCaching = true;
ip = InetAddress.getByName(host); // this makes the DNS request to backbone
InetAddress ip = InetAddress.getByName(host); // this makes the DNS request to backbone
if ((ip == null) ||
(ip.isLoopbackAddress()) ||
(nameCacheNoCachingList.containsKey(host))
) {
doCaching = false;
} else {
if (matchesList(host, nameCacheNoCachingPatterns)) {
nameCacheNoCachingList.put(host, PRESENT);
if (matchesList(host, nameCacheNoCachingPatterns)) {
nameCacheNoCachingList.put(host, PRESENT);
doCaching = false;
}
}
}
if (doCaching && ip != null) {
@ -519,6 +590,7 @@ public class Domains {
return null;
}
/**
* Returns the number of entries in the nameCacheHit map
*
@ -565,7 +637,7 @@ public class Domains {
public void run() {
String lhn = localHostName;
try {
lhn = InetAddress.getLocalHost().getHostName();
lhn = getHostName(InetAddress.getLocalHost());
} catch (UnknownHostException e) {}
try {
localHostAddresses = InetAddress.getAllByName(lhn);
@ -656,7 +728,8 @@ public class Domains {
// finally check if there are other local IP addresses that are not in
// the standard IP range
for (int i = 0; i < localHostAddresses.length; i++) {
if (localHostAddresses[i].getHostName().equals(host)) return true;
String hostname = getHostName(localHostAddresses[i]);
if (hostname != null && hostname.equals(host)) return true;
if (localHostAddresses[i].getHostAddress().equals(host)) return true;
}

@ -121,7 +121,8 @@ public class Scanner extends Thread {
private void addProtocol(String protocol, boolean bigrange) {
for (InetAddress i: genlist(bigrange)) {
try {
this.scanqueue.put(new MultiProtocolURI(protocol + "://" + i.getHostName() + "/"));
this.scanqueue.put(new MultiProtocolURI(protocol + "://" + Domains.getHostName(i) + "/"));
} catch (MalformedURLException e) {
Log.logException(e);
} catch (InterruptedException e) {

@ -21,6 +21,7 @@
package net.yacy.cora.storage;
import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -62,14 +63,21 @@ public interface ARC<K, V> extends Iterable<Map.Entry<K, V>> {
* @return the value
*/
public V get(K s);
/**
* check if the map contains the value
* @param value
* @return the keys that have the given value
*/
public Collection<K> getKeys(V value);
/**
* check if the map contains the key
* @param s
* @return
* @param key
* @return true if the map contains the key
*/
public boolean containsKey(K s);
public boolean containsKey(K key);
/**
* remove an entry from the cache
* @param s

@ -21,6 +21,8 @@
package net.yacy.cora.storage;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
@ -105,6 +107,17 @@ public final class ConcurrentARC<K, V> extends AbstractMap<K, V> implements Map<
return this.arc[getPartition(s)].get((K) s);
}
/**
* check if the map contains the value
* @param value
* @return the keys that have the given value
*/
public Collection<K> getKeys(V value) {
ArrayList<K> keys = new ArrayList<K>();
for (int i = 0; i < this.arc.length; i++) keys.addAll(this.arc[i].getKeys(value));
return keys;
}
/**
* check if the map contains the key
* @param s

@ -22,6 +22,8 @@
package net.yacy.cora.storage;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
@ -98,6 +100,26 @@ abstract class SimpleARC<K, V> extends AbstractMap<K, V> implements Map<K, V>, I
assert (this.levelB.size() <= cacheSize); // the cache should shrink automatically
return v;
}
/**
* check if the map contains the value
* @param value
* @return the keys that have the given value
*/
public Collection<K> getKeys(V value) {
ArrayList<K> keys = new ArrayList<K>();
synchronized (this.levelB) {
for (Map.Entry<K, V> entry: this.levelB.entrySet()) {
if (value.equals(entry.getValue())) keys.add(entry.getKey());
}
}
synchronized (this) {
for (Map.Entry<K, V> entry: this.levelA.entrySet()) {
if (value.equals(entry.getValue())) keys.add(entry.getKey());
}
}
return keys;
}
/**
* check if the map contains the key

@ -305,6 +305,7 @@ public class Table implements Index, Iterable<Row.Entry> {
assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
}
final HashMap<String, String> map = new HashMap<String, String>(8);
if (index == null) return map; // possibly closed or being closed
map.put("tableSize", Integer.toString(index.size()));
map.put("tableKeyChunkSize", Integer.toString(index.row().objectsize));
map.put("tableKeyMem", Integer.toString(index.row().objectsize * index.size()));
