diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 15565bb08..c9bf36f14 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -351,31 +351,28 @@ public final class CrawlStacker { * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted */ public String urlInAcceptedDomain(final DigestURI url) { - // returns true if the url can be accepted accoring to network.unit.domain + // returns true if the url can be accepted according to network.unit.domain if (url == null) return "url is null"; + final boolean local = url.isLocal(); + if (this.acceptLocalURLs && local) return null; + if (this.acceptGlobalURLs && !local) return null; final String host = url.getHost(); - if (this.acceptLocalURLs && host == null && url.getProtocol().equals("file")) return null; if (host == null) return "url.host is null"; - if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve // check if this is a local address and we are allowed to index local pages: //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); - final boolean local = url.isLocal(); //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! - if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; return (local) ? ("the host '" + host + "' is local, but local addresses are not accepted") : ("the host '" + host + "' is global, but global addresses are not accepted"); } public String urlInAcceptedDomainHash(final byte[] urlhash) { - // returns true if the url can be accepted accoring to network.unit.domain + // returns true if the url can be accepted according to network.unit.domain if (urlhash == null) return "url is null"; - if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve // check if this is a local address and we are allowed to index local pages: - //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); final boolean local = DigestURI.isLocal(urlhash); - //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! - if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; + if (this.acceptLocalURLs && local) return null; + if (this.acceptGlobalURLs && !local) return null; return (local) ? ("the urlhash '" + new String(urlhash) + "' is local, but local addresses are not accepted") : ("the urlhash '" + new String(urlhash) + "' is global, but global addresses are not accepted"); diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java index 50a357b23..7b7f6098b 100644 --- a/source/de/anomic/crawler/Latency.java +++ b/source/de/anomic/crawler/Latency.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.kelondro.util.Domains; import de.anomic.search.Switchboard; @@ -123,7 +124,7 @@ public class Latency { final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); // find the minimum waiting time based on the network domain (local or global) - final boolean local = MultiProtocolURI.isLocal(hostname); + final boolean local = Domains.isLocal(hostname); long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; // if we have accessed the domain many times, get slower (the flux factor) diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 1302e34e0..57e4a09e6 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -161,7 +161,7 @@ public final class yacyClient { // send request final long start = System.currentTimeMillis(); // final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 10000, false); - final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 10000); + final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 30000); yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds"); result = FileUtils.table(content); } catch (final Exception e) { diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index ace4d1c20..4553fe954 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -39,6 +39,7 @@ import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; import net.yacy.cora.document.Punycode.PunycodeException; +import net.yacy.kelondro.util.Domains; /** * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file @@ -776,11 +777,7 @@ public class MultiProtocolURI implements Serializable { // checks for local/global IP range and local IP public boolean isLocal() { - return isLocal(this.host); - } - - public static boolean isLocal(String host) { - return host.startsWith("127.") || host.equals("localhost") || host.startsWith("0:0:0:0:0:0:0:1"); + return this.protocol.equals("file") || this.protocol.equals("smb") || Domains.isLocal(this.host); } // language calculation diff --git a/source/net/yacy/kelondro/index/ARC.java b/source/net/yacy/kelondro/index/ARC.java index eaaa001c8..757469663 100644 --- a/source/net/yacy/kelondro/index/ARC.java +++ b/source/net/yacy/kelondro/index/ARC.java @@ -36,6 +36,12 @@ package net.yacy.kelondro.index; public interface ARC { + /** + * get the size of the ARC. this returns the sum of main and ghost cache + * @return the complete number of entries in the ARC cache + */ + public int size(); + /** * put a value to the cache. * @param s diff --git a/source/net/yacy/kelondro/index/ConcurrentARC.java b/source/net/yacy/kelondro/index/ConcurrentARC.java index 36aa10f99..7c939662f 100644 --- a/source/net/yacy/kelondro/index/ConcurrentARC.java +++ b/source/net/yacy/kelondro/index/ConcurrentARC.java @@ -91,4 +91,11 @@ public final class ConcurrentARC implements ARC { public final void clear() { for (ARC a: this.arc) a.clear(); } + + @Override + public final int size() { + int s = 0; + for (ARC a: this.arc) s += a.size(); + return s; + } } diff --git a/source/net/yacy/kelondro/index/SimpleARC.java b/source/net/yacy/kelondro/index/SimpleARC.java index 49d9db217..671e340e5 100644 --- a/source/net/yacy/kelondro/index/SimpleARC.java +++ b/source/net/yacy/kelondro/index/SimpleARC.java @@ -120,4 +120,13 @@ public final class SimpleARC implements ARC { this.levelA.clear(); this.levelB.clear(); } + + /** + * get the size of the ARC. this returns the sum of main and ghost cache + * @return the complete number of entries in the ARC cache + */ + @Override + public final synchronized int size() { + return this.levelA.size() + this.levelB.size(); + } } diff --git a/source/net/yacy/kelondro/util/Domains.java b/source/net/yacy/kelondro/util/Domains.java index 7fa23e91f..2b85d3817 100644 --- a/source/net/yacy/kelondro/util/Domains.java +++ b/source/net/yacy/kelondro/util/Domains.java @@ -29,26 +29,30 @@ import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collections; -import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; +import net.yacy.kelondro.index.ARC; +import net.yacy.kelondro.index.ConcurrentARC; + public class Domains { + private static final String PRESENT = ""; private static final String localPatterns = "10\\..*,127.*,172.(1[6-9]|2[0-9]|3[0-1])\\..*,169.254.*,192.168.*,localhost"; + private static final int maxNameCacheHitSize = 20000; + private static final int maxNameCacheMissSize = 20000; + private static final int maxNameNoCachingListSize = 20000; + private static final int concurrencyLevel = Runtime.getRuntime().availableProcessors() + 1; // a dns cache - private static final Map nameCacheHit = new ConcurrentHashMap(); // a not-synchronized map resulted in deadlocks - private static final Set nameCacheMiss = Collections.synchronizedSet(new HashSet()); - private static final int maxNameCacheHitSize = 8000; - private static final int maxNameCacheMissSize = 8000; + private static final ARC nameCacheHit = new ConcurrentARC(maxNameCacheHitSize, concurrencyLevel); + private static final ARC nameCacheMiss = new ConcurrentARC(maxNameCacheMissSize, concurrencyLevel); + private static final ARC nameCacheNoCachingList = new ConcurrentARC(maxNameNoCachingListSize, concurrencyLevel); public static List nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList()); public static final List localhostPatterns = makePatterns(localPatterns); - private static final Set nameCacheNoCachingList = Collections.synchronizedSet(new HashSet()); /** * ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! ! @@ -413,7 +417,7 @@ public class Domains { final InetAddress ip = nameCacheHit.get(host); if (ip != null) return ip; - if (nameCacheMiss.contains(host)) return null; + if (nameCacheMiss.containsKey(host)) return null; throw new UnknownHostException("host not in cache"); } @@ -445,26 +449,24 @@ public class Domains { InetAddress ip = nameCacheHit.get(host); if (ip != null) return ip; - if (nameCacheMiss.contains(host)) return null; + if (nameCacheMiss.containsKey(host)) return null; //System.out.println("***DEBUG dnsResolve(" + host + ")"); try { boolean doCaching = true; ip = InetAddress.getByName(host); // this makes the DNS request to backbone if ((ip == null) || (ip.isLoopbackAddress()) || - (nameCacheNoCachingList.contains(host)) + (nameCacheNoCachingList.containsKey(host)) ) { doCaching = false; } else { if (matchesList(host, nameCacheNoCachingPatterns)) { - nameCacheNoCachingList.add(host); + nameCacheNoCachingList.put(host, PRESENT); doCaching = false; } } if (doCaching && ip != null) { - // remove old entries - flushHitNameCache(); // add new entries nameCacheHit.put(host, ip); @@ -475,7 +477,7 @@ public class Domains { flushMissNameCache(); // add new entries - nameCacheMiss.add(host); + nameCacheMiss.put(host, PRESENT); } return null; } @@ -501,15 +503,7 @@ public class Domains { public static int nameCacheNoCachingListSize() { return nameCacheNoCachingList.size(); } - - /** - * Removes old entries from the dns hit cache - */ - public static void flushHitNameCache() { - if (nameCacheHit.size() > maxNameCacheHitSize) nameCacheHit.clear(); - } - /** * Removes old entries from the dns miss cache */ @@ -580,35 +574,32 @@ public class Domains { } public static int getDomainID(final String host) { - if (host == null) return TLD_Local_ID; + if (host == null || host.length() == 0) return TLD_Local_ID; + if (isLocal(host)) return TLD_Local_ID; final int p = host.lastIndexOf('.'); - String tld = ""; - if (p > 0) { - tld = host.substring(p + 1); - } + String tld = (p > 0) ? host.substring(p + 1) : ""; final Integer i = TLDID.get(tld); - if (i == null) { - return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID; - } + if (i == null) return TLD_Generic_ID; return i.intValue(); } public static boolean isLocal(final String host) { - if (host == null) return true; + if (host == null || host.length() == 0) return true; // FIXME IPv4 only // check local ip addresses if (matchesList(host, localhostPatterns)) return true; if (host.startsWith("0:0:0:0:0:0:0:1")) return true; - // finally check if there are other local IP adresses that are not in + // finally check if there are other local IP addresses that are not in // the standard IP range for (int i = 0; i < localHostAddresses.length; i++) { if (localHostAddresses[i].getHostName().equals(host)) return true; if (localHostAddresses[i].getHostAddress().equals(host)) return true; } - // the address must be a global address - return false; + // check dns lookup: may be a local address even if the domain name looks global + InetAddress a = dnsResolve(host); + return a == null || a.isAnyLocalAddress() || a.isLinkLocalAddress() || a.isLoopbackAddress(); } } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index ac2b0e884..f2820419d 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -46,7 +46,6 @@ import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.Domains; import net.yacy.kelondro.util.FileUtils; import de.anomic.crawler.CrawlProfile; @@ -151,12 +150,13 @@ public final class LoaderDispatcher { */ public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { // get the protocol of the next URL - final String protocol = request.url().getProtocol(); - final String host = request.url().getHost(); + final DigestURI url = request.url(); + final String protocol = url.getProtocol(); + final String host = url.getHost(); // check if this loads a page from localhost, which must be prevented to protect the server // against attacks to the administration interface when localhost access is granted - if (Domains.isLocal(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url()); + if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url); // check if we have the page in the cache @@ -165,8 +165,8 @@ public final class LoaderDispatcher { // we have passed a first test if caching is allowed // now see if there is a cache entry - ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url()); - byte[] content = (cachedResponse == null) ? null : Cache.getContent(request.url()); + ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url); + byte[] content = (cachedResponse == null) ? null : Cache.getContent(url); if (cachedResponse != null && content != null) { // yes we have the content @@ -188,17 +188,17 @@ public final class LoaderDispatcher { // check which caching strategy shall be used if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) { // well, just take the cache and don't care about freshness of the content - log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false)); + log.logInfo("cache hit/useall for: " + url.toNormalform(true, false)); return response; } // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy; if (response.isFreshForProxy()) { - log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false)); + log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false)); return response; } else { - log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false)); + log.logInfo("cache hit/stale for: " + url.toNormalform(true, false)); } } } @@ -213,7 +213,7 @@ public final class LoaderDispatcher { // check access time: this is a double-check (we checked possibly already in the balancer) // to make sure that we don't DoS the target by mistake - if (!request.url().isLocal()) { + if (!url.isLocal()) { final Long lastAccess = accessTime.get(host); long wait = 0; if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis()); @@ -246,7 +246,7 @@ public final class LoaderDispatcher { String storeError = response.shallStoreCacheForCrawler(); if (storeError == null) { try { - Cache.store(request.url(), response.getResponseHeader(), response.getContent()); + Cache.store(url, response.getResponseHeader(), response.getContent()); } catch (IOException e) { log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e); } @@ -256,7 +256,7 @@ public final class LoaderDispatcher { return response; } - throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url()); + throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); } /** @@ -281,10 +281,11 @@ public final class LoaderDispatcher { // load resource final Response response = load(request, cacheStrategy, maxFileSize); - if (response == null) throw new IOException("no Response for url " + request.url()); + final DigestURI url = request.url(); + if (response == null) throw new IOException("no Response for url " + url); // if it is still not available, report an error - if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + request.url()); + if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url); // parse resource return response.parse();