From 22dbbcfa56a33e02ac9daed3e759809799249322 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 18 Jul 2010 20:14:20 +0000 Subject: [PATCH] better (and corrected) recognition of intranet and internet addresses. This corrects the isLocal property that is used by network definitions to restrict index ranges to local and global addresses. Address locations (intranet or internet) had been partly identified by the top-level domain of the host address. Since intranet hosts can also be addressed using a host name that is in a country domain, it is necessary to perform a DNS resolution for each check. The check is backed by a local DNS cache, so the intranet/internet check should not affect network traffic too much. To ensure that the cache works properly, the cache class was upgraded to use better concurrent data structures. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6977 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/CrawlStacker.java | 17 +++--- source/de/anomic/crawler/Latency.java | 3 +- source/de/anomic/yacy/yacyClient.java | 2 +- .../yacy/cora/document/MultiProtocolURI.java | 7 +-- source/net/yacy/kelondro/index/ARC.java | 6 ++ .../yacy/kelondro/index/ConcurrentARC.java | 7 +++ source/net/yacy/kelondro/index/SimpleARC.java | 9 +++ source/net/yacy/kelondro/util/Domains.java | 59 ++++++++----------- .../net/yacy/repository/LoaderDispatcher.java | 29 ++++----- 9 files changed, 74 insertions(+), 65 deletions(-) diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 15565bb08..c9bf36f14 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -351,31 +351,28 @@ public final class CrawlStacker { * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted */ public String urlInAcceptedDomain(final DigestURI url) { - // returns true if the url can be accepted accoring to network.unit.domain 
+ // returns true if the url can be accepted according to network.unit.domain if (url == null) return "url is null"; + final boolean local = url.isLocal(); + if (this.acceptLocalURLs && local) return null; + if (this.acceptGlobalURLs && !local) return null; final String host = url.getHost(); - if (this.acceptLocalURLs && host == null && url.getProtocol().equals("file")) return null; if (host == null) return "url.host is null"; - if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve // check if this is a local address and we are allowed to index local pages: //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); - final boolean local = url.isLocal(); //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! - if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; return (local) ? ("the host '" + host + "' is local, but local addresses are not accepted") : ("the host '" + host + "' is global, but global addresses are not accepted"); } public String urlInAcceptedDomainHash(final byte[] urlhash) { - // returns true if the url can be accepted accoring to network.unit.domain + // returns true if the url can be accepted according to network.unit.domain if (urlhash == null) return "url is null"; - if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve // check if this is a local address and we are allowed to index local pages: - //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); final boolean local = DigestURI.isLocal(urlhash); - //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! - if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; + if (this.acceptLocalURLs && local) return null; + if (this.acceptGlobalURLs && !local) return null; return (local) ? 
("the urlhash '" + new String(urlhash) + "' is local, but local addresses are not accepted") : ("the urlhash '" + new String(urlhash) + "' is global, but global addresses are not accepted"); diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java index 50a357b23..7b7f6098b 100644 --- a/source/de/anomic/crawler/Latency.java +++ b/source/de/anomic/crawler/Latency.java @@ -28,6 +28,7 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.kelondro.util.Domains; import de.anomic.search.Switchboard; @@ -123,7 +124,7 @@ public class Latency { final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); // find the minimum waiting time based on the network domain (local or global) - final boolean local = MultiProtocolURI.isLocal(hostname); + final boolean local = Domains.isLocal(hostname); long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; // if we have accessed the domain many times, get slower (the flux factor) diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 1302e34e0..57e4a09e6 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -161,7 +161,7 @@ public final class yacyClient { // send request final long start = System.currentTimeMillis(); // final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 10000, false); - final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 10000); + final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 30000); yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + 
((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds"); result = FileUtils.table(content); } catch (final Exception e) { diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index ace4d1c20..4553fe954 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -39,6 +39,7 @@ import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; import net.yacy.cora.document.Punycode.PunycodeException; +import net.yacy.kelondro.util.Domains; /** * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file @@ -776,11 +777,7 @@ public class MultiProtocolURI implements Serializable { // checks for local/global IP range and local IP public boolean isLocal() { - return isLocal(this.host); - } - - public static boolean isLocal(String host) { - return host.startsWith("127.") || host.equals("localhost") || host.startsWith("0:0:0:0:0:0:0:1"); + return this.protocol.equals("file") || this.protocol.equals("smb") || Domains.isLocal(this.host); } // language calculation diff --git a/source/net/yacy/kelondro/index/ARC.java b/source/net/yacy/kelondro/index/ARC.java index eaaa001c8..757469663 100644 --- a/source/net/yacy/kelondro/index/ARC.java +++ b/source/net/yacy/kelondro/index/ARC.java @@ -36,6 +36,12 @@ package net.yacy.kelondro.index; public interface ARC { + /** + * get the size of the ARC. this returns the sum of main and ghost cache + * @return the complete number of entries in the ARC cache + */ + public int size(); + /** * put a value to the cache. 
* @param s diff --git a/source/net/yacy/kelondro/index/ConcurrentARC.java b/source/net/yacy/kelondro/index/ConcurrentARC.java index 36aa10f99..7c939662f 100644 --- a/source/net/yacy/kelondro/index/ConcurrentARC.java +++ b/source/net/yacy/kelondro/index/ConcurrentARC.java @@ -91,4 +91,11 @@ public final class ConcurrentARC implements ARC { public final void clear() { for (ARC a: this.arc) a.clear(); } + + @Override + public final int size() { + int s = 0; + for (ARC a: this.arc) s += a.size(); + return s; + } } diff --git a/source/net/yacy/kelondro/index/SimpleARC.java b/source/net/yacy/kelondro/index/SimpleARC.java index 49d9db217..671e340e5 100644 --- a/source/net/yacy/kelondro/index/SimpleARC.java +++ b/source/net/yacy/kelondro/index/SimpleARC.java @@ -120,4 +120,13 @@ public final class SimpleARC implements ARC { this.levelA.clear(); this.levelB.clear(); } + + /** + * get the size of the ARC. this returns the sum of main and ghost cache + * @return the complete number of entries in the ARC cache + */ + @Override + public final synchronized int size() { + return this.levelA.size() + this.levelB.size(); + } } diff --git a/source/net/yacy/kelondro/util/Domains.java b/source/net/yacy/kelondro/util/Domains.java index 7fa23e91f..2b85d3817 100644 --- a/source/net/yacy/kelondro/util/Domains.java +++ b/source/net/yacy/kelondro/util/Domains.java @@ -29,26 +29,30 @@ import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collections; -import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; +import net.yacy.kelondro.index.ARC; +import net.yacy.kelondro.index.ConcurrentARC; + public class Domains { + private static final String PRESENT = ""; private static final String localPatterns = "10\\..*,127.*,172.(1[6-9]|2[0-9]|3[0-1])\\..*,169.254.*,192.168.*,localhost"; + private 
static final int maxNameCacheHitSize = 20000; + private static final int maxNameCacheMissSize = 20000; + private static final int maxNameNoCachingListSize = 20000; + private static final int concurrencyLevel = Runtime.getRuntime().availableProcessors() + 1; // a dns cache - private static final Map nameCacheHit = new ConcurrentHashMap(); // a not-synchronized map resulted in deadlocks - private static final Set nameCacheMiss = Collections.synchronizedSet(new HashSet()); - private static final int maxNameCacheHitSize = 8000; - private static final int maxNameCacheMissSize = 8000; + private static final ARC nameCacheHit = new ConcurrentARC(maxNameCacheHitSize, concurrencyLevel); + private static final ARC nameCacheMiss = new ConcurrentARC(maxNameCacheMissSize, concurrencyLevel); + private static final ARC nameCacheNoCachingList = new ConcurrentARC(maxNameNoCachingListSize, concurrencyLevel); public static List nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList()); public static final List localhostPatterns = makePatterns(localPatterns); - private static final Set nameCacheNoCachingList = Collections.synchronizedSet(new HashSet()); /** * ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! ! 
@@ -413,7 +417,7 @@ public class Domains { final InetAddress ip = nameCacheHit.get(host); if (ip != null) return ip; - if (nameCacheMiss.contains(host)) return null; + if (nameCacheMiss.containsKey(host)) return null; throw new UnknownHostException("host not in cache"); } @@ -445,26 +449,24 @@ public class Domains { InetAddress ip = nameCacheHit.get(host); if (ip != null) return ip; - if (nameCacheMiss.contains(host)) return null; + if (nameCacheMiss.containsKey(host)) return null; //System.out.println("***DEBUG dnsResolve(" + host + ")"); try { boolean doCaching = true; ip = InetAddress.getByName(host); // this makes the DNS request to backbone if ((ip == null) || (ip.isLoopbackAddress()) || - (nameCacheNoCachingList.contains(host)) + (nameCacheNoCachingList.containsKey(host)) ) { doCaching = false; } else { if (matchesList(host, nameCacheNoCachingPatterns)) { - nameCacheNoCachingList.add(host); + nameCacheNoCachingList.put(host, PRESENT); doCaching = false; } } if (doCaching && ip != null) { - // remove old entries - flushHitNameCache(); // add new entries nameCacheHit.put(host, ip); @@ -475,7 +477,7 @@ public class Domains { flushMissNameCache(); // add new entries - nameCacheMiss.add(host); + nameCacheMiss.put(host, PRESENT); } return null; } @@ -501,15 +503,7 @@ public class Domains { public static int nameCacheNoCachingListSize() { return nameCacheNoCachingList.size(); } - - /** - * Removes old entries from the dns hit cache - */ - public static void flushHitNameCache() { - if (nameCacheHit.size() > maxNameCacheHitSize) nameCacheHit.clear(); - } - /** * Removes old entries from the dns miss cache */ @@ -580,35 +574,32 @@ public class Domains { } public static int getDomainID(final String host) { - if (host == null) return TLD_Local_ID; + if (host == null || host.length() == 0) return TLD_Local_ID; + if (isLocal(host)) return TLD_Local_ID; final int p = host.lastIndexOf('.'); - String tld = ""; - if (p > 0) { - tld = host.substring(p + 1); - } + String tld = 
(p > 0) ? host.substring(p + 1) : ""; final Integer i = TLDID.get(tld); - if (i == null) { - return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID; - } + if (i == null) return TLD_Generic_ID; return i.intValue(); } public static boolean isLocal(final String host) { - if (host == null) return true; + if (host == null || host.length() == 0) return true; // FIXME IPv4 only // check local ip addresses if (matchesList(host, localhostPatterns)) return true; if (host.startsWith("0:0:0:0:0:0:0:1")) return true; - // finally check if there are other local IP adresses that are not in + // finally check if there are other local IP addresses that are not in // the standard IP range for (int i = 0; i < localHostAddresses.length; i++) { if (localHostAddresses[i].getHostName().equals(host)) return true; if (localHostAddresses[i].getHostAddress().equals(host)) return true; } - // the address must be a global address - return false; + // check dns lookup: may be a local address even if the domain name looks global + InetAddress a = dnsResolve(host); + return a == null || a.isAnyLocalAddress() || a.isLinkLocalAddress() || a.isLoopbackAddress(); } } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index ac2b0e884..f2820419d 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -46,7 +46,6 @@ import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.Domains; import net.yacy.kelondro.util.FileUtils; import de.anomic.crawler.CrawlProfile; @@ -151,12 +150,13 @@ public final class LoaderDispatcher { */ public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { // get the protocol of the next URL - final String protocol = 
request.url().getProtocol(); - final String host = request.url().getHost(); + final DigestURI url = request.url(); + final String protocol = url.getProtocol(); + final String host = url.getHost(); // check if this loads a page from localhost, which must be prevented to protect the server // against attacks to the administration interface when localhost access is granted - if (Domains.isLocal(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url()); + if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url); // check if we have the page in the cache @@ -165,8 +165,8 @@ public final class LoaderDispatcher { // we have passed a first test if caching is allowed // now see if there is a cache entry - ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url()); - byte[] content = (cachedResponse == null) ? null : Cache.getContent(request.url()); + ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url); + byte[] content = (cachedResponse == null) ? 
null : Cache.getContent(url); if (cachedResponse != null && content != null) { // yes we have the content @@ -188,17 +188,17 @@ public final class LoaderDispatcher { // check which caching strategy shall be used if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) { // well, just take the cache and don't care about freshness of the content - log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false)); + log.logInfo("cache hit/useall for: " + url.toNormalform(true, false)); return response; } // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy; if (response.isFreshForProxy()) { - log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false)); + log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false)); return response; } else { - log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false)); + log.logInfo("cache hit/stale for: " + url.toNormalform(true, false)); } } } @@ -213,7 +213,7 @@ public final class LoaderDispatcher { // check access time: this is a double-check (we checked possibly already in the balancer) // to make sure that we don't DoS the target by mistake - if (!request.url().isLocal()) { + if (!url.isLocal()) { final Long lastAccess = accessTime.get(host); long wait = 0; if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis()); @@ -246,7 +246,7 @@ public final class LoaderDispatcher { String storeError = response.shallStoreCacheForCrawler(); if (storeError == null) { try { - Cache.store(request.url(), response.getResponseHeader(), response.getContent()); + Cache.store(url, response.getResponseHeader(), response.getContent()); } catch (IOException e) { log.logWarning("cannot write " + response.url() + " to Cache (3): " + 
e.getMessage(), e); } @@ -256,7 +256,7 @@ public final class LoaderDispatcher { return response; } - throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url()); + throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); } /** @@ -281,10 +281,11 @@ public final class LoaderDispatcher { // load resource final Response response = load(request, cacheStrategy, maxFileSize); - if (response == null) throw new IOException("no Response for url " + request.url()); + final DigestURI url = request.url(); + if (response == null) throw new IOException("no Response for url " + url); // if it is still not available, report an error - if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + request.url()); + if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url); // parse resource return response.parse();