Better (and corrected) recognition of intranet and internet addresses. This corrects the isLocal property that is used by network definitions to restrict index ranges to local and global addresses. Address locations (intranet or internet) had previously been identified partly by the top-level domain of the host address. Since intranet hosts can also be addressed using a host name that is in a country domain, it is necessary to do a DNS resolution for each check. The check is supported by a local DNS cache, so the intranet/internet check should not affect network traffic too much. To ensure that the cache works properly, the cache class was upgraded to better concurrency data structures.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6977 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 2d2771a12e
commit 22dbbcfa56

@ -351,31 +351,28 @@ public final class CrawlStacker {
* @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted * @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
*/ */
public String urlInAcceptedDomain(final DigestURI url) { public String urlInAcceptedDomain(final DigestURI url) {
// returns true if the url can be accepted accoring to network.unit.domain // returns true if the url can be accepted according to network.unit.domain
if (url == null) return "url is null"; if (url == null) return "url is null";
final boolean local = url.isLocal();
if (this.acceptLocalURLs && local) return null;
if (this.acceptGlobalURLs && !local) return null;
final String host = url.getHost(); final String host = url.getHost();
if (this.acceptLocalURLs && host == null && url.getProtocol().equals("file")) return null;
if (host == null) return "url.host is null"; if (host == null) return "url.host is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
// check if this is a local address and we are allowed to index local pages: // check if this is a local address and we are allowed to index local pages:
//boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
final boolean local = url.isLocal();
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null;
return (local) ? return (local) ?
("the host '" + host + "' is local, but local addresses are not accepted") : ("the host '" + host + "' is local, but local addresses are not accepted") :
("the host '" + host + "' is global, but global addresses are not accepted"); ("the host '" + host + "' is global, but global addresses are not accepted");
} }
public String urlInAcceptedDomainHash(final byte[] urlhash) { public String urlInAcceptedDomainHash(final byte[] urlhash) {
// returns true if the url can be accepted accoring to network.unit.domain // returns true if the url can be accepted according to network.unit.domain
if (urlhash == null) return "url is null"; if (urlhash == null) return "url is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
// check if this is a local address and we are allowed to index local pages: // check if this is a local address and we are allowed to index local pages:
//boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
final boolean local = DigestURI.isLocal(urlhash); final boolean local = DigestURI.isLocal(urlhash);
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! if (this.acceptLocalURLs && local) return null;
if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; if (this.acceptGlobalURLs && !local) return null;
return (local) ? return (local) ?
("the urlhash '" + new String(urlhash) + "' is local, but local addresses are not accepted") : ("the urlhash '" + new String(urlhash) + "' is local, but local addresses are not accepted") :
("the urlhash '" + new String(urlhash) + "' is global, but global addresses are not accepted"); ("the urlhash '" + new String(urlhash) + "' is global, but global addresses are not accepted");

@ -28,6 +28,7 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.util.Domains;
import de.anomic.search.Switchboard; import de.anomic.search.Switchboard;
@ -123,7 +124,7 @@ public class Latency {
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// find the minimum waiting time based on the network domain (local or global) // find the minimum waiting time based on the network domain (local or global)
final boolean local = MultiProtocolURI.isLocal(hostname); final boolean local = Domains.isLocal(hostname);
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
// if we have accessed the domain many times, get slower (the flux factor) // if we have accessed the domain many times, get slower (the flux factor)

@ -161,7 +161,7 @@ public final class yacyClient {
// send request // send request
final long start = System.currentTimeMillis(); final long start = System.currentTimeMillis();
// final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 10000, false); // final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 10000, false);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 10000); final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 30000);
yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds"); yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds");
result = FileUtils.table(content); result = FileUtils.table(content);
} catch (final Exception e) { } catch (final Exception e) {

@ -39,6 +39,7 @@ import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream; import jcifs.smb.SmbFileInputStream;
import net.yacy.cora.document.Punycode.PunycodeException; import net.yacy.cora.document.Punycode.PunycodeException;
import net.yacy.kelondro.util.Domains;
/** /**
* MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@ -776,11 +777,7 @@ public class MultiProtocolURI implements Serializable {
// checks for local/global IP range and local IP // checks for local/global IP range and local IP
public boolean isLocal() { public boolean isLocal() {
return isLocal(this.host); return this.protocol.equals("file") || this.protocol.equals("smb") || Domains.isLocal(this.host);
}
public static boolean isLocal(String host) {
return host.startsWith("127.") || host.equals("localhost") || host.startsWith("0:0:0:0:0:0:0:1");
} }
// language calculation // language calculation

@ -36,6 +36,12 @@ package net.yacy.kelondro.index;
public interface ARC<K, V> { public interface ARC<K, V> {
/**
 * Get the size of the ARC. This returns the sum of the main and ghost cache.
 * @return the complete number of entries in the ARC cache
 */
public int size();
/** /**
* put a value to the cache. * put a value to the cache.
* @param s * @param s

@ -91,4 +91,11 @@ public final class ConcurrentARC<K, V> implements ARC<K, V> {
public final void clear() { public final void clear() {
for (ARC<K, V> a: this.arc) a.clear(); for (ARC<K, V> a: this.arc) a.clear();
} }
/**
 * Get the size of this ARC by adding up the sizes of all ARC instances held in this.arc.
 * @return the sum of size() over every ARC in this.arc
 */
@Override
public final int size() {
    int total = 0;
    for (final ARC<K, V> part : this.arc) {
        total += part.size();
    }
    return total;
}
} }

@ -120,4 +120,13 @@ public final class SimpleARC<K, V> implements ARC<K, V> {
this.levelA.clear(); this.levelA.clear();
this.levelB.clear(); this.levelB.clear();
} }
/**
 * Report how many entries this ARC currently holds.
 * The result counts both the main and the ghost cache (levelA and levelB together).
 * @return the number of entries in levelA plus the number of entries in levelB
 */
@Override
public final synchronized int size() {
    final int sizeA = this.levelA.size();
    final int sizeB = this.levelB.size();
    return sizeA + sizeB;
}
} }

@ -29,26 +29,30 @@ import java.net.InetAddress;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.kelondro.index.ARC;
import net.yacy.kelondro.index.ConcurrentARC;
public class Domains { public class Domains {
private static final String PRESENT = "";
private static final String localPatterns = "10\\..*,127.*,172.(1[6-9]|2[0-9]|3[0-1])\\..*,169.254.*,192.168.*,localhost"; private static final String localPatterns = "10\\..*,127.*,172.(1[6-9]|2[0-9]|3[0-1])\\..*,169.254.*,192.168.*,localhost";
private static final int maxNameCacheHitSize = 20000;
private static final int maxNameCacheMissSize = 20000;
private static final int maxNameNoCachingListSize = 20000;
private static final int concurrencyLevel = Runtime.getRuntime().availableProcessors() + 1;
// a dns cache // a dns cache
private static final Map<String, InetAddress> nameCacheHit = new ConcurrentHashMap<String, InetAddress>(); // a not-synchronized map resulted in deadlocks private static final ARC<String, InetAddress> nameCacheHit = new ConcurrentARC<String, InetAddress>(maxNameCacheHitSize, concurrencyLevel);
private static final Set<String> nameCacheMiss = Collections.synchronizedSet(new HashSet<String>()); private static final ARC<String, String> nameCacheMiss = new ConcurrentARC<String, String>(maxNameCacheMissSize, concurrencyLevel);
private static final int maxNameCacheHitSize = 8000; private static final ARC<String, String> nameCacheNoCachingList = new ConcurrentARC<String, String>(maxNameNoCachingListSize, concurrencyLevel);
private static final int maxNameCacheMissSize = 8000;
public static List<Pattern> nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList<Pattern>()); public static List<Pattern> nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList<Pattern>());
public static final List<Pattern> localhostPatterns = makePatterns(localPatterns); public static final List<Pattern> localhostPatterns = makePatterns(localPatterns);
private static final Set<String> nameCacheNoCachingList = Collections.synchronizedSet(new HashSet<String>());
/** /**
* ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! ! * ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! !
@ -413,7 +417,7 @@ public class Domains {
final InetAddress ip = nameCacheHit.get(host); final InetAddress ip = nameCacheHit.get(host);
if (ip != null) return ip; if (ip != null) return ip;
if (nameCacheMiss.contains(host)) return null; if (nameCacheMiss.containsKey(host)) return null;
throw new UnknownHostException("host not in cache"); throw new UnknownHostException("host not in cache");
} }
@ -445,26 +449,24 @@ public class Domains {
InetAddress ip = nameCacheHit.get(host); InetAddress ip = nameCacheHit.get(host);
if (ip != null) return ip; if (ip != null) return ip;
if (nameCacheMiss.contains(host)) return null; if (nameCacheMiss.containsKey(host)) return null;
//System.out.println("***DEBUG dnsResolve(" + host + ")"); //System.out.println("***DEBUG dnsResolve(" + host + ")");
try { try {
boolean doCaching = true; boolean doCaching = true;
ip = InetAddress.getByName(host); // this makes the DNS request to backbone ip = InetAddress.getByName(host); // this makes the DNS request to backbone
if ((ip == null) || if ((ip == null) ||
(ip.isLoopbackAddress()) || (ip.isLoopbackAddress()) ||
(nameCacheNoCachingList.contains(host)) (nameCacheNoCachingList.containsKey(host))
) { ) {
doCaching = false; doCaching = false;
} else { } else {
if (matchesList(host, nameCacheNoCachingPatterns)) { if (matchesList(host, nameCacheNoCachingPatterns)) {
nameCacheNoCachingList.add(host); nameCacheNoCachingList.put(host, PRESENT);
doCaching = false; doCaching = false;
} }
} }
if (doCaching && ip != null) { if (doCaching && ip != null) {
// remove old entries
flushHitNameCache();
// add new entries // add new entries
nameCacheHit.put(host, ip); nameCacheHit.put(host, ip);
@ -475,7 +477,7 @@ public class Domains {
flushMissNameCache(); flushMissNameCache();
// add new entries // add new entries
nameCacheMiss.add(host); nameCacheMiss.put(host, PRESENT);
} }
return null; return null;
} }
@ -502,14 +504,6 @@ public class Domains {
return nameCacheNoCachingList.size(); return nameCacheNoCachingList.size();
} }
/**
* Removes old entries from the dns hit cache
*/
public static void flushHitNameCache() {
if (nameCacheHit.size() > maxNameCacheHitSize) nameCacheHit.clear();
}
/** /**
* Removes old entries from the dns miss cache * Removes old entries from the dns miss cache
*/ */
@ -580,35 +574,32 @@ public class Domains {
} }
public static int getDomainID(final String host) { public static int getDomainID(final String host) {
if (host == null) return TLD_Local_ID; if (host == null || host.length() == 0) return TLD_Local_ID;
if (isLocal(host)) return TLD_Local_ID;
final int p = host.lastIndexOf('.'); final int p = host.lastIndexOf('.');
String tld = ""; String tld = (p > 0) ? host.substring(p + 1) : "";
if (p > 0) {
tld = host.substring(p + 1);
}
final Integer i = TLDID.get(tld); final Integer i = TLDID.get(tld);
if (i == null) { if (i == null) return TLD_Generic_ID;
return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID;
}
return i.intValue(); return i.intValue();
} }
public static boolean isLocal(final String host) { public static boolean isLocal(final String host) {
if (host == null) return true; if (host == null || host.length() == 0) return true;
// FIXME IPv4 only // FIXME IPv4 only
// check local ip addresses // check local ip addresses
if (matchesList(host, localhostPatterns)) return true; if (matchesList(host, localhostPatterns)) return true;
if (host.startsWith("0:0:0:0:0:0:0:1")) return true; if (host.startsWith("0:0:0:0:0:0:0:1")) return true;
// finally check if there are other local IP adresses that are not in // finally check if there are other local IP addresses that are not in
// the standard IP range // the standard IP range
for (int i = 0; i < localHostAddresses.length; i++) { for (int i = 0; i < localHostAddresses.length; i++) {
if (localHostAddresses[i].getHostName().equals(host)) return true; if (localHostAddresses[i].getHostName().equals(host)) return true;
if (localHostAddresses[i].getHostAddress().equals(host)) return true; if (localHostAddresses[i].getHostAddress().equals(host)) return true;
} }
// the address must be a global address // check dns lookup: may be a local address even if the domain name looks global
return false; InetAddress a = dnsResolve(host);
return a == null || a.isAnyLocalAddress() || a.isLinkLocalAddress() || a.isLoopbackAddress();
} }
} }

@ -46,7 +46,6 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter; import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Domains;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlProfile;
@ -151,12 +150,13 @@ public final class LoaderDispatcher {
*/ */
public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
// get the protocol of the next URL // get the protocol of the next URL
final String protocol = request.url().getProtocol(); final DigestURI url = request.url();
final String host = request.url().getHost(); final String protocol = url.getProtocol();
final String host = url.getHost();
// check if this loads a page from localhost, which must be prevented to protect the server // check if this loads a page from localhost, which must be prevented to protect the server
// against attacks to the administration interface when localhost access is granted // against attacks to the administration interface when localhost access is granted
if (Domains.isLocal(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url()); if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url);
// check if we have the page in the cache // check if we have the page in the cache
@ -165,8 +165,8 @@ public final class LoaderDispatcher {
// we have passed a first test if caching is allowed // we have passed a first test if caching is allowed
// now see if there is a cache entry // now see if there is a cache entry
ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url()); ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url);
byte[] content = (cachedResponse == null) ? null : Cache.getContent(request.url()); byte[] content = (cachedResponse == null) ? null : Cache.getContent(url);
if (cachedResponse != null && content != null) { if (cachedResponse != null && content != null) {
// yes we have the content // yes we have the content
@ -188,17 +188,17 @@ public final class LoaderDispatcher {
// check which caching strategy shall be used // check which caching strategy shall be used
if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) { if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content // well, just take the cache and don't care about freshness of the content
log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false)); log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
return response; return response;
} }
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy; assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) { if (response.isFreshForProxy()) {
log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false)); log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
return response; return response;
} else { } else {
log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false)); log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
} }
} }
} }
@ -213,7 +213,7 @@ public final class LoaderDispatcher {
// check access time: this is a double-check (we checked possibly already in the balancer) // check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake // to make sure that we don't DoS the target by mistake
if (!request.url().isLocal()) { if (!url.isLocal()) {
final Long lastAccess = accessTime.get(host); final Long lastAccess = accessTime.get(host);
long wait = 0; long wait = 0;
if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis()); if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
@ -246,7 +246,7 @@ public final class LoaderDispatcher {
String storeError = response.shallStoreCacheForCrawler(); String storeError = response.shallStoreCacheForCrawler();
if (storeError == null) { if (storeError == null) {
try { try {
Cache.store(request.url(), response.getResponseHeader(), response.getContent()); Cache.store(url, response.getResponseHeader(), response.getContent());
} catch (IOException e) { } catch (IOException e) {
log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e); log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
} }
@ -256,7 +256,7 @@ public final class LoaderDispatcher {
return response; return response;
} }
throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url()); throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
} }
/** /**
@ -281,10 +281,11 @@ public final class LoaderDispatcher {
// load resource // load resource
final Response response = load(request, cacheStrategy, maxFileSize); final Response response = load(request, cacheStrategy, maxFileSize);
if (response == null) throw new IOException("no Response for url " + request.url()); final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
// if it is still not available, report an error // if it is still not available, report an error
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + request.url()); if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
// parse resource // parse resource
return response.parse(); return response.parse();

Loading…
Cancel
Save