Better (and corrected) recognition of intranet and internet addresses. This corrects the isLocal property that is used by network definitions to restrict index ranges to local or global addresses. Address locations (intranet or internet) had been partly identified by the top-level domain of the host address. Since intranet addresses can also be reached through a host name that lies in a country domain, it is necessary to do a DNS lookup for each check. The check is backed by a local DNS cache, so the intranet/internet check should not affect network traffic too much. To ensure that the cache works properly, the cache class was upgraded to use better concurrent data structures.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6977 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 2d2771a12e
commit 22dbbcfa56

@ -351,31 +351,28 @@ public final class CrawlStacker {
* @return null if the url can be accepted, a string containing a rejection reason if the url cannot be accepted
*/
public String urlInAcceptedDomain(final DigestURI url) {
// returns true if the url can be accepted accoring to network.unit.domain
// returns true if the url can be accepted according to network.unit.domain
if (url == null) return "url is null";
final boolean local = url.isLocal();
if (this.acceptLocalURLs && local) return null;
if (this.acceptGlobalURLs && !local) return null;
final String host = url.getHost();
if (this.acceptLocalURLs && host == null && url.getProtocol().equals("file")) return null;
if (host == null) return "url.host is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
// check if this is a local address and we are allowed to index local pages:
//boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
final boolean local = url.isLocal();
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null;
return (local) ?
("the host '" + host + "' is local, but local addresses are not accepted") :
("the host '" + host + "' is global, but global addresses are not accepted");
}
public String urlInAcceptedDomainHash(final byte[] urlhash) {
// returns true if the url can be accepted accoring to network.unit.domain
// returns true if the url can be accepted according to network.unit.domain
if (urlhash == null) return "url is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
// check if this is a local address and we are allowed to index local pages:
//boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
final boolean local = DigestURI.isLocal(urlhash);
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null;
if (this.acceptLocalURLs && local) return null;
if (this.acceptGlobalURLs && !local) return null;
return (local) ?
("the urlhash '" + new String(urlhash) + "' is local, but local addresses are not accepted") :
("the urlhash '" + new String(urlhash) + "' is global, but global addresses are not accepted");

@ -28,6 +28,7 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.kelondro.util.Domains;
import de.anomic.search.Switchboard;
@ -123,7 +124,7 @@ public class Latency {
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// find the minimum waiting time based on the network domain (local or global)
final boolean local = MultiProtocolURI.isLocal(hostname);
final boolean local = Domains.isLocal(hostname);
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
// if we have accessed the domain many times, get slower (the flux factor)

@ -161,7 +161,7 @@ public final class yacyClient {
// send request
final long start = System.currentTimeMillis();
// final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", post, 10000, false);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 10000);
final byte[] content = HttpConnector.wput("http://" + address + "/yacy/hello.html", yacySeed.b64Hash2hexHash(otherHash) + ".yacyh", parts, 30000);
yacyCore.log.logInfo("yacyClient.publishMySeed thread '" + Thread.currentThread().getName() + "' contacted peer at " + address + ", received " + ((content == null) ? "null" : content.length) + " bytes, time = " + (System.currentTimeMillis() - start) + " milliseconds");
result = FileUtils.table(content);
} catch (final Exception e) {

@ -39,6 +39,7 @@ import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
import net.yacy.cora.document.Punycode.PunycodeException;
import net.yacy.kelondro.util.Domains;
/**
* MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@ -776,11 +777,7 @@ public class MultiProtocolURI implements Serializable {
// checks for local/global IP range and local IP
public boolean isLocal() {
return isLocal(this.host);
}
public static boolean isLocal(String host) {
return host.startsWith("127.") || host.equals("localhost") || host.startsWith("0:0:0:0:0:0:0:1");
return this.protocol.equals("file") || this.protocol.equals("smb") || Domains.isLocal(this.host);
}
// language calculation

@ -36,6 +36,12 @@ package net.yacy.kelondro.index;
public interface ARC<K, V> {
/**
* get the size of the ARC. this returns the sum of main and ghost cache
* @return the complete number of entries in the ARC cache
*/
public int size();
/**
* put a value to the cache.
* @param s

@ -91,4 +91,11 @@ public final class ConcurrentARC<K, V> implements ARC<K, V> {
public final void clear() {
for (ARC<K, V> a: this.arc) a.clear();
}
/**
 * Report the total number of entries held by this cache.
 * The value is obtained by adding up size() of every ARC stored in this.arc.
 * @return the accumulated entry count over all ARC segments
 */
@Override
public final int size() {
    int total = 0;
    for (final ARC<K, V> segment : this.arc) {
        total += segment.size();
    }
    return total;
}
}

@ -120,4 +120,13 @@ public final class SimpleARC<K, V> implements ARC<K, V> {
this.levelA.clear();
this.levelB.clear();
}
/**
 * Report the number of entries in this ARC.
 * The result is the entry count of levelA plus the entry count of levelB,
 * i.e. the sum of main and ghost cache.
 * @return the complete number of entries in the ARC cache
 */
@Override
public final synchronized int size() {
    final int countA = this.levelA.size();
    final int countB = this.levelB.size();
    return countA + countB;
}
}

@ -29,26 +29,30 @@ import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.kelondro.index.ARC;
import net.yacy.kelondro.index.ConcurrentARC;
public class Domains {
private static final String PRESENT = "";
private static final String localPatterns = "10\\..*,127.*,172.(1[6-9]|2[0-9]|3[0-1])\\..*,169.254.*,192.168.*,localhost";
private static final int maxNameCacheHitSize = 20000;
private static final int maxNameCacheMissSize = 20000;
private static final int maxNameNoCachingListSize = 20000;
private static final int concurrencyLevel = Runtime.getRuntime().availableProcessors() + 1;
// a dns cache
private static final Map<String, InetAddress> nameCacheHit = new ConcurrentHashMap<String, InetAddress>(); // a not-synchronized map resulted in deadlocks
private static final Set<String> nameCacheMiss = Collections.synchronizedSet(new HashSet<String>());
private static final int maxNameCacheHitSize = 8000;
private static final int maxNameCacheMissSize = 8000;
private static final ARC<String, InetAddress> nameCacheHit = new ConcurrentARC<String, InetAddress>(maxNameCacheHitSize, concurrencyLevel);
private static final ARC<String, String> nameCacheMiss = new ConcurrentARC<String, String>(maxNameCacheMissSize, concurrencyLevel);
private static final ARC<String, String> nameCacheNoCachingList = new ConcurrentARC<String, String>(maxNameNoCachingListSize, concurrencyLevel);
public static List<Pattern> nameCacheNoCachingPatterns = Collections.synchronizedList(new LinkedList<Pattern>());
public static final List<Pattern> localhostPatterns = makePatterns(localPatterns);
private static final Set<String> nameCacheNoCachingList = Collections.synchronizedSet(new HashSet<String>());
/**
* ! ! ! A T T E N T I O N A T T E N T I O N A T T E N T I O N ! ! !
@ -413,7 +417,7 @@ public class Domains {
final InetAddress ip = nameCacheHit.get(host);
if (ip != null) return ip;
if (nameCacheMiss.contains(host)) return null;
if (nameCacheMiss.containsKey(host)) return null;
throw new UnknownHostException("host not in cache");
}
@ -445,26 +449,24 @@ public class Domains {
InetAddress ip = nameCacheHit.get(host);
if (ip != null) return ip;
if (nameCacheMiss.contains(host)) return null;
if (nameCacheMiss.containsKey(host)) return null;
//System.out.println("***DEBUG dnsResolve(" + host + ")");
try {
boolean doCaching = true;
ip = InetAddress.getByName(host); // this makes the DNS request to backbone
if ((ip == null) ||
(ip.isLoopbackAddress()) ||
(nameCacheNoCachingList.contains(host))
(nameCacheNoCachingList.containsKey(host))
) {
doCaching = false;
} else {
if (matchesList(host, nameCacheNoCachingPatterns)) {
nameCacheNoCachingList.add(host);
nameCacheNoCachingList.put(host, PRESENT);
doCaching = false;
}
}
if (doCaching && ip != null) {
// remove old entries
flushHitNameCache();
// add new entries
nameCacheHit.put(host, ip);
@ -475,7 +477,7 @@ public class Domains {
flushMissNameCache();
// add new entries
nameCacheMiss.add(host);
nameCacheMiss.put(host, PRESENT);
}
return null;
}
@ -501,15 +503,7 @@ public class Domains {
public static int nameCacheNoCachingListSize() {
return nameCacheNoCachingList.size();
}
/**
* Removes old entries from the dns hit cache
*/
public static void flushHitNameCache() {
if (nameCacheHit.size() > maxNameCacheHitSize) nameCacheHit.clear();
}
/**
* Removes old entries from the dns miss cache
*/
@ -580,35 +574,32 @@ public class Domains {
}
public static int getDomainID(final String host) {
if (host == null) return TLD_Local_ID;
if (host == null || host.length() == 0) return TLD_Local_ID;
if (isLocal(host)) return TLD_Local_ID;
final int p = host.lastIndexOf('.');
String tld = "";
if (p > 0) {
tld = host.substring(p + 1);
}
String tld = (p > 0) ? host.substring(p + 1) : "";
final Integer i = TLDID.get(tld);
if (i == null) {
return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID;
}
if (i == null) return TLD_Generic_ID;
return i.intValue();
}
public static boolean isLocal(final String host) {
if (host == null) return true;
if (host == null || host.length() == 0) return true;
// FIXME IPv4 only
// check local ip addresses
if (matchesList(host, localhostPatterns)) return true;
if (host.startsWith("0:0:0:0:0:0:0:1")) return true;
// finally check if there are other local IP adresses that are not in
// finally check if there are other local IP addresses that are not in
// the standard IP range
for (int i = 0; i < localHostAddresses.length; i++) {
if (localHostAddresses[i].getHostName().equals(host)) return true;
if (localHostAddresses[i].getHostAddress().equals(host)) return true;
}
// the address must be a global address
return false;
// check dns lookup: may be a local address even if the domain name looks global
InetAddress a = dnsResolve(host);
return a == null || a.isAnyLocalAddress() || a.isLinkLocalAddress() || a.isLoopbackAddress();
}
}

@ -46,7 +46,6 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.Domains;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlProfile;
@ -151,12 +150,13 @@ public final class LoaderDispatcher {
*/
public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
final String host = request.url().getHost();
final DigestURI url = request.url();
final String protocol = url.getProtocol();
final String host = url.getHost();
// check if this loads a page from localhost, which must be prevented to protect the server
// against attacks to the administration interface when localhost access is granted
if (Domains.isLocal(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url());
if (url.isLocal() && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + url);
// check if we have the page in the cache
@ -165,8 +165,8 @@ public final class LoaderDispatcher {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url());
byte[] content = (cachedResponse == null) ? null : Cache.getContent(request.url());
ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url);
byte[] content = (cachedResponse == null) ? null : Cache.getContent(url);
if (cachedResponse != null && content != null) {
// yes we have the content
@ -188,17 +188,17 @@ public final class LoaderDispatcher {
// check which caching strategy shall be used
if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false));
log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
return response;
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false));
log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
return response;
} else {
log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false));
log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
}
}
}
@ -213,7 +213,7 @@ public final class LoaderDispatcher {
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make sure that we don't DoS the target by mistake
if (!request.url().isLocal()) {
if (!url.isLocal()) {
final Long lastAccess = accessTime.get(host);
long wait = 0;
if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
@ -246,7 +246,7 @@ public final class LoaderDispatcher {
String storeError = response.shallStoreCacheForCrawler();
if (storeError == null) {
try {
Cache.store(request.url(), response.getResponseHeader(), response.getContent());
Cache.store(url, response.getResponseHeader(), response.getContent());
} catch (IOException e) {
log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
}
@ -256,7 +256,7 @@ public final class LoaderDispatcher {
return response;
}
throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url());
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
/**
@ -281,10 +281,11 @@ public final class LoaderDispatcher {
// load resource
final Response response = load(request, cacheStrategy, maxFileSize);
if (response == null) throw new IOException("no Response for url " + request.url());
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
// if it is still not available, report an error
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + request.url());
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
// parse resource
return response.parse();

Loading…
Cancel
Save