diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 99dddcc20..7bf1ba4e6 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -157,6 +157,15 @@ public final class transferRWI { continue; } + // check if the entry is in our network domain + final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomainHash(urlHash); + if (urlRejectReason != null) { + yacyCore.log.logWarning("transferRWI: blocked URL hash '" + urlHash + "' (" + urlRejectReason + ") from peer " + otherPeerName + "; peer is suspected to be a spam-peer (or something is wrong)"); + //if (yacyCore.log.isFine()) yacyCore.log.logFine("transferRWI: blocked URL hash '" + urlHash + "' (" + urlRejectReason + ") from peer " + otherPeerName); + blocked++; + continue; + } + // learn entry try { sb.webIndex.index().add(wordHash.getBytes(), iEntry); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index b057a41fe..cb081c2d6 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -325,15 +325,6 @@ public final class CrawlStacker { final String host = url.getHost(); if (host == null) return "url.host is null"; if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve - /* - InetAddress hostAddress = serverDomains.dnsResolve(host); - // if we don't know the host, we cannot load that resource anyway. - // But in case we use a proxy, it is possible that we dont have a DNS service. 
- final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig(); - if (hostAddress == null) { - if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved"; - } - */ // check if this is a local address and we are allowed to index local pages: //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); final boolean local = url.isLocal(); @@ -343,6 +334,20 @@ public final class CrawlStacker { ("the host '" + host + "' is local, but local addresses are not accepted") : ("the host '" + host + "' is global, but global addresses are not accepted"); } + + public String urlInAcceptedDomainHash(final String urlhash) { + // returns null if the url hash can be accepted according to network.unit.domain, otherwise a string describing the reject reason + if (urlhash == null) return "url is null"; + if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve + // check if this is a local address and we are allowed to index local pages: + //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); + final boolean local = yacyURL.isLocal(urlhash); + //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above! + if ((this.acceptGlobalURLs && !local) || (this.acceptLocalURLs && local)) return null; + return (local) ? + ("the urlhash '" + urlhash + "' is local, but local addresses are not accepted") : + ("the urlhash '" + urlhash + "' is global, but global addresses are not accepted"); + } public boolean acceptLocalURLs() { return this.acceptLocalURLs; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 50e35038e..a85deb3f4 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1226,6 +1226,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch