added domain checks to the surrogate reader and the RWI transfer receiver to prevent spamming via surrogates

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5837 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 76af84d732
commit fa3adbbfc6

@ -157,6 +157,15 @@ public final class transferRWI {
continue; continue;
} }
// check if the entry is in our network domain
final String urlRejectReason = sb.crawlStacker.urlInAcceptedDomainHash(urlHash);
if (urlRejectReason != null) {
yacyCore.log.logWarning("transferRWI: blocked URL hash '" + urlHash + "' (" + urlRejectReason + ") from peer " + otherPeerName + "; peer is suspected to be a spam-peer (or something is wrong)");
//if (yacyCore.log.isFine()) yacyCore.log.logFine("transferRWI: blocked URL hash '" + urlHash + "' (" + urlRejectReason + ") from peer " + otherPeerName);
blocked++;
continue;
}
// learn entry // learn entry
try { try {
sb.webIndex.index().add(wordHash.getBytes(), iEntry); sb.webIndex.index().add(wordHash.getBytes(), iEntry);

@ -325,15 +325,6 @@ public final class CrawlStacker {
final String host = url.getHost(); final String host = url.getHost();
if (host == null) return "url.host is null"; if (host == null) return "url.host is null";
if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve if (this.acceptGlobalURLs && this.acceptLocalURLs) return null; // fast shortcut to avoid dnsResolve
/*
InetAddress hostAddress = serverDomains.dnsResolve(host);
// if we don't know the host, we cannot load that resource anyway.
// But in case we use a proxy, it is possible that we dont have a DNS service.
final httpRemoteProxyConfig remoteProxyConfig = httpdProxyHandler.getRemoteProxyConfig();
if (hostAddress == null) {
if ((remoteProxyConfig != null) && (remoteProxyConfig.useProxy())) return null; else return "the dns of the host '" + host + "' cannot be resolved";
}
*/
// check if this is a local address and we are allowed to index local pages: // check if this is a local address and we are allowed to index local pages:
//boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress(); //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
final boolean local = url.isLocal(); final boolean local = url.isLocal();
@ -344,6 +335,20 @@ public final class CrawlStacker {
("the host '" + host + "' is global, but global addresses are not accepted"); ("the host '" + host + "' is global, but global addresses are not accepted");
} }
/**
 * Checks a URL hash against the local/global acceptance flags of this stacker.
 *
 * Unlike what a boolean-style name might suggest, the result is a rejection
 * reason: {@code null} means the hash is accepted, any non-null string
 * describes why it was rejected.
 *
 * @param urlhash the URL hash to classify; may be {@code null}
 * @return {@code null} if the hash is accepted, otherwise a human-readable
 *         reason for the rejection
 */
public String urlInAcceptedDomainHash(final String urlhash) {
    if (urlhash == null) {
        return "url is null";
    }

    // When both local and global URLs are accepted there is nothing to
    // classify, so the hash is accepted without consulting yacyURL at all.
    if (this.acceptGlobalURLs && this.acceptLocalURLs) {
        return null;
    }

    // Locality is derived from the hash alone, as reported by yacyURL.isLocal.
    final boolean isLocalHash = yacyURL.isLocal(urlhash);

    if (isLocalHash) {
        if (this.acceptLocalURLs) {
            return null;
        }
        return "the urlhash '" + urlhash + "' is local, but local addresses are not accepted";
    }

    if (this.acceptGlobalURLs) {
        return null;
    }
    return "the urlhash '" + urlhash + "' is global, but global addresses are not accepted";
}
public boolean acceptLocalURLs() { public boolean acceptLocalURLs() {
return this.acceptLocalURLs; return this.acceptLocalURLs;
} }

@ -1226,6 +1226,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
Surrogate surrogate; Surrogate surrogate;
QueueEntry queueentry; QueueEntry queueentry;
while ((surrogate = reader.take()) != SurrogateReader.poison) { while ((surrogate = reader.take()) != SurrogateReader.poison) {
// check if url is in accepted domain
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.url());
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.url() + "': " + urlRejectReason);
continue;
}
// create a queue entry
plasmaParserDocument document = surrogate.document(); plasmaParserDocument document = surrogate.document();
queueentry = this.webIndex.queuePreStack.newEntry(surrogate.url(), null, null, false, null, 0, this.webIndex.defaultSurrogateProfile.handle(), null); queueentry = this.webIndex.queuePreStack.newEntry(surrogate.url(), null, null, false, null, 0, this.webIndex.defaultSurrogateProfile.handle(), null);
/* /*
@ -1233,6 +1241,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final String initiator, final int depth, final String profilehandle, final String anchorName) final String initiator, final int depth, final String profilehandle, final String anchorName)
*/ */
indexingQueueEntry queueEntry = new indexingQueueEntry(queueentry, document, null); indexingQueueEntry queueEntry = new indexingQueueEntry(queueentry, document, null);
// place the queue entry into the concurrent process of the condenser (document analysis)
try { try {
indexingCondensementProcessor.enQueue(queueEntry); indexingCondensementProcessor.enQueue(queueEntry);
} catch (InterruptedException e) { } catch (InterruptedException e) {

Loading…
Cancel
Save