From a47f9238fed8ca8549b1fc1f7a89b1b3729d1eb0 Mon Sep 17 00:00:00 2001
From: theli
Date: Fri, 2 Sep 2005 12:09:45 +0000
Subject: [PATCH] *) Blacklist is now also used by the crawler

See: http://www.yacy-forum.de/viewtopic.php?t=1069

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@642 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/plasma/plasmaCrawlWorker.java | 17 +++++++++++++++--
 source/de/anomic/plasma/plasmaSwitchboard.java |  8 ++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java
index 126176ec8..b22c5b3af 100644
--- a/source/de/anomic/plasma/plasmaCrawlWorker.java
+++ b/source/de/anomic/plasma/plasmaCrawlWorker.java
@@ -60,6 +60,8 @@ import de.anomic.server.serverCore;
 import de.anomic.server.serverDate;
 import de.anomic.server.logging.serverLog;
 import de.anomic.server.logging.serverMiniLogFormatter;
+import de.anomic.tools.bitfield;
+import de.anomic.yacy.yacyCore;
 
 public final class plasmaCrawlWorker extends Thread {
 
@@ -289,6 +291,9 @@ public final class plasmaCrawlWorker extends Thread {
         // if the recrawling limit was exceeded we stop crawling now
         if (crawlingRetryCount <= 0) return;
 
+        // getting a reference to the plasmaSwitchboard
+        plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
+
         Date requestDate = new Date(); // remember the time...
         String host = url.getHost();
         String path = url.getPath();
@@ -296,6 +301,16 @@ public final class plasmaCrawlWorker extends Thread {
         boolean ssl = url.getProtocol().equals("https");
         if (port < 0) port = (ssl) ? 443 : 80;
 
+        // check if url is in blacklist
+        String hostlow = host.toLowerCase();
+        if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
+            log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
+            sb.urlPool.errorURL.newEntry(url, referer,initiator, yacyCore.seedDB.mySeed.hash,
+                                         name, "denied_(url_in_blacklist)", new bitfield(plasmaURL.urlFlagLength), true);
+            // a blacklisted URL must not be fetched: stop here once the rejection is recorded
+            return;
+        }
+
         // set referrer; in some case advertise a little bit:
         referer = (referer == null) ? "" : referer.trim();
         if (referer.length() == 0) referer = "http://www.yacy.net/yacy/";
@@ -303,8 +318,6 @@ public final class plasmaCrawlWorker extends Thread {
         // take a file from the net
         httpc remote = null;
         try {
-            plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
-
             // create a request header
             httpHeader requestHeader = new httpHeader();
             requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.userAgent);
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 44d1d9640..056e87a98 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -118,6 +118,7 @@ import de.anomic.data.messageBoard;
 import de.anomic.data.wikiBoard;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.http.httpHeader;
+import de.anomic.http.httpd;
 import de.anomic.kelondro.kelondroException;
 import de.anomic.kelondro.kelondroMSetTools;
 import de.anomic.kelondro.kelondroTables;
@@ -1098,6 +1099,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
             return reason;
         }
 
+        // check blacklist
+        String hostlow = nexturl.getHost().toLowerCase();
+        if (urlBlacklist.isListed(hostlow, nexturl.getPath())) {
+            reason = "denied_(url_in_blacklist)";
+            return reason;
+        }
+
         // filter deny
         if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) {
             reason = "denied_(does_not_match_filter)";