*) Blacklist is now also used by the crawler

See: http://www.yacy-forum.de/viewtopic.php?t=1069

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@642 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 20 years ago
parent dc0a2d4c11
commit a47f9238fe

@ -60,6 +60,8 @@ import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.logging.serverLog;
import de.anomic.server.logging.serverMiniLogFormatter;
import de.anomic.tools.bitfield;
import de.anomic.yacy.yacyCore;
public final class plasmaCrawlWorker extends Thread {
@ -289,6 +291,9 @@ public final class plasmaCrawlWorker extends Thread {
// if the recrawling limit was exceeded we stop crawling now
if (crawlingRetryCount <= 0) return;
// getting a reference to the plasmaSwitchboard
plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
Date requestDate = new Date(); // remember the time...
String host = url.getHost();
String path = url.getPath();
@ -296,6 +301,14 @@ public final class plasmaCrawlWorker extends Thread {
boolean ssl = url.getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80;
// check if url is in blacklist (matched on lower-cased host plus path)
String hostlow = host.toLowerCase();
if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
    log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
    // register the URL with the errorURL pool under the blacklist reason
    sb.urlPool.errorURL.newEntry(url, referer,initiator, yacyCore.seedDB.mySeed.hash,
            name, "denied_(url_in_blacklist)", new bitfield(plasmaURL.urlFlagLength), true);
    // a rejected URL must not be loaded: stop here, otherwise execution
    // falls through to the download code below and fetches it anyway
    return;
}
// set referrer; in some cases advertise a little bit:
referer = (referer == null) ? "" : referer.trim();
if (referer.length() == 0) referer = "http://www.yacy.net/yacy/";
@ -303,8 +316,6 @@ public final class plasmaCrawlWorker extends Thread {
// take a file from the net
httpc remote = null;
try {
plasmaSwitchboard sb = plasmaCrawlLoader.switchboard;
// create a request header
httpHeader requestHeader = new httpHeader();
requestHeader.put(httpHeader.USER_AGENT, httpdProxyHandler.userAgent);

@ -118,6 +118,7 @@ import de.anomic.data.messageBoard;
import de.anomic.data.wikiBoard;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.http.httpd;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroTables;
@ -1098,6 +1099,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return reason;
}
// check blacklist: the URL is denied when its lower-cased host together
// with its path is listed in urlBlacklist; the denial reason is returned
String hostlow = nexturl.getHost().toLowerCase();
if (urlBlacklist.isListed(hostlow, nexturl.getPath())) {
reason = "denied_(url_in_blacklist)";
return reason;
}
// filter deny
if ((currentdepth > 0) && (profile != null) && (!(nexturlString.matches(profile.generalFilter())))) {
reason = "denied_(does_not_match_filter)";

Loading…
Cancel
Save