*) News entries with blacklisted URLs are now ignored

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3849 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
hydrox 18 years ago
parent b69559eba4
commit 4a1bc4743a

@ -64,9 +64,10 @@ public abstract class abstractURLPattern implements plasmaURLPattern {
plasmaURLPattern.BLACKLIST_PROXY,
plasmaURLPattern.BLACKLIST_DHT,
plasmaURLPattern.BLACKLIST_SEARCH,
plasmaURLPattern.BLACKLIST_SURFTIPS
plasmaURLPattern.BLACKLIST_SURFTIPS,
plasmaURLPattern.BLACKLIST_NEWS
}));
public static final String BLACKLIST_TYPES_STRING="proxy,crawler,dht,search,surftips";
public static final String BLACKLIST_TYPES_STRING="proxy,crawler,dht,search,surftips,news";
protected File blacklistRootPath = null;
protected HashMap cachedUrlHashs = null;

@ -11,6 +11,7 @@ public interface plasmaURLPattern {
public static final String BLACKLIST_PROXY = "proxy";
public static final String BLACKLIST_SEARCH = "search";
public static final String BLACKLIST_SURFTIPS = "surftips";
public static final String BLACKLIST_NEWS = "news";
public static final class blacklistFile {

@ -47,6 +47,11 @@ package de.anomic.yacy;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
public class yacyNewsPool {
@ -300,6 +305,19 @@ public class yacyNewsPool {
if (record.category() == null) return;
if (!(categories.contains(record.category()))) return;
if (record.created().getTime() == 0) return;
Map attributes = record.attributes();
if (attributes.containsKey("url")){
if(plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_NEWS, new URL((String) attributes.get("url")))){
System.out.println("DEBUG: ignored news-entry url blacklisted: " + attributes.get("url"));
return;
}
}
if (attributes.containsKey("startURL")){
if(plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_NEWS, new URL((String) attributes.get("startURL")))){
System.out.println("DEBUG: ignored news-entry url blacklisted: " + attributes.get("startURL"));
return;
}
}
// double-check with old news
if (newsDB.get(record.id()) != null) return;

@ -257,6 +257,7 @@ crawler.BlackLists=url.default.black
dht.BlackLists=url.default.black
search.BlackLists=url.default.black
surftips.BlackLists=url.default.black
news.BlackLists=url.default.black
proxyCookieBlackList=cookie.default.black

Loading…
Cancel
Save