From d2e8e762187ffda937cf4bd07e04d144cefe99b3 Mon Sep 17 00:00:00 2001
From: theli
Date: Sat, 12 Aug 2006 02:42:10 +0000
Subject: [PATCH] *) now it's possible to configure the yacy blacklist
separately for dht, search, proxy and crawler. See:
http://www.yacy-forum.de/viewtopic.php?t=2541
http://www.yacy-forum.de/viewtopic.php?p=24516
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2389 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
htroot/Blacklist_p.html | 255 ++++++------
htroot/Blacklist_p.java | 374 +++++++++++-------
htroot/IndexControl_p.java | 3 +-
htroot/sharedBlacklist_p.java | 13 +-
htroot/yacy/transferRWI.java | 3 +-
source/de/anomic/data/listManager.java | 38 +-
source/de/anomic/http/httpdProxyHandler.java | 7 +-
source/de/anomic/plasma/plasmaCrawlLURL.java | 2 +-
.../de/anomic/plasma/plasmaCrawlStacker.java | 2 +-
.../de/anomic/plasma/plasmaCrawlWorker.java | 2 +-
.../de/anomic/plasma/plasmaSwitchboard.java | 13 +-
source/de/anomic/plasma/plasmaURLPattern.java | 140 +++++--
source/de/anomic/plasma/plasmaWordIndex.java | 2 +-
source/de/anomic/yacy/yacyClient.java | 2 +-
source/migration.java | 10 +
yacy.init | 11 +-
16 files changed, 549 insertions(+), 328 deletions(-)
diff --git a/htroot/Blacklist_p.html b/htroot/Blacklist_p.html
index 5c0690d3b..3ceeb3d0b 100644
--- a/htroot/Blacklist_p.html
+++ b/htroot/Blacklist_p.html
@@ -14,151 +14,172 @@ You may also provide your blacklist to other peers by sharing them; in return yo
collect blacklist entries from other peers.
-
-#(status)#
+
+#(status)#
::
#[item]# was removed from blacklist
::
diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java
index 65a06f1b1..ab026a6c1 100644
--- a/htroot/Blacklist_p.java
+++ b/htroot/Blacklist_p.java
@@ -49,14 +49,15 @@
// if the shell's current path is HTROOT
import java.io.File;
+import java.io.FileWriter;
import java.io.IOException;
+import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import de.anomic.data.listManager;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@@ -64,196 +65,273 @@ import de.anomic.yacy.yacySeed;
public class Blacklist_p {
private final static String BLACKLIST = "blackLists_";
- private final static String BLACKLIST_ALL = "proxyBlackLists";
- private final static String BLACKLIST_ACTIVE = "proxyBlackListsActive";
- private final static String BLACKLIST_SHARED = "proxyBlackListsShared";
+ private final static String BLACKLIST_SHARED = "BlackLists.Shared";
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
- // return variable that accumulates replacements
+
+ // initialize the list manager
listManager.switchboard = (plasmaSwitchboard) env;
listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS"));
- final serverObjects prop = new serverObjects();
- String line;
-// String HTMLout = "";
-
- String removeItem = "removeme";
- int numItems = 0;
- int i; // need below
-
- String[] filenames = listManager.getListslistArray(BLACKLIST_ALL);
- String filename = "";
-
+
+ // getting the list of supported blacklist types
+ String supportedBlacklistTypesStr = env.getConfig("BlackLists.types", "");
+ String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
+
+ String blacklistToUse = null;
+ serverObjects prop = new serverObjects();
+
+ // do all post operations
if (post != null) {
- if (post.containsKey("blackLists")) { // Blacklist selected
- filename = (String)post.get("blackLists");
- } else if (post.containsKey("filename")) {
- filename = (String)post.get("filename");
- } else if (filenames.length > 0){ // first BlackList
- filename = filenames[0];
-// } else { //No BlackList
-// System.out.println("DEBUG: No Blacklist found");
+
+ if (post.containsKey("selectList")) {
+ blacklistToUse = (String)post.get("selectedListName");
}
- prop.put("status", 0); // nothing
+ if (post.containsKey("createNewList")) {
+ /* ===========================================================
+ * Creation of a new blacklist
+ * =========================================================== */
+
+ blacklistToUse = (String)post.get("newListName");
+ if (!blacklistToUse.endsWith(".black")) blacklistToUse += ".black";
- // del list
- if (post.containsKey("dellistbutton")) {
- final File BlackListFile = new File(listManager.listsPath, filename);
+ try {
+ final File newFile = new File(listManager.listsPath, blacklistToUse);
+ newFile.createNewFile();
+
+ // share the newly created blacklist
+ listManager.addListToListslist(BLACKLIST_SHARED, blacklistToUse);
+
+ // activate it for all known blacklist types
+ for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+ listManager.addListToListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse);
+ }
+ } catch (IOException e) {/* */}
+
+ } else if (post.containsKey("deleteList")) {
+ /* ===========================================================
+ * Delete a blacklist
+ * =========================================================== */
+
+ blacklistToUse = (String)post.get("selectedListName");
+
+ File BlackListFile = new File(listManager.listsPath, blacklistToUse);
BlackListFile.delete();
- // remove from all BlackLists Lists
- listManager.removeListFromListslist(BLACKLIST_ALL, filename);
- listManager.removeListFromListslist(BLACKLIST_ACTIVE, filename);
- listManager.removeListFromListslist(BLACKLIST_SHARED, filename);
-
+ for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+ listManager.removeListFromListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse);
+ }
+
+ // remove it from the shared list
+ listManager.removeListFromListslist(BLACKLIST_SHARED, blacklistToUse);
+ blacklistToUse = null;
+
// reload Blacklists
listManager.reloadBlacklists();
- filenames = listManager.getListslistArray(BLACKLIST_ALL);
- if (filenames.length > 0) {
- filename = filenames[0];
- }
- // new list
- } else if (post.containsKey("newlistbutton")) {
- String newList = (String)post.get("newlist");
- if (!newList.endsWith(".black")) {
- newList += ".black";
- }
- filename = newList; //to select it in the returnes Document
- try {
- final File newFile = new File(listManager.listsPath, newList);
- newFile.createNewFile();
- listManager.addListToListslist(BLACKLIST_ALL, newList);
- listManager.addListToListslist(BLACKLIST_ACTIVE, newList);
- listManager.addListToListslist(BLACKLIST_SHARED, newList);
- } catch (IOException e) {}
+ } else if (post.containsKey("activateList")) {
+ /* ===========================================================
+ * Activate/Deactivate a blacklist
+ * =========================================================== */
+
+ blacklistToUse = (String)post.get("selectedListName");
+
+ for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+ if (post.containsKey("activateList4" + supportedBlacklistTypes[blTypes])) {
+ listManager.addListToListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse);
+ } else {
+ listManager.removeListFromListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse);
+ }
+ }
- } else if (post.containsKey("activatelistbutton")) {
- if( listManager.ListInListslist(BLACKLIST_ACTIVE, filename) ) {
- listManager.removeListFromListslist(BLACKLIST_ACTIVE, filename);
- } else { // inactive list -> enable
- listManager.addListToListslist(BLACKLIST_ACTIVE, filename);
- }
- listManager.reloadBlacklists();
+ listManager.reloadBlacklists();
+
+ } else if (post.containsKey("shareList")) {
- } else if (post.containsKey("sharelistbutton")) {
- if (listManager.ListInListslist(BLACKLIST_SHARED, filename)) {
+ /* ===========================================================
+ * Share a blacklist
+ * =========================================================== */
+
+ blacklistToUse = (String)post.get("selectedListName");
+
+ if (listManager.ListInListslist(BLACKLIST_SHARED, blacklistToUse)) {
// Remove from shared BlackLists
- listManager.removeListFromListslist(BLACKLIST_SHARED, filename);
+ listManager.removeListFromListslist(BLACKLIST_SHARED, blacklistToUse);
} else { // inactive list -> enable
- listManager.addListToListslist(BLACKLIST_SHARED, filename);
+ listManager.addListToListslist(BLACKLIST_SHARED, blacklistToUse);
+ }
+ } else if (post.containsKey("deleteBlacklistEntry")) {
+
+ /* ===========================================================
+ * Delete a blacklist entry
+ * =========================================================== */
+
+ // get the current selected blacklist name
+ blacklistToUse = (String)post.get("currentBlacklist");
+
+ // get the entry that should be deleted
+ String oldEntry = (String)post.get("selectedEntry");
+
+ // load blacklist data from file
+ ArrayList list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse));
+
+ // delete the old entry from file
+ if (list != null) {
+ for (int i=0; i < list.size(); i++) {
+ if (((String)list.get(i)).equals(oldEntry)) {
+ list.remove(i);
+ break;
+ }
+ }
+ listManager.writeList(new File(listManager.listsPath, blacklistToUse), (String[])list.toArray(new String[list.size()]));
+ }
+
+                // remove the entry from the running blacklist engine; its per-type
+                // maps are keyed by host, so only the host part of the entry is needed
+                int pos = oldEntry.indexOf("/");
+                if (pos < 0) {
+                    // entry has no path part: the whole entry is the host
+                    pos = oldEntry.length();
+                }
+                for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+                    if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) {
+                        plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos));
+                    }
+                }
+
+ } else if (post.containsKey("addBlacklistEntry")) {
+
+ /* ===========================================================
+ * Add a new blacklist entry
+ * =========================================================== */
+
+ blacklistToUse = (String)post.get("currentBlacklist");
+
+ String newEntry = (String)post.get("newEntry");
+
+ // TODO: ignore empty entries
+
+ if (newEntry.startsWith("http://") ){
+ newEntry = newEntry.substring(7);
+ }
+
+ int pos = newEntry.indexOf("/");
+ if (pos < 0) {
+ // add default empty path pattern
+ pos = newEntry.length();
+ newEntry = newEntry + "/.*";
+ }
+
+ // append the line to the file
+ PrintWriter pw = null;
+ try {
+ pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklistToUse), true));
+ pw.println(newEntry);
+ pw.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ } finally {
+ if (pw != null) try { pw.close(); } catch (Exception e){ /* */}
}
- } // List Management End
- // remove a Item?
- if (post.containsKey("delbutton") &&
- post.containsKey("Itemlist") &&
- !((String)post.get("Itemlist")).equals("") ) {
- removeItem = (String)post.get("Itemlist");
+ // add to blacklist
+ for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+ if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) {
+ plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newEntry.substring(0, pos), newEntry.substring(pos + 1));
+ }
+ }
}
- } // post != null
+
+ }
+
+ // loading all blacklist files located in the directory
+ String[] dirlist = listManager.getDirListing(listManager.listsPath);
+
+ // if no blacklist has been selected so far, use the first file
+ if (blacklistToUse == null && dirlist != null && dirlist.length > 0) {
+ blacklistToUse = dirlist[0];
+ }
+
- // Read the List
- final ArrayList list = listManager.getListArray(new File(listManager.listsPath, filename));
- final StringBuffer out = new StringBuffer(list.size() * 64);
+ // Read the blacklist items from file
+ final ArrayList list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse));
+
+ // sort them
String[] sortedlist = new String[list.size()];
Arrays.sort(list.toArray(sortedlist));
+
+ // display them
+ int entryCount = 0;
for (int j=0;j 0) { // no nullpointer error
final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null);
- i = 0;
+ int peerCount = 0;
while (e.hasMoreElements()) {
seed = (yacySeed) e.nextElement();
if (seed != null) {
final String Hash = seed.hash;
final String Name = seed.get(yacySeed.NAME, "nameless");
- prop.put("otherHosts_" + i + "_hash", Hash);
- prop.put("otherHosts_" + i + "_name", Name);
- i++;
+ prop.put("otherHosts_" + peerCount + "_hash", Hash);
+ prop.put("otherHosts_" + peerCount + "_name", Name);
+ peerCount++;
}
}
- prop.put("otherHosts", i);
-// } else {
-// System.out.println("BlackList_p: yacy seed not loaded!"); // DEBUG:
+ prop.put("otherHosts", peerCount);
}
-
+
+
// List BlackLists
- final String[] BlackLists = listManager.getListslistArray(BLACKLIST_ALL);
- for (i = 0; i <= BlackLists.length - 1; i++) {
- prop.put(BLACKLIST + i + "_name", BlackLists[i]);
- prop.put(BLACKLIST + i + "_active", 0);
- prop.put(BLACKLIST + i + "_shared", 0);
- prop.put(BLACKLIST + i + "_selected", 0);
- if (BlackLists[i].equals(filename)) { //current List
- prop.put(BLACKLIST + i + "_selected", 1);
- }
- if (listManager.ListInListslist(BLACKLIST_ACTIVE, BlackLists[i])) {
- prop.put(BLACKLIST + i + "_active", 1);
- }
- if (listManager.ListInListslist(BLACKLIST_SHARED, BlackLists[i])) {
- prop.put(BLACKLIST + i + "_shared", 1);
+ int blacklistCount = 0;
+ if (dirlist != null) {
+ for (int i = 0; i <= dirlist.length - 1; i++) {
+ prop.put(BLACKLIST + blacklistCount + "_name", dirlist[i]);
+ prop.put(BLACKLIST + blacklistCount + "_shared", 0);
+
+                if (dirlist[i].equals(blacklistToUse)) { //current List
+                    prop.put(BLACKLIST + blacklistCount + "_selected", 1);
+                    for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+                        prop.put("currentActiveFor_" + blTypes + "_blTypeName",supportedBlacklistTypes[blTypes]);
+                        prop.put("currentActiveFor_" + blTypes + "_checked",
+                                listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])?0:1);
+                    }
+                    prop.put("currentActiveFor",supportedBlacklistTypes.length);
+                } else {
+                    // every other list is explicitly marked as not selected
+                    prop.put(BLACKLIST + blacklistCount + "_selected", 0);
+                }
+
+                // "_shared" was preset to 0 above; flip it only for shared lists
+                if (listManager.ListInListslist(BLACKLIST_SHARED, dirlist[i])) {
+                    prop.put(BLACKLIST + blacklistCount + "_shared", 1);
+                }
+
+ int activeCount = 0;
+ for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+ if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])) {
+ prop.put(BLACKLIST + blacklistCount + "_active_" + activeCount + "_blTypeName",supportedBlacklistTypes[blTypes]);
+ activeCount++;
+ }
+ }
+ prop.put(BLACKLIST + blacklistCount + "_active",activeCount);
+ blacklistCount++;
}
}
- prop.put("blackLists", i);
- prop.put("filename", filename);
+ prop.put("blackLists", blacklistCount);
+
+ prop.put("currentBlacklist", blacklistToUse);
return prop;
}
diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 574c0565f..e0266ec8e 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -63,6 +63,7 @@ import de.anomic.index.indexEntryAttribute;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURLPattern;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -462,7 +463,7 @@ public class IndexControl_p {
} else {
url = new URL(us);
- if (plasmaSwitchboard.urlBlacklist.isListed(url)) {
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, url)) {
result.append("");
} else {
result.append("");
diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java
index 904da1b5f..951b2eb88 100644
--- a/htroot/sharedBlacklist_p.java
+++ b/htroot/sharedBlacklist_p.java
@@ -61,6 +61,7 @@ import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
+import de.anomic.data.listManager;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaSwitchboard;
@@ -250,8 +251,16 @@ public class sharedBlacklist_p {
out += newItem+"\n";
prop.put("status_list_"+count+"_entry", newItem);
count++;
- if (plasmaSwitchboard.urlBlacklist != null)
- plasmaSwitchboard.urlBlacklist.add(newItem.substring(0, pos), newItem.substring(pos + 1));
+ if (plasmaSwitchboard.urlBlacklist != null) {
+ String supportedBlacklistTypesStr = env.getConfig("BlackLists.types", "");
+ String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
+
+ for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
+ if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",filename)) {
+ plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newItem.substring(0, pos), newItem.substring(pos + 1));
+ }
+ }
+ }
//write the list
try{
diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java
index cc3f5f642..5da6f2431 100644
--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@@ -54,6 +54,7 @@ import de.anomic.http.httpHeader;
import de.anomic.index.indexEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -163,7 +164,7 @@ public final class transferRWI {
wordhashes[received] = wordHash;
iEntry = new indexURLEntry(estring.substring(p));
urlHash = iEntry.urlHash();
- if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(urlHash))) {
+ if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) {
//int deleted = sb.wordIndex.tryRemoveURLs(urlHash);
yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted 1 URL entries from RWIs");
blocked++;
diff --git a/source/de/anomic/data/listManager.java b/source/de/anomic/data/listManager.java
index 9e1e9b6fc..9516de6e3 100644
--- a/source/de/anomic/data/listManager.java
+++ b/source/de/anomic/data/listManager.java
@@ -155,11 +155,13 @@ public class listManager {
// overloaded function to write an array
public static boolean writeList(File listFile, String[] list){
- String out = "";
- for(int i=0;i <= list.length; i++){
- out += list[i] + serverCore.crlfString;
+ StringBuffer out = new StringBuffer();
+ for(int i=0;i < list.length; i++){
+ out
+ .append(list[i])
+ .append(serverCore.crlfString);
}
- return writeList(listFile, out); //(File, String)
+ return writeList(listFile, out.toString()); //(File, String)
}
public static String getListString(String filename, boolean withcomments){
@@ -194,6 +196,12 @@ public class listManager {
String[] fileListString;
File[] fileList;
final File dir = new File(dirname);
+ return getDirListing(dir);
+ }
+
+ public static String[] getDirListing(File dir){
+ String[] fileListString;
+ File[] fileList;
if (dir != null ) {
if (!dir.exists()) {
@@ -207,7 +215,7 @@ public class listManager {
return fileListString;
}
return null;
- }
+ }
public static ArrayList getDirsRecursive(File dir, String notdir){
return getDirsRecursive(dir, notdir, true);
@@ -321,11 +329,21 @@ public class listManager {
// load all active Blacklists in the Proxy
public static void reloadBlacklists(){
- final String f = switchboard.getConfig("proxyBlackListsActive", "");
- de.anomic.plasma.plasmaSwitchboard.urlBlacklist.clear();
- if (f != "") {
- de.anomic.plasma.plasmaSwitchboard.urlBlacklist.loadList(f, "/");
- }
+ String supportedBlacklistTypesStr = switchboard.getConfig("BlackLists.types", "");
+ String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(",");
+
+ ArrayList blacklistFiles = new ArrayList(supportedBlacklistTypes.length);
+ for (int i=0; i < supportedBlacklistTypes.length; i++) {
+ String[] blacklistFile = new String[]{
+ supportedBlacklistTypes[i],
+ switchboard.getConfig(supportedBlacklistTypes[i] + ".BlackLists", "")
+ };
+ blacklistFiles.add(blacklistFile);
+ }
+
+ de.anomic.plasma.plasmaSwitchboard.urlBlacklist.clear();
+ de.anomic.plasma.plasmaSwitchboard.urlBlacklist.loadList((String[][])blacklistFiles.toArray(new String[blacklistFiles.size()][]), "/");
+
// switchboard.urlBlacklist.clear();
// if (f != "") switchboard.urlBlacklist.loadLists("black", f, "/");
}
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index 4352f1091..76d9e8d71 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -96,6 +96,7 @@ import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverObjects;
@@ -389,7 +390,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// respond a 404 for all AGIS ("all you get is shit") servers
String hostlow = host.toLowerCase();
if (args != null) { path = path + "?" + args; }
- if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_PROXY, hostlow, path)) {
httpd.sendRespondError(conProp,respond,4,403,null,
"URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null);
this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'");
@@ -915,7 +916,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// re-calc the url path
String remotePath = (args == null) ? path : (path + "?" + args);
- if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, remotePath)) {
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_PROXY, hostlow, remotePath)) {
httpd.sendRespondError(conProp,respond,4,403,null,
"URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null);
this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'");
@@ -1128,7 +1129,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// blacklist idea inspired by [AS]:
// respond a 404 for all AGIS ("all you get is shit") servers
final String hostlow = host.toLowerCase();
- if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_PROXY, hostlow, path)) {
httpd.sendRespondError(conProp,clientOut,4,403,null,
"URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null);
this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'");
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index 1d59a4c5d..60ee71136 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -898,7 +898,7 @@ public final class plasmaCrawlLURL extends indexURL {
plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next();
totalSearchedUrls++;
- if (plasmaSwitchboard.urlBlacklist.isListed(entry.url())==true) {
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER,entry.url())==true) {
lastBlacklistedUrl = entry.url().toString();
lastBlacklistedHash = entry.hash();
serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + entry.url());
diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java
index 4cdaef4e9..80c2a2879 100644
--- a/source/de/anomic/plasma/plasmaCrawlStacker.java
+++ b/source/de/anomic/plasma/plasmaCrawlStacker.java
@@ -283,7 +283,7 @@ public final class plasmaCrawlStacker {
}
// check blacklist
- if (plasmaSwitchboard.urlBlacklist.isListed(nexturl)) {
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER,nexturl)) {
reason = plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST;
this.log.logFine("URL '" + nexturlString + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java
index 89c9833dd..f615d3786 100644
--- a/source/de/anomic/plasma/plasmaCrawlWorker.java
+++ b/source/de/anomic/plasma/plasmaCrawlWorker.java
@@ -311,7 +311,7 @@ public final class plasmaCrawlWorker extends Thread {
// check if url is in blacklist
String hostlow = host.toLowerCase();
- if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) {
+ if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, hostlow, path)) {
log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST, new bitfield(indexURL.urlFlagLength));
return null;
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index d477426ca..d63c2d728 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -125,6 +125,7 @@ import java.util.logging.Level;
import de.anomic.data.blogBoard;
import de.anomic.data.bookmarksDB;
+import de.anomic.data.listManager;
import de.anomic.data.messageBoard;
import de.anomic.data.wikiBoard;
import de.anomic.data.userDB;
@@ -302,14 +303,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// load the black-list / inspired by [AS]
File ulrBlackListFile = new File(getRootPath(), getConfig("listsPath", "DATA/LISTS"));
- urlBlacklist = new plasmaURLPattern(ulrBlackListFile);
- String f = getConfig("proxyBlackListsActive", null);
- if (f != null) {
- urlBlacklist.loadList(f, "/");
- this.log.logConfig("loaded black-list from file " + ulrBlackListFile.getName() + ", " +
- urlBlacklist.size() + " entries, " +
- ppRamString(ulrBlackListFile.length()/1024));
- }
+ urlBlacklist = new plasmaURLPattern(ulrBlackListFile);
+ listManager.switchboard = this;
+ listManager.listsPath = ulrBlackListFile;
+ listManager.reloadBlacklists();
// load badwords (to filter the topwords)
if (badwords == null) {
diff --git a/source/de/anomic/plasma/plasmaURLPattern.java b/source/de/anomic/plasma/plasmaURLPattern.java
index 7c129e32f..55fde4548 100644
--- a/source/de/anomic/plasma/plasmaURLPattern.java
+++ b/source/de/anomic/plasma/plasmaURLPattern.java
@@ -43,78 +43,158 @@ package de.anomic.plasma;
import java.io.File;
import de.anomic.net.URL;
+
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Set;
import de.anomic.kelondro.kelondroMSetTools;
public class plasmaURLPattern {
+
+ public static final String BLACKLIST_CRAWLER = "crawler";
+ public static final String BLACKLIST_PROXY = "proxy";
+ public static final String BLACKLIST_DHT = "dht";
+ public static final String BLACKLIST_SEARCH = "search";
+
+ public static final HashSet BLACKLIST_TYPES = new HashSet(Arrays.asList(new String[]{
+ BLACKLIST_CRAWLER,
+ BLACKLIST_PROXY,
+ BLACKLIST_DHT,
+ BLACKLIST_SEARCH
+ }));
+
- private Set cachedUrlHashs = Collections.synchronizedSet(new HashSet());
- private File rootPath = null;
+ private File blacklistRootPath = null;
+ private HashMap cachedUrlHashs = null;
private HashMap hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
public plasmaURLPattern(File rootPath) {
super();
- this.rootPath = rootPath;
+ this.blacklistRootPath = rootPath;
+
+ // prepare the data structure
this.hostpaths = new HashMap();
+ this.cachedUrlHashs = new HashMap();
+
+ Iterator iter = BLACKLIST_TYPES.iterator();
+ while (iter.hasNext()) {
+ String blacklistType = (String) iter.next();
+ this.hostpaths.put(blacklistType, new HashMap());
+ this.cachedUrlHashs.put(blacklistType, Collections.synchronizedSet(new HashSet()));
+ }
}
public void clear() {
- this.hostpaths = new HashMap();
+ Iterator iter = this.hostpaths.keySet().iterator();
+ while (iter.hasNext()) {
+ HashMap blacklistMap = (HashMap) this.hostpaths.get(iter.next());
+ blacklistMap.clear();
+ }
}
public int size() {
- return hostpaths.size();
+ int size = 0;
+ Iterator iter = this.hostpaths.keySet().iterator();
+ while (iter.hasNext()) {
+ HashMap blacklistMap = (HashMap) this.hostpaths.get(iter.next());
+ size += blacklistMap.size();
+ }
+ return size;
}
-
- public void loadList(String filenames, String sep) {
- // File listsPath = new File(getRootPath(), getConfig("listsPath", "DATA/LISTS"));
- final String[] filenamesarray = filenames.split(",");
+
+ public void loadList(String blacklistType, String filenames, String sep) {
+ if (blacklistType == null) throw new IllegalArgumentException();
+ if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type.");
+
+ HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType);
+ String[] filenamesarray = filenames.split(",");
if( filenamesarray.length > 0) {
for (int i = 0; i < filenamesarray.length; i++) {
- hostpaths.putAll(kelondroMSetTools.loadMap(new File(rootPath, filenamesarray[i]).toString(), sep));
+ blacklistMap.putAll(kelondroMSetTools.loadMap(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep));
}
+ }
+ }
+
+ public void loadList(String[][] filenames, String sep) {
+ for (int j = 0; j < filenames.length; j++) {
+ String[] nextFile = filenames[j];
+ String blacklistType = nextFile[0];
+ String fileName = nextFile[1];
+ this.loadList(blacklistType, fileName, sep);
}
}
- public void remove(String host) {
- hostpaths.remove(host);
+ public void remove(String blacklistType, String host) {
+ if (blacklistType == null) throw new IllegalArgumentException();
+ if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type.");
+
+ HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType);
+ blacklistMap.remove(host);
}
- public void add(String host, String path) {
+ public void add(String blacklistType, String host, String path) {
+ if (host == null) throw new NullPointerException();
+ if (path == null) throw new NullPointerException();
+ if (blacklistType == null) throw new IllegalArgumentException();
+ if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type.");
+
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
- hostpaths.put(host.toLowerCase(), path);
+
+ HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType);
+ blacklistMap.put(host.toLowerCase(), path);
}
public int blacklistCacheSize() {
- return cachedUrlHashs.size();
+ // total number of cached url hashes, summed over the set stored under every key of cachedUrlHashs
+ int total = 0;
+ for (Iterator types = this.cachedUrlHashs.keySet().iterator(); types.hasNext(); ) {
+ Set hashes = (Set) this.cachedUrlHashs.get(types.next());
+ total += hashes.size();
+ }
+ return total;
}
- public boolean hashInBlacklistedCache(String urlHash) {
- return cachedUrlHashs.contains(urlHash);
+ public boolean hashInBlacklistedCache(String blacklistType, String urlHash) {
+ if (blacklistType == null) throw new IllegalArgumentException("Blacklist type must not be null.");
+ if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type: " + blacklistType);
+ // the hash is looked up only in the url-hash set kept for the given blacklist type
+ Set urlHashCache = (Set) this.cachedUrlHashs.get(blacklistType);
+ return urlHashCache.contains(urlHash);
}
- public boolean isListed(String urlHash, URL url) {
- if (!cachedUrlHashs.contains(urlHash)) {
- boolean temp = isListed(url.getHost().toLowerCase(), url.getFile());
- if (temp)
- {
- cachedUrlHashs.add(urlHash);
- }
+ public boolean isListed(String blacklistType, String urlHash, URL url) {
+ if (blacklistType == null) throw new IllegalArgumentException("Blacklist type must not be null.");
+ if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown blacklist type: " + blacklistType);
+ // positive results are remembered per blacklist type under the url hash; a miss is evaluated again on every call
+ Set urlHashCache = (Set) this.cachedUrlHashs.get(blacklistType);
+ if (!urlHashCache.contains(urlHash)) {
+ boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
+ if (temp) {
+ urlHashCache.add(urlHash);
+ }
return temp;
}
return true;
}
- public boolean isListed(URL url) {
- return isListed(url.getHost().toLowerCase(), url.getFile());
+ public boolean isListed(String blacklistType, URL url) { // lower-cases the host of 'url' and delegates to the (type, host, path) overload
+ return isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
}
- public boolean isListed(String hostlow, String path) {
+ public boolean isListed(String blacklistType, String hostlow, String path) {
+ if (hostlow == null) throw new NullPointerException();
+ if (path == null) throw new NullPointerException();
+ if (blacklistType == null) throw new IllegalArgumentException();
+ if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type.");
+
+ // getting the proper blacklist
+ HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType);
+
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
String pp = ""; // path-pattern
@@ -122,19 +202,19 @@ public class plasmaURLPattern {
// [TL] While "." are found within the string
int index = 0;
while ((index = hostlow.indexOf('.', index + 1)) != -1) {
- if ((pp = (String) hostpaths.get(hostlow.substring(0, index + 1) + "*")) != null) {
+ if ((pp = (String) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) {
return ((pp.equals("*")) || (path.matches(pp)));
}
}
index = hostlow.length();
while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) {
- if ((pp = (String) hostpaths.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
+ if ((pp = (String) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
return ((pp.equals("*")) || (path.matches(pp)));
}
}
// try to match without wildcard in domain
- return (((pp = (String) hostpaths.get(hostlow)) != null) &&
+ return (((pp = (String) blacklistMap.get(hostlow)) != null) &&
((pp.equals("*")) || (path.matches(pp))));
}
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 11cb768db..0c14160b1 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -708,7 +708,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
// "+entry.getUrlHash());
try {
url = lurl.getEntry(entry.urlHash(), null).url();
- if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(url) == true)) {
+ if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
} catch (IOException e) {
diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java
index 8d0864f03..79cf58d28 100644
--- a/source/de/anomic/yacy/yacyClient.java
+++ b/source/de/anomic/yacy/yacyClient.java
@@ -478,7 +478,7 @@ public final class yacyClient {
for (int n = 0; n < results; n++) {
// get one single search result
urlEntry = urlManager.newEntry((String) result.get("resource" + n), true);
- if ((urlEntry == null) || (blacklist.isListed(urlEntry.url()))) { continue; } // block with backlist
+ if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist
urlEntry.store();
int urlLength = urlEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;
diff --git a/source/migration.java b/source/migration.java
index a36690dc3..9e763c8f8 100644
--- a/source/migration.java
+++ b/source/migration.java
@@ -245,6 +245,16 @@ public class migration {
sb.setConfig("portForwarding.sch.HostUser", sb.getConfig("portForwardingHostUser",""));
sb.setConfig("portForwarding.sch.HostPwd", sb.getConfig("portForwardingHostPwd",""));
}
+
+ // migration for blacklists
+ if ((value = sb.getConfig("proxyBlackLists","")).length() > 0) {
+ sb.setConfig("proxy.BlackLists", value);
+ sb.setConfig("crawler.BlackLists", value);
+ sb.setConfig("dht.BlackLists", value);
+ sb.setConfig("search.BlackLists", value);
+
+ sb.setConfig("BlackLists.Shared",sb.getConfig("proxyBlackListsShared",""));
+ }
}
}
diff --git a/yacy.init b/yacy.init
index 56a72273b..74fdec289 100644
--- a/yacy.init
+++ b/yacy.init
@@ -205,9 +205,14 @@ proxyYellowList=yacy.yellow
# the black-list; URLs appearing in this list will not be loaded;
# instead always a 404 is returned
# all these files will be placed in the listsPath
-proxyBlackLists=url.default.black
-proxyBlackListsActive=url.default.black
-proxyBlackListsShared=url.default.black
+BlackLists.types=proxy,crawler,dht,search
+BlackLists.Shared=url.default.black
+
+proxy.BlackLists=url.default.black
+crawler.BlackLists=url.default.black
+dht.BlackLists=url.default.black
+search.BlackLists=url.default.black
+
proxyCookieBlackList=cookie.default.black
proxyCookieWhiteList=cookie.default.black