From d2e8e762187ffda937cf4bd07e04d144cefe99b3 Mon Sep 17 00:00:00 2001 From: theli Date: Sat, 12 Aug 2006 02:42:10 +0000 Subject: [PATCH] *) now it's possible to configure the yacy blacklist separately for dht, search, proxy, crawler See: http://www.yacy-forum.de/viewtopic.php?t=2541 http://www.yacy-forum.de/viewtopic.php?p=24516 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2389 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Blacklist_p.html | 255 ++++++------ htroot/Blacklist_p.java | 374 +++++++++++------- htroot/IndexControl_p.java | 3 +- htroot/sharedBlacklist_p.java | 13 +- htroot/yacy/transferRWI.java | 3 +- source/de/anomic/data/listManager.java | 38 +- source/de/anomic/http/httpdProxyHandler.java | 7 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 2 +- .../de/anomic/plasma/plasmaCrawlStacker.java | 2 +- .../de/anomic/plasma/plasmaCrawlWorker.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 13 +- source/de/anomic/plasma/plasmaURLPattern.java | 140 +++++-- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- source/de/anomic/yacy/yacyClient.java | 2 +- source/migration.java | 10 + yacy.init | 11 +- 16 files changed, 549 insertions(+), 328 deletions(-) diff --git a/htroot/Blacklist_p.html b/htroot/Blacklist_p.html index 5c0690d3b..3ceeb3d0b 100644 --- a/htroot/Blacklist_p.html +++ b/htroot/Blacklist_p.html @@ -14,151 +14,172 @@ You may also provide your blacklist to other peers by sharing them; in return yo collect blacklist entries from other peers.

- - - - - - + + + + + + + + + + + + - - - - - + + + + + + + + + - - - - - + + + + + + + + + - -
-   -
-
- + + + + + + + + - - - - - - + + -
 
- - - + + + - - - - - + +
- Edit list: - -
-
-
-
-
-
-
+ Edit list: + +
+ +
+
+ Activate this list for ... + + #{currentActiveFor}# + + + + #{/currentActiveFor}# +
#[blTypeName]#
+ +
+ +
+
+
- New list: - -
-
-
+ New list: + +
+
+
- - -
-

Active list: #[filename]#

-
+
- - + + + + - +
  • domain/fullpath
  • +
  • domain/.* or regexpr
  • +
  • *.domain/.* or regexpr
  • +
  • domain.*/.* or regexpr
  • +
  • *.sub.domain/.* or regexpr
  • +
  • sub.domain.*/.* or regexpr
  • + + + - + +
    + + +
    +

    Active list: #[currentBlacklist]#

    +
    +
    + + + - - - - - - + + + + + + - - - - -
    These are the domain name / path patterns in this blacklist:
    You can select them here for deletion -
    - - - -

    - -

    +
    + + +

    + +

    Enter new domain name / path pattern in the form: -
    -

    -

    - -
    - -
      - Import blacklist items from other YaCy peers:
    + +

    +

    + +
    + +
     
    Import blacklist items from other YaCy peers:
    - + Host: -

    +

    -
    - Import blacklist items from URL:
    +
    Import blacklist items from URL:
    - + URL:

    -

    -
    -

    Import blacklist items from file:

    -
    - +
    +

    Import blacklist items from file:

    +
    + File:

    - +

    -
    + + +

    -#(status)# + +#(status)# :: #[item]# was removed from blacklist :: diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index 65a06f1b1..ab026a6c1 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -49,14 +49,15 @@ // if the shell's current path is HTROOT import java.io.File; +import java.io.FileWriter; import java.io.IOException; +import java.io.PrintWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.Enumeration; import de.anomic.data.listManager; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -64,196 +65,273 @@ import de.anomic.yacy.yacySeed; public class Blacklist_p { private final static String BLACKLIST = "blackLists_"; - private final static String BLACKLIST_ALL = "proxyBlackLists"; - private final static String BLACKLIST_ACTIVE = "proxyBlackListsActive"; - private final static String BLACKLIST_SHARED = "proxyBlackListsShared"; + private final static String BLACKLIST_SHARED = "BlackLists.Shared"; public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { - // return variable that accumulates replacements + + // initialize the list manager listManager.switchboard = (plasmaSwitchboard) env; listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS")); - final serverObjects prop = new serverObjects(); - String line; -// String HTMLout = ""; - - String removeItem = "removeme"; - int numItems = 0; - int i; // need below - - String[] filenames = listManager.getListslistArray(BLACKLIST_ALL); - String filename = ""; - + + // getting the list of supported blacklist types + String supportedBlacklistTypesStr = env.getConfig("BlackLists.types", ""); + String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); + + String blacklistToUse = null; + serverObjects prop = new serverObjects(); + + // do all post operations if (post != null) { - if (post.containsKey("blackLists")) { // Blacklist selected - filename = (String)post.get("blackLists"); - } else if (post.containsKey("filename")) { - filename = (String)post.get("filename"); - } else if (filenames.length > 0){ // first BlackList - filename = filenames[0]; -// } else { //No BlackList -// System.out.println("DEBUG: No Blacklist found"); + + if (post.containsKey("selectList")) { + blacklistToUse = (String)post.get("selectedListName"); } - prop.put("status", 0); // nothing + if (post.containsKey("createNewList")) { + /* =========================================================== + * Creation of a new blacklist + * =========================================================== */ + + blacklistToUse = (String)post.get("newListName"); + if (!blacklistToUse.endsWith(".black")) blacklistToUse += ".black"; - // del list - if (post.containsKey("dellistbutton")) { - final File BlackListFile = new File(listManager.listsPath, filename); + try { + final File newFile = new File(listManager.listsPath, blacklistToUse); + newFile.createNewFile(); + + // share the newly created blacklist + listManager.addListToListslist(BLACKLIST_SHARED, blacklistToUse); + + // activate it for all known blacklist types + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + listManager.addListToListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + } + } catch (IOException e) {/* */} + + } else if (post.containsKey("deleteList")) { + /* =========================================================== + * Delete a blacklist + * =========================================================== */ + + blacklistToUse = (String)post.get("selectedListName"); + + File BlackListFile = new File(listManager.listsPath, blacklistToUse); BlackListFile.delete(); - // remove from all BlackLists Lists - listManager.removeListFromListslist(BLACKLIST_ALL, filename); - listManager.removeListFromListslist(BLACKLIST_ACTIVE, filename); - listManager.removeListFromListslist(BLACKLIST_SHARED, filename); - + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + listManager.removeListFromListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + } + + // remove it from the shared list + listManager.removeListFromListslist(BLACKLIST_SHARED, blacklistToUse); + blacklistToUse = null; + // reload Blacklists listManager.reloadBlacklists(); - filenames = listManager.getListslistArray(BLACKLIST_ALL); - if (filenames.length > 0) { - filename = filenames[0]; - } - // new list - } else if (post.containsKey("newlistbutton")) { - String newList = (String)post.get("newlist"); - if (!newList.endsWith(".black")) { - newList += ".black"; - } - filename = newList; //to select it in the returnes Document - try { - final File newFile = new File(listManager.listsPath, newList); - newFile.createNewFile(); - listManager.addListToListslist(BLACKLIST_ALL, newList); - listManager.addListToListslist(BLACKLIST_ACTIVE, newList); - listManager.addListToListslist(BLACKLIST_SHARED, newList); - } catch (IOException e) {} + } else if (post.containsKey("activateList")) { + /* =========================================================== + * Activate/Deactivate a blacklist + * =========================================================== */ + + blacklistToUse = (String)post.get("selectedListName"); + + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + if (post.containsKey("activateList4" + supportedBlacklistTypes[blTypes])) { + listManager.addListToListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + } else { + listManager.removeListFromListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + } + } - } else if (post.containsKey("activatelistbutton")) { - if( listManager.ListInListslist(BLACKLIST_ACTIVE, filename) ) { - listManager.removeListFromListslist(BLACKLIST_ACTIVE, filename); - } else { // inactive list -> enable - listManager.addListToListslist(BLACKLIST_ACTIVE, filename); - } - listManager.reloadBlacklists(); + listManager.reloadBlacklists(); + + } else if (post.containsKey("shareList")) { - } else if (post.containsKey("sharelistbutton")) { - if (listManager.ListInListslist(BLACKLIST_SHARED, filename)) { + /* =========================================================== + * Share a blacklist + * =========================================================== */ + + blacklistToUse = (String)post.get("selectedListName"); + + if (listManager.ListInListslist(BLACKLIST_SHARED, blacklistToUse)) { // Remove from shared BlackLists - listManager.removeListFromListslist(BLACKLIST_SHARED, filename); + listManager.removeListFromListslist(BLACKLIST_SHARED, blacklistToUse); } else { // inactive list -> enable - listManager.addListToListslist(BLACKLIST_SHARED, filename); + listManager.addListToListslist(BLACKLIST_SHARED, blacklistToUse); + } + } else if (post.containsKey("deleteBlacklistEntry")) { + + /* =========================================================== + * Delete a blacklist entry + * =========================================================== */ + + // get the current selected blacklist name + blacklistToUse = (String)post.get("currentBlacklist"); + + // get the entry that should be deleted + String oldEntry = (String)post.get("selectedEntry"); + + // load blacklist data from file + ArrayList list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse)); + + // delete the old entry from file + if (list != null) { + for (int i=0; i < list.size(); i++) { + if (((String)list.get(i)).equals(oldEntry)) { + list.remove(i); + break; + } + } + listManager.writeList(new File(listManager.listsPath, blacklistToUse), (String[])list.toArray(new String[list.size()])); + } + + // remove the entry from the running blacklist engine + int pos = oldEntry.indexOf("/"); + if (pos < 0) { + // add default empty path pattern + pos = oldEntry.length(); + oldEntry = oldEntry + "/.*"; + } + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { + plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1)); + } + } + + } else if (post.containsKey("addBlacklistEntry")) { + + /* =========================================================== + * Add a new blacklist entry + * =========================================================== */ + + blacklistToUse = (String)post.get("currentBlacklist"); + + String newEntry = (String)post.get("newEntry"); + + // TODO: ignore empty entries + + if (newEntry.startsWith("http://") ){ + newEntry = newEntry.substring(7); + } + + int pos = newEntry.indexOf("/"); + if (pos < 0) { + // add default empty path pattern + pos = newEntry.length(); + newEntry = newEntry + "/.*"; + } + + // append the line to the file + PrintWriter pw = null; + try { + pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklistToUse), true)); + pw.println(newEntry); + pw.close(); + } catch (IOException e) { + e.printStackTrace(); + } finally { + if (pw != null) try { pw.close(); } catch (Exception e){ /* */} } - } // List Management End - // remove a Item? - if (post.containsKey("delbutton") && - post.containsKey("Itemlist") && - !((String)post.get("Itemlist")).equals("") ) { - removeItem = (String)post.get("Itemlist"); + // add to blacklist + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { + plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newEntry.substring(0, pos), newEntry.substring(pos + 1)); + } + } } - } // post != null + + } + + // loading all blacklist files located in the directory + String[] dirlist = listManager.getDirListing(listManager.listsPath); + + // if we have not chosen a blacklist until yet we use the first file + if (blacklistToUse == null && dirlist != null && dirlist.length > 0) { + blacklistToUse = dirlist[0]; + } + - // Read the List - final ArrayList list = listManager.getListArray(new File(listManager.listsPath, filename)); - final StringBuffer out = new StringBuffer(list.size() * 64); + // Read the blacklist items from file + final ArrayList list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse)); + + // sort them String[] sortedlist = new String[list.size()]; Arrays.sort(list.toArray(sortedlist)); + + // display them + int entryCount = 0; for (int j=0;j 0) { // no nullpointer error final Enumeration e = yacyCore.seedDB.seedsConnected(true, false, null); - i = 0; + int peerCount = 0; while (e.hasMoreElements()) { seed = (yacySeed) e.nextElement(); if (seed != null) { final String Hash = seed.hash; final String Name = seed.get(yacySeed.NAME, "nameless"); - prop.put("otherHosts_" + i + "_hash", Hash); - prop.put("otherHosts_" + i + "_name", Name); - i++; + prop.put("otherHosts_" + peerCount + "_hash", Hash); + prop.put("otherHosts_" + peerCount + "_name", Name); + peerCount++; } } - prop.put("otherHosts", i); -// } else { -// System.out.println("BlackList_p: yacy seed not loaded!"); // DEBUG: + prop.put("otherHosts", peerCount); } - + + // List BlackLists - final String[] BlackLists = listManager.getListslistArray(BLACKLIST_ALL); - for (i = 0; i <= BlackLists.length - 1; i++) { - prop.put(BLACKLIST + i + "_name", BlackLists[i]); - prop.put(BLACKLIST + i + "_active", 0); - prop.put(BLACKLIST + i + "_shared", 0); - prop.put(BLACKLIST + i + "_selected", 0); - if (BlackLists[i].equals(filename)) { //current List - prop.put(BLACKLIST + i + "_selected", 1); - } - if (listManager.ListInListslist(BLACKLIST_ACTIVE, BlackLists[i])) { - prop.put(BLACKLIST + i + "_active", 1); - } - if (listManager.ListInListslist(BLACKLIST_SHARED, BlackLists[i])) { - prop.put(BLACKLIST + i + "_shared", 1); + int blacklistCount = 0; + if (dirlist != null) { + for (int i = 0; i <= dirlist.length - 1; i++) { + prop.put(BLACKLIST + blacklistCount + "_name", dirlist[i]); + prop.put(BLACKLIST + blacklistCount + "_shared", 0); + + if (dirlist[i].equals(blacklistToUse)) { //current List + prop.put(BLACKLIST + blacklistCount + "_selected", 1); + + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + prop.put("currentActiveFor_" + blTypes + "_blTypeName",supportedBlacklistTypes[blTypes]); + prop.put("currentActiveFor_" + blTypes + "_checked", + listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])?0:1); + } + prop.put("currentActiveFor",supportedBlacklistTypes.length); + + } + + if (listManager.ListInListslist(BLACKLIST_SHARED, dirlist[i])) { + prop.put(BLACKLIST + blacklistCount + "_shared", 1); + } else { + prop.put(BLACKLIST + blacklistCount + "_selected", 0); + } + + int activeCount = 0; + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])) { + prop.put(BLACKLIST + blacklistCount + "_active_" + activeCount + "_blTypeName",supportedBlacklistTypes[blTypes]); + activeCount++; + } + } + prop.put(BLACKLIST + blacklistCount + "_active",activeCount); + blacklistCount++; } } - prop.put("blackLists", i); - prop.put("filename", filename); + prop.put("blackLists", blacklistCount); + + prop.put("currentBlacklist", blacklistToUse); return prop; } diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 574c0565f..e0266ec8e 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -63,6 +63,7 @@ import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaURLPattern; import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -462,7 +463,7 @@ public class IndexControl_p { } else { url = new URL(us); - if (plasmaSwitchboard.urlBlacklist.isListed(url)) { + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, url)) { result.append(""); } else { result.append(""); diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index 904da1b5f..951b2eb88 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -61,6 +61,7 @@ import java.util.ArrayList; import java.util.Enumeration; import java.util.HashSet; +import de.anomic.data.listManager; import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.plasma.plasmaSwitchboard; @@ -250,8 +251,16 @@ public class sharedBlacklist_p { out += newItem+"\n"; prop.put("status_list_"+count+"_entry", newItem); count++; - if (plasmaSwitchboard.urlBlacklist != null) - plasmaSwitchboard.urlBlacklist.add(newItem.substring(0, pos), newItem.substring(pos + 1)); + if (plasmaSwitchboard.urlBlacklist != null) { + String supportedBlacklistTypesStr = env.getConfig("BlackLists.types", ""); + String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); + + for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { + if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",filename)) { + plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newItem.substring(0, pos), newItem.substring(pos + 1)); + } + } + } //write the list try{ diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index cc3f5f642..5da6f2431 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -54,6 +54,7 @@ import de.anomic.http.httpHeader; import de.anomic.index.indexEntry; import de.anomic.index.indexURLEntry; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaURLPattern; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -163,7 +164,7 @@ public final class transferRWI { wordhashes[received] = wordHash; iEntry = new indexURLEntry(estring.substring(p)); urlHash = iEntry.urlHash(); - if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(urlHash))) { + if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(plasmaURLPattern.BLACKLIST_DHT, urlHash))) { //int deleted = sb.wordIndex.tryRemoveURLs(urlHash); yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted 1 URL entries from RWIs"); blocked++; diff --git a/source/de/anomic/data/listManager.java b/source/de/anomic/data/listManager.java index 9e1e9b6fc..9516de6e3 100644 --- a/source/de/anomic/data/listManager.java +++ b/source/de/anomic/data/listManager.java @@ -155,11 +155,13 @@ public class listManager { // overloaded function to write an array public static boolean writeList(File listFile, String[] list){ - String out = ""; - for(int i=0;i <= list.length; i++){ - out += list[i] + serverCore.crlfString; + StringBuffer out = new StringBuffer(); + for(int i=0;i < list.length; i++){ + out + .append(list[i]) + .append(serverCore.crlfString); } - return writeList(listFile, out); //(File, String) + return writeList(listFile, out.toString()); //(File, String) } public static String getListString(String filename, boolean withcomments){ @@ -194,6 +196,12 @@ public class listManager { String[] fileListString; File[] fileList; final File dir = new File(dirname); + return getDirListing(dir); + } + + public static String[] getDirListing(File dir){ + String[] fileListString; + File[] fileList; if (dir != null ) { if (!dir.exists()) { @@ -207,7 +215,7 @@ public class listManager { return fileListString; } return null; - } + } public static ArrayList getDirsRecursive(File dir, String notdir){ return getDirsRecursive(dir, notdir, true); @@ -321,11 +329,21 @@ public class listManager { // load all active Blacklists in the Proxy public static void reloadBlacklists(){ - final String f = switchboard.getConfig("proxyBlackListsActive", ""); - de.anomic.plasma.plasmaSwitchboard.urlBlacklist.clear(); - if (f != "") { - de.anomic.plasma.plasmaSwitchboard.urlBlacklist.loadList(f, "/"); - } + String supportedBlacklistTypesStr = switchboard.getConfig("BlackLists.types", ""); + String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); + + ArrayList blacklistFiles = new ArrayList(supportedBlacklistTypes.length); + for (int i=0; i < supportedBlacklistTypes.length; i++) { + String[] blacklistFile = new String[]{ + supportedBlacklistTypes[i], + switchboard.getConfig(supportedBlacklistTypes[i] + ".BlackLists", "") + }; + blacklistFiles.add(blacklistFile); + } + + de.anomic.plasma.plasmaSwitchboard.urlBlacklist.clear(); + de.anomic.plasma.plasmaSwitchboard.urlBlacklist.loadList((String[][])blacklistFiles.toArray(new String[blacklistFiles.size()][]), "/"); + // switchboard.urlBlacklist.clear(); // if (f != "") switchboard.urlBlacklist.loadLists("black", f, "/"); } diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 4352f1091..76d9e8d71 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -96,6 +96,7 @@ import de.anomic.index.indexURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaURLPattern; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; @@ -389,7 +390,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // respond a 404 for all AGIS ("all you get is shit") servers String hostlow = host.toLowerCase(); if (args != null) { path = path + "?" + args; } - if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) { + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_PROXY, hostlow, path)) { httpd.sendRespondError(conProp,respond,4,403,null, "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'"); @@ -915,7 +916,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // re-calc the url path String remotePath = (args == null) ? path : (path + "?" + args); - if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, remotePath)) { + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_PROXY, hostlow, remotePath)) { httpd.sendRespondError(conProp,respond,4,403,null, "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'"); @@ -1128,7 +1129,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // blacklist idea inspired by [AS]: // respond a 404 for all AGIS ("all you get is shit") servers final String hostlow = host.toLowerCase(); - if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) { + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_PROXY, hostlow, path)) { httpd.sendRespondError(conProp,clientOut,4,403,null, "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'"); diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 1d59a4c5d..60ee71136 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -898,7 +898,7 @@ public final class plasmaCrawlLURL extends indexURL { plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next(); totalSearchedUrls++; - if (plasmaSwitchboard.urlBlacklist.isListed(entry.url())==true) { + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER,entry.url())==true) { lastBlacklistedUrl = entry.url().toString(); lastBlacklistedHash = entry.hash(); serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + entry.url()); diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 4cdaef4e9..80c2a2879 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -283,7 +283,7 @@ public final class plasmaCrawlStacker { } // check blacklist - if (plasmaSwitchboard.urlBlacklist.isListed(nexturl)) { + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER,nexturl)) { reason = plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST; this.log.logFine("URL '" + nexturlString + "' is in blacklist. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); diff --git a/source/de/anomic/plasma/plasmaCrawlWorker.java b/source/de/anomic/plasma/plasmaCrawlWorker.java index 89c9833dd..f615d3786 100644 --- a/source/de/anomic/plasma/plasmaCrawlWorker.java +++ b/source/de/anomic/plasma/plasmaCrawlWorker.java @@ -311,7 +311,7 @@ public final class plasmaCrawlWorker extends Thread { // check if url is in blacklist String hostlow = host.toLowerCase(); - if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, path)) { + if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, hostlow, path)) { log.logInfo("CRAWLER Rejecting URL '" + url.toString() + "'. URL is in blacklist."); addURLtoErrorDB(url, refererURLString, initiator, name, plasmaCrawlEURL.DENIED_URL_IN_BLACKLIST, new bitfield(indexURL.urlFlagLength)); return null; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d477426ca..d63c2d728 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -125,6 +125,7 @@ import java.util.logging.Level; import de.anomic.data.blogBoard; import de.anomic.data.bookmarksDB; +import de.anomic.data.listManager; import de.anomic.data.messageBoard; import de.anomic.data.wikiBoard; import de.anomic.data.userDB; @@ -302,14 +303,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load the black-list / inspired by [AS] File ulrBlackListFile = new File(getRootPath(), getConfig("listsPath", "DATA/LISTS")); - urlBlacklist = new plasmaURLPattern(ulrBlackListFile); - String f = getConfig("proxyBlackListsActive", null); - if (f != null) { - urlBlacklist.loadList(f, "/"); - this.log.logConfig("loaded black-list from file " + ulrBlackListFile.getName() + ", " + - urlBlacklist.size() + " entries, " + - ppRamString(ulrBlackListFile.length()/1024)); - } + urlBlacklist = new plasmaURLPattern(ulrBlackListFile); + listManager.switchboard = this; + listManager.listsPath = ulrBlackListFile; + listManager.reloadBlacklists(); // load badwords (to filter the topwords) if (badwords == null) { diff --git a/source/de/anomic/plasma/plasmaURLPattern.java b/source/de/anomic/plasma/plasmaURLPattern.java index 7c129e32f..55fde4548 100644 --- a/source/de/anomic/plasma/plasmaURLPattern.java +++ b/source/de/anomic/plasma/plasmaURLPattern.java @@ -43,78 +43,158 @@ package de.anomic.plasma; import java.io.File; import de.anomic.net.URL; + +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.Set; import de.anomic.kelondro.kelondroMSetTools; public class plasmaURLPattern { + + public static final String BLACKLIST_CRAWLER = "crawler"; + public static final String BLACKLIST_PROXY = "proxy"; + public static final String BLACKLIST_DHT = "dht"; + public static final String BLACKLIST_SEARCH = "search"; + + public static final HashSet BLACKLIST_TYPES = new HashSet(Arrays.asList(new String[]{ + BLACKLIST_CRAWLER, + BLACKLIST_PROXY, + BLACKLIST_DHT, + BLACKLIST_SEARCH + })); + - private Set cachedUrlHashs = Collections.synchronizedSet(new HashSet()); - private File rootPath = null; + private File blacklistRootPath = null; + private HashMap cachedUrlHashs = null; private HashMap hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here public plasmaURLPattern(File rootPath) { super(); - this.rootPath = rootPath; + this.blacklistRootPath = rootPath; + + // prepare the data structure this.hostpaths = new HashMap(); + this.cachedUrlHashs = new HashMap(); + + Iterator iter = BLACKLIST_TYPES.iterator(); + while (iter.hasNext()) { + String blacklistType = (String) iter.next(); + this.hostpaths.put(blacklistType, new HashMap()); + this.cachedUrlHashs.put(blacklistType, Collections.synchronizedSet(new HashSet())); + } } public void clear() { - this.hostpaths = new HashMap(); + Iterator iter = this.hostpaths.keySet().iterator(); + while (iter.hasNext()) { + HashMap blacklistMap = (HashMap) this.hostpaths.get(iter.next()); + blacklistMap.clear(); + } } public int size() { - return hostpaths.size(); + int size = 0; + Iterator iter = this.hostpaths.keySet().iterator(); + while (iter.hasNext()) { + HashMap blacklistMap = (HashMap) this.hostpaths.get(iter.next()); + size += blacklistMap.size(); + } + return size; } - - public void loadList(String filenames, String sep) { - // File listsPath = new File(getRootPath(), getConfig("listsPath", "DATA/LISTS")); - final String[] filenamesarray = filenames.split(","); + + public void loadList(String blacklistType, String filenames, String sep) { + if (blacklistType == null) throw new IllegalArgumentException(); + if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); + + HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType); + String[] filenamesarray = filenames.split(","); if( filenamesarray.length > 0) { for (int i = 0; i < filenamesarray.length; i++) { - hostpaths.putAll(kelondroMSetTools.loadMap(new File(rootPath, filenamesarray[i]).toString(), sep)); + blacklistMap.putAll(kelondroMSetTools.loadMap(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep)); } + } + } + + public void loadList(String[][] filenames, String sep) { + for (int j = 0; j < filenames.length; j++) { + String[] nextFile = filenames[j]; + String blacklistType = nextFile[0]; + String fileName = nextFile[1]; + this.loadList(blacklistType, fileName, sep); } } - public void remove(String host) { - hostpaths.remove(host); + public void remove(String blacklistType, String host) { + if (blacklistType == null) throw new IllegalArgumentException(); + if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); + + HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType); + blacklistMap.remove(host); } - public void add(String host, String path) { + public void add(String blacklistType, String host, String path) { + if (host == null) throw new NullPointerException(); + if (path == null) throw new NullPointerException(); + if (blacklistType == null) throw new IllegalArgumentException(); + if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); + if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); - hostpaths.put(host.toLowerCase(), path); + + HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType); + blacklistMap.put(host.toLowerCase(), path); } public int blacklistCacheSize() { - return cachedUrlHashs.size(); + int size = 0; + Iterator iter = this.cachedUrlHashs.keySet().iterator(); + while (iter.hasNext()) { + Set blacklistMap = (Set) this.cachedUrlHashs.get(iter.next()); + size += blacklistMap.size(); + } + return size; } - public boolean hashInBlacklistedCache(String urlHash) { - return cachedUrlHashs.contains(urlHash); + public boolean hashInBlacklistedCache(String blacklistType, String urlHash) { + if (blacklistType == null) throw new IllegalArgumentException(); + if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); + + Set urlHashCache = (Set) this.cachedUrlHashs.get(blacklistType); + return urlHashCache.contains(urlHash); } - public boolean isListed(String urlHash, URL url) { - if (!cachedUrlHashs.contains(urlHash)) { - boolean temp = isListed(url.getHost().toLowerCase(), url.getFile()); - if (temp) - { - cachedUrlHashs.add(urlHash); - } + public boolean isListed(String blacklistType, String urlHash, URL url) { + if (blacklistType == null) throw new IllegalArgumentException(); + if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); + + Set urlHashCache = (Set) this.cachedUrlHashs.get(blacklistType); + if (!urlHashCache.contains(urlHash)) { + boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile()); + if (temp) { + urlHashCache.add(urlHash); + } return temp; } return true; } - public boolean isListed(URL url) { - return isListed(url.getHost().toLowerCase(), url.getFile()); + public boolean isListed(String blacklistType, URL url) { + return isListed(blacklistType, url.getHost().toLowerCase(), url.getFile()); } - public boolean isListed(String hostlow, String path) { + public boolean isListed(String blacklistType, String hostlow, String path) { + if (hostlow == null) throw new NullPointerException(); + if (path == null) throw new NullPointerException(); + if (blacklistType == null) throw new IllegalArgumentException(); + if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); + + // getting the proper blacklist + HashMap blacklistMap = (HashMap) this.hostpaths.get(blacklistType); + if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); String pp = ""; // path-pattern @@ -122,19 +202,19 @@ public class plasmaURLPattern { // [TL] While "." are found within the string int index = 0; while ((index = hostlow.indexOf('.', index + 1)) != -1) { - if ((pp = (String) hostpaths.get(hostlow.substring(0, index + 1) + "*")) != null) { + if ((pp = (String) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) { return ((pp.equals("*")) || (path.matches(pp))); } } index = hostlow.length(); while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) { - if ((pp = (String) hostpaths.get("*" + hostlow.substring(index, hostlow.length()))) != null) { + if ((pp = (String) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) { return ((pp.equals("*")) || (path.matches(pp))); } } // try to match without wildcard in domain - return (((pp = (String) hostpaths.get(hostlow)) != null) && + return (((pp = (String) blacklistMap.get(hostlow)) != null) && ((pp.equals("*")) || (path.matches(pp)))); } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 11cb768db..0c14160b1 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -708,7 +708,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { // "+entry.getUrlHash()); try { url = lurl.getEntry(entry.urlHash(), null).url(); - if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(url) == true)) { + if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) { urlHashs.add(entry.urlHash()); } } catch (IOException e) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 8d0864f03..79cf58d28 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -478,7 +478,7 @@ public final class yacyClient { for (int n = 0; n < results; n++) { // get one single search result urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); - if ((urlEntry == null) || (blacklist.isListed(urlEntry.url()))) { continue; } // block with backlist + if ((urlEntry == null) || (blacklist.isListed(plasmaURLPattern.BLACKLIST_SEARCH, urlEntry.url()))) { continue; } // block with backlist urlEntry.store(); int urlLength = urlEntry.url().toString().length(); int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length; diff --git a/source/migration.java b/source/migration.java index a36690dc3..9e763c8f8 100644 --- a/source/migration.java +++ b/source/migration.java @@ -245,6 +245,16 @@ public class migration { sb.setConfig("portForwarding.sch.HostUser", sb.getConfig("portForwardingHostUser","")); sb.setConfig("portForwarding.sch.HostPwd", sb.getConfig("portForwardingHostPwd","")); } + + // migration for blacklists + if ((value = sb.getConfig("proxyBlackLists","")).length() > 0) { + sb.setConfig("proxy.BlackLists", value); + sb.setConfig("crawler.BlackLists", value); + sb.setConfig("dht.BlackLists", value); + sb.setConfig("search.BlackLists", value); + + sb.setConfig("BlackLists.Shared",sb.getConfig("proxyBlackListsShared","")); + } } } diff --git a/yacy.init b/yacy.init index 56a72273b..74fdec289 100644 --- a/yacy.init +++ b/yacy.init @@ -205,9 +205,14 @@ proxyYellowList=yacy.yellow # the black-list; URLs appearing in this list will not be loaded; # instead always a 404 is returned # all these files will be placed in the listsPath -proxyBlackLists=url.default.black -proxyBlackListsActive=url.default.black -proxyBlackListsShared=url.default.black +BlackLists.types=proxy,crawler,dht,search +BlackLists.Shared=url.default.black + +proxy.BlackLists=url.default.black +crawler.BlackLists=url.default.black +dht.BlackLists=url.default.black +search.BlackLists=url.default.black + proxyCookieBlackList=cookie.default.black proxyCookieWhiteList=cookie.default.black