From 5b0c1449e17efc482c88ec7084a4d76633c3aef5 Mon Sep 17 00:00:00 2001 From: fuchsi Date: Mon, 10 Sep 2007 06:20:27 +0000 Subject: [PATCH] various fixes and cleanups for blacklist handling: 1. avoid adding duplicate file name entries in config properties for lists, 2. correctly merge all path masks from all list files for the same host masks, 3. rewrite helper methods standard java methods for Collection transformations, 4. merged various methods with identical functionality for different Collection implementations into one, 5. minor refactoring to improve code readability. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4087 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/BlacklistCleaner_p.java | 6 +- htroot/Blacklist_p.java | 32 +- htroot/Bookmarks.java | 6 +- htroot/IndexControl_p.java | 4 +- htroot/sharedBlacklist_p.java | 2 +- htroot/xml/blacklists_p.java | 4 +- source/de/anomic/data/blogBoard.java | 10 +- source/de/anomic/data/blogBoardComments.java | 2 +- source/de/anomic/data/bookmarksDB.java | 49 +-- source/de/anomic/data/listManager.java | 284 ++++++++++-------- .../de/anomic/plasma/plasmaSwitchboard.java | 6 +- .../plasma/urlPattern/abstractURLPattern.java | 47 ++- .../plasma/urlPattern/plasmaURLPattern.java | 16 + 13 files changed, 282 insertions(+), 186 deletions(-) diff --git a/htroot/BlacklistCleaner_p.java b/htroot/BlacklistCleaner_p.java index aa47ab57d..c5490cdb9 100644 --- a/htroot/BlacklistCleaner_p.java +++ b/htroot/BlacklistCleaner_p.java @@ -109,7 +109,7 @@ public class BlacklistCleaner_p { if (post.containsKey("listNames")) { blacklistToUse = (String)post.get("listNames"); - if (blacklistToUse.length() == 0 || !listManager.ListInListslist("listManager.listsPath", blacklistToUse)) + if (blacklistToUse.length() == 0 || !listManager.listSetContains("listManager.listsPath", blacklistToUse)) prop.put("results", 2); } @@ -283,7 +283,7 @@ public class BlacklistCleaner_p { // remove the entry from the running blacklist engine for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) { + if (listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) { String host = (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")); String path = (s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1); try { @@ -318,7 +318,7 @@ public class BlacklistCleaner_p { } pw.println(host + "/" + path); for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { + if (listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { plasmaSwitchboard.urlBlacklist.add( supportedBlacklistTypes[blTypes], host, diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index 2411caf8c..1a4941e98 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -138,11 +138,11 @@ public class Blacklist_p { newFile.createNewFile(); // share the newly created blacklist - listManager.addListToListslist(BLACKLIST_SHARED, blacklistToUse); + listManager.updateListSet(BLACKLIST_SHARED, blacklistToUse); // activate it for all known blacklist types for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - listManager.addListToListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + listManager.updateListSet(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); } } catch (IOException e) {/* */} @@ -161,11 +161,11 @@ public class Blacklist_p { BlackListFile.delete(); for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - listManager.removeListFromListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + listManager.removeFromListSet(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); } // remove it from the shared list - listManager.removeListFromListslist(BLACKLIST_SHARED, blacklistToUse); + listManager.removeFromListSet(BLACKLIST_SHARED, blacklistToUse); blacklistToUse = null; // reload Blacklists @@ -185,9 +185,9 @@ public class Blacklist_p { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { if (post.containsKey("activateList4" + supportedBlacklistTypes[blTypes])) { - listManager.addListToListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + listManager.updateListSet(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); } else { - listManager.removeListFromListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); + listManager.removeFromListSet(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse); } } @@ -205,11 +205,11 @@ public class Blacklist_p { return prop; } - if (listManager.ListInListslist(BLACKLIST_SHARED, blacklistToUse)) { + if (listManager.listSetContains(BLACKLIST_SHARED, blacklistToUse)) { // Remove from shared BlackLists - listManager.removeListFromListslist(BLACKLIST_SHARED, blacklistToUse); + listManager.removeFromListSet(BLACKLIST_SHARED, blacklistToUse); } else { // inactive list -> enable - listManager.addListToListslist(BLACKLIST_SHARED, blacklistToUse); + listManager.updateListSet(BLACKLIST_SHARED, blacklistToUse); } } else if (post.containsKey("deleteBlacklistEntry")) { @@ -253,7 +253,7 @@ public class Blacklist_p { oldEntry = oldEntry + "/.*"; } for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { + if (listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1)); } } @@ -303,7 +303,7 @@ public class Blacklist_p { // add to blacklist for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { + if (listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],newEntry.substring(0, pos), newEntry.substring(pos + 1)); } } @@ -352,7 +352,7 @@ public class Blacklist_p { // add to blacklist for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { + if (listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes],entry.substring(0, pos), entry.substring(pos + 1)); } } @@ -390,7 +390,7 @@ public class Blacklist_p { entry = entry + "/.*"; } for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { + if (listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],entry.substring(0, pos), entry.substring(pos + 1)); } } @@ -469,13 +469,13 @@ public class Blacklist_p { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { prop.put(DISABLED + "currentActiveFor_" + blTypes + "_blTypeName",supportedBlacklistTypes[blTypes]); prop.put(DISABLED + "currentActiveFor_" + blTypes + "_checked", - listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])?0:1); + listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])?0:1); } prop.put(DISABLED + "currentActiveFor",supportedBlacklistTypes.length); } - if (listManager.ListInListslist(BLACKLIST_SHARED, dirlist[i])) { + if (listManager.listSetContains(BLACKLIST_SHARED, dirlist[i])) { prop.put(DISABLED + BLACKLIST + blacklistCount + "_shared", 1); } else { prop.put(DISABLED + BLACKLIST + blacklistCount + "_shared", 0); @@ -483,7 +483,7 @@ public class Blacklist_p { int activeCount = 0; for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { - if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])) { + if (listManager.listSetContains(supportedBlacklistTypes[blTypes] + ".BlackLists",dirlist[i])) { prop.put(DISABLED + BLACKLIST + blacklistCount + "_active_" + activeCount + "_blTypeName",supportedBlacklistTypes[blTypes]); activeCount++; } diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 8f0eb3a87..efb3baf07 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -49,8 +49,8 @@ import java.io.File; import java.net.MalformedURLException; import java.util.Date; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; +import java.util.Set; import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; @@ -127,7 +127,7 @@ public class Bookmarks { if(tagsString.equals("")){ tagsString="unsorted"; //defaulttag } - HashSet tags=listManager.string2hashset(tagsString); + Set tags=listManager.string2set(tagsString); bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.createBookmark(url, username); if(bookmark != null){ @@ -264,7 +264,7 @@ public class Bookmarks { count++; } count=0; - HashSet tags; + Set tags; Iterator tagsIt; int tagCount; while(countMeaning of ListSet: There are various "lists" in YaCy which are + * actually disjunct (pairwise unequal) sets which themselves can be seperated + * into different subsets. E.g., there can be more than one blacklist of a type. + * A ListSet is the set of all those "lists" (subsets) of an equal type. + * + * @param setName name of the ListSet + * @return a ListSet from configuration file + */ + public static Set getListSet(String setName) { + return string2set(switchboard.getConfig(setName, "")); + } - switchboard.setConfig(ListName, temp); + /** + * Removes an element from a ListSet and updates the configuration file + * accordingly. If the element doesn't exist, then nothing will be changed. + * + * @param setName name of the ListSet. + * @param listName name of the element to remove from the ListSet. + */ + public static void removeFromListSet(String setName, String listName) { + Set listSet = getListSet(setName); + + if (listSet.size() > 0) { + listSet.remove(listName); + switchboard.setConfig(setName, collection2string(listSet)); } + } - // add a new List to a List-List - public static void addListToListslist(String ListName, String newList) { - String[] Lists = getListslistArray(ListName); - String temp = ""; + /** + * Adds an element to an existing ListSet. If the ListSet doesn't exist yet, + * a new one will be added. If the ListSet already contains an identical element, + * then nothing happens. + * + * The new list will be written to the configuartion file. + * + * @param setName + * @param newListName + */ + public static void updateListSet(String setName, String newListName) { + Set listSet = getListSet(setName); + listSet.add(newListName); - for (int i = 0; i <= (Lists.length -1); i++) { - temp += Lists[i] + ","; - } - temp += newList; - switchboard.setConfig(ListName, temp); - } + switchboard.setConfig(setName, collection2string(listSet)); + } - // returns true, if the Lists-List contains the Listname - public static boolean ListInListslist(String Listname, String BlackList) { - String[] Lists = getListslistArray(Listname); + /** + * @param setName ListSet in which to search for an element. + * @param listName the element to search for. + * @return true if the ListSet "setName" contains an element + * "listName", false otherwise. + */ + public static boolean listSetContains(String setName, String listName) { + Set Lists = getListSet(setName); + + return Lists.contains(listName); + } - for (int u=0; u <= Lists.length -1; u++) { - if (BlackList.equals(Lists[u])) { - return true; - } - } - return false; - } -//================generel Lists================== +//================general Lists================== - // Gets a Array of all lines(Items) of a (list)file + /** + * Read lines of a file into an ArrayList. + * + * @param listFile the file + * @return the resulting array as an ArrayList + */ public static ArrayList getListArray(File listFile){ String line; ArrayList list = new ArrayList(); @@ -140,7 +160,13 @@ public class listManager { return list; } - // Writes the Liststring to a file + /** + * Write a String to a file (used for string representation of lists). + * + * @param listFile the file to write to + * @param out the String to write + * @return returns true if successful, false otherwise + */ public static boolean writeList(File listFile, String out) { BufferedWriter bw = null; try { @@ -155,7 +181,13 @@ public class listManager { } } - // overloaded function to write an array + /** + * Write elements of an Array of Strings to a file (one element per line). + * + * @param listFile the file to write to + * @param list the Array to write + * @return returns true if successful, false otherwise + */ public static boolean writeList(File listFile, String[] list){ StringBuffer out = new StringBuffer(); for(int i=0;i < list.length; i++){ @@ -166,11 +198,19 @@ public class listManager { return writeList(listFile, new String(out)); //(File, String) } + // same as below public static String getListString(String filename, boolean withcomments) { File listFile = new File(listsPath ,filename); return getListString(listFile, withcomments); } - + + /** + * Read lines of a text file into a String, optionally ignoring comments. + * + * @param listFile the File to read from. + * @param withcomments If false ignore lines starting with '#'. + * @return String representation of the file content. + */ public static String getListString(File listFile, boolean withcomments){ StringBuffer temp = new StringBuffer(); @@ -203,6 +243,13 @@ public class listManager { return getDirListing(dir); } + /** + * Read content of a directory into a String array of file names. + * + * @param dir The directory to get the file listing from. If it doesn't exist yet, + * it will be created. + * @return array of file names + */ public static String[] getDirListing(File dir){ String[] fileListString; File[] fileList; @@ -221,9 +268,11 @@ public class listManager { return null; } + // same as below public static ArrayList getDirsRecursive(File dir, String notdir){ return getDirsRecursive(dir, notdir, true); } + /** * Returns a List of all dirs and subdirs as File Objects * @@ -246,92 +295,89 @@ public class listManager { } return resultList; } - public static String arraylist2string(ArrayList list){ - Iterator it=list.iterator(); - String ret=""; - if(it.hasNext()){ - ret=(String) it.next(); - while(it.hasNext()){ - ret+=","+(String)it.next(); + + +//================Helper functions for collection conversion================== + + /** + * Simple conversion of a Collection of Strings to a comma separated String. + * If the implementing Collection subclass guaranties an order of its elements, + * the substrings of the result will have the same order. + * + * @param col a Collection of Strings. + * @return String with elements from set separated by comma. + */ + public static String collection2string(Collection col){ + StringBuffer str = new StringBuffer(); + + if (col != null && (col.size() > 0)) { + Iterator it = col.iterator(); + str.append((String) it.next()); + while(it.hasNext()) { + str.append(",").append((String) it.next()); } } - return ret; + + return str.toString(); } + + /** + * @see listManager#string2vector(String) + */ public static ArrayList string2arraylist(String string){ - ArrayList ret=new ArrayList(); - String[] hashes=string.split(","); - if(string.indexOf(",") > -1){ - for(int i=0;inull + */ + public static Set string2set(String string){ + HashSet set; + + if (string != null) { + set = new HashSet(Arrays.asList(string.split(","))); + } else { + set = new HashSet(); } - return ret; + + return set; } + /** + * Simple conversion of a comma separated list to a Vector containing + * the order of the substrings. + * + * @param string list of comma separated Strings + * @return resulting Vector or empty Vector if string is null + */ public static Vector string2vector(String string){ - Vector ret=new Vector(); - String[] hashes=string.split(","); - if(string.indexOf(",") > -1){ - for(int i=0;i -1){ - for(int i=0;i 0) { - for (int i = 0; i < filenamesarray.length; i++) { - blacklistMap.putAll(kelondroMSetTools.loadMapMultiValsPerKey(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep)); + String[] fileNames = blFile.getFileNamesUnified(); + if (fileNames.length > 0) { + for (int i = 0; i < fileNames.length; i++) { + // make sure all requested blacklist files exist + File file = new File(this.blacklistRootPath, fileNames[i]); + try { + file.createNewFile(); + } catch (IOException e) { /* */ } + + // join all blacklists from files into one internal blacklist map + loadedBlacklist = kelondroMSetTools.loadMapMultiValsPerKey(file.toString(), sep).entrySet(); + for (Iterator mi = loadedBlacklist.iterator(); mi.hasNext(); ) { + loadedEntry = (Map.Entry) mi.next(); + loadedPaths = (ArrayList) loadedEntry.getValue(); + + // create new entry if host mask unknown, otherwise merge + // existing one with path patterns from blacklist file + paths = (ArrayList) blacklistMap.get(loadedEntry.getKey()); + if (paths == null) { + blacklistMap.put(loadedEntry.getKey(), loadedPaths); + } else { + // TODO check for duplicates? (refactor List -> Set) + paths.addAll(loadedPaths); + } + } } } } + public void loadList(String blacklistType, String fileNames, String sep) { + // method for not breaking older plasmaURLPattern interface + blacklistFile blFile = new blacklistFile(fileNames, blacklistType); + + loadList(blFile, sep); + } + public void removeAll(String blacklistType, String host) { HashMap blacklistMap = getBlacklistMap(blacklistType); blacklistMap.remove(host); diff --git a/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java b/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java index 59c826e28..9fd6e2898 100644 --- a/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java +++ b/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java @@ -1,6 +1,8 @@ package de.anomic.plasma.urlPattern; import java.io.File; +import java.util.Arrays; +import java.util.HashSet; import de.anomic.yacy.yacyURL; @@ -24,6 +26,20 @@ public interface plasmaURLPattern { } public String getFileName() { return this.filename; } + + + /** + * Construct a unified array of file names from comma seperated file name + * list. + * + * @return unified String array of file names + */ + public String[] getFileNamesUnified() { + HashSet hs = new HashSet(Arrays.asList(this.filename.split(","))); + + return (String[]) hs.toArray(new String[hs.size()]); + } + public String getType() { return this.type; } }