From 26f5757b405bd24200a587e21c47cf1c33c8c069 Mon Sep 17 00:00:00 2001 From: karlchenofhell Date: Sat, 24 Feb 2007 13:56:32 +0000 Subject: [PATCH] - added support for multiple paths per domain to default-blacklist warning: an interface change was necessary: - remove(String, String) has been renamed to removeAll(String, String), because it removes all path-entries for the specified host - remove(String, String, String) has been added to delete only a path-entry - geBlacklistMap(String) has been renamed to getBlacklistMap(String) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3391 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/BlacklistCleaner_p.java | 6 +- htroot/Blacklist_p.java | 6 +- .../de/anomic/kelondro/kelondroMSetTools.java | 25 +++++++++ .../plasma/urlPattern/abstractURLPattern.java | 55 ++++++++++++------- .../plasma/urlPattern/defaultURLPattern.java | 31 ++++++++--- .../plasma/urlPattern/plasmaURLPattern.java | 20 ++++++- .../soap/services/BlacklistService.java | 2 +- 7 files changed, 108 insertions(+), 37 deletions(-) diff --git a/htroot/BlacklistCleaner_p.java b/htroot/BlacklistCleaner_p.java index d0893233a..f7f4a7b52 100644 --- a/htroot/BlacklistCleaner_p.java +++ b/htroot/BlacklistCleaner_p.java @@ -249,7 +249,8 @@ public class BlacklistCleaner_p { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) { plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], - (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/"))); + (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")), + (s.indexOf("/") == -1) ? 
".*" : s.substring(s.indexOf("/") + 1)); } } } @@ -284,7 +285,8 @@ public class BlacklistCleaner_p { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) { plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], - (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/"))); + (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")), + (s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1)); plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], host, path); } } diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index c84bfac0a..a7cc017b5 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -225,7 +225,7 @@ public class Blacklist_p { } for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { - plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos)); + plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1)); } } @@ -307,7 +307,7 @@ public class Blacklist_p { if (nextEntry.length() == 0) continue; if (nextEntry.startsWith("#")) continue; - prop.put(DISABLED + "Itemlist_" + entryCount + "_item", de.anomic.data.wikiCode.replaceXMLEntities(nextEntry)); + prop.put(DISABLED + "Itemlist_" + entryCount + "_item", nextEntry); entryCount++; } prop.put(DISABLED + "Itemlist", entryCount); @@ -341,7 +341,7 @@ public class Blacklist_p { int blacklistCount = 0; if (dirlist != null) { for (int i = 0; i <= dirlist.length - 1; i++) { - prop.put(DISABLED + BLACKLIST + blacklistCount + "_name", de.anomic.data.wikiCode.replaceXMLEntities(dirlist[i])); + prop.put(DISABLED + BLACKLIST + blacklistCount + "_name", dirlist[i]); prop.put(DISABLED + BLACKLIST + blacklistCount 
+ "_selected", 0); if (dirlist[i].equals(blacklistToUse)) { //current List diff --git a/source/de/anomic/kelondro/kelondroMSetTools.java b/source/de/anomic/kelondro/kelondroMSetTools.java index 6b24185bc..302ee339f 100644 --- a/source/de/anomic/kelondro/kelondroMSetTools.java +++ b/source/de/anomic/kelondro/kelondroMSetTools.java @@ -45,6 +45,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; +import java.util.ArrayList; import java.util.Collection; import java.util.Comparator; import java.util.Iterator; @@ -378,6 +379,30 @@ public class kelondroMSetTools { return map; } + public static TreeMap /* > */ loadMapMultiValsPerKey(String filename, String sep) { + TreeMap map = new TreeMap(); + BufferedReader br = null; + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + String line, key, value; + int pos; + while ((line = br.readLine()) != null) { + line = line.trim(); + if ((line.length() > 0) && (!(line.startsWith("#"))) && ((pos = line.indexOf(sep)) > 0)) { + key = line.substring(0, pos).trim().toLowerCase(); + value = line.substring(pos + sep.length()).trim(); + if (!map.containsKey(key)) + map.put(key, new ArrayList()); + ((ArrayList)map.get(key)).add(value); + } + } + } catch (IOException e) { + } finally { + if (br != null) try { br.close(); } catch (Exception e) {} + } + return map; + } + public static TreeSet loadList(File file, Comparator c) { TreeSet list = new TreeSet(c); if (!(file.exists())) return list; diff --git a/source/de/anomic/plasma/urlPattern/abstractURLPattern.java b/source/de/anomic/plasma/urlPattern/abstractURLPattern.java index 636603b95..e07a7e98d 100644 --- a/source/de/anomic/plasma/urlPattern/abstractURLPattern.java +++ b/source/de/anomic/plasma/urlPattern/abstractURLPattern.java @@ -45,13 +45,19 @@ package de.anomic.plasma.urlPattern; +import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; +import 
java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Set; +import java.util.TreeMap; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.net.URL; @@ -67,8 +73,7 @@ public abstract class abstractURLPattern implements plasmaURLPattern { protected File blacklistRootPath = null; protected HashMap cachedUrlHashs = null; - protected HashMap hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here - + protected HashMap /* >> */ hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here public abstractURLPattern(File rootPath) { this.setRootPath(rootPath); @@ -98,7 +103,7 @@ public abstract class abstractURLPattern implements plasmaURLPattern { this.blacklistRootPath = rootPath; } - protected HashMap geBlacklistMap(String blacklistType) { + protected HashMap getBlacklistMap(String blacklistType) { if (blacklistType == null) throw new IllegalArgumentException(); if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); @@ -124,38 +129,43 @@ public abstract class abstractURLPattern implements plasmaURLPattern { int size = 0; Iterator iter = this.hostpaths.keySet().iterator(); while (iter.hasNext()) { - HashMap blacklistMap = (HashMap) this.hostpaths.get(iter.next()); - size += blacklistMap.size(); + Iterator blIter = ((HashMap)this.hostpaths.get(iter.next())).values().iterator(); + while (blIter.hasNext()) + size += ((ArrayList)blIter.next()).size(); } return size; } - public void loadList(String[][] filenames, String sep) { - for (int j = 0; j < filenames.length; j++) { - String[] nextFile = filenames[j]; - String blacklistType = nextFile[0]; - String fileName = nextFile[1]; - this.loadList(blacklistType, fileName, sep); + public void 
loadList(blacklistFile[] blFiles, String sep) { + for (int j = 0; j < blFiles.length; j++) { + blacklistFile blf = blFiles[j]; + loadList(blf.getType(), blf.getFileName(), sep); } } public void loadList(String blacklistType, String filenames, String sep) { - - HashMap blacklistMap = geBlacklistMap(blacklistType); + HashMap blacklistMap = getBlacklistMap(blacklistType); String[] filenamesarray = filenames.split(","); - if( filenamesarray.length > 0) { + if (filenamesarray.length > 0) { for (int i = 0; i < filenamesarray.length; i++) { - blacklistMap.putAll(kelondroMSetTools.loadMap(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep)); + blacklistMap.putAll(kelondroMSetTools.loadMapMultiValsPerKey(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep)); } - } + } } - public void remove(String blacklistType, String host) { - - HashMap blacklistMap = geBlacklistMap(blacklistType); + public void removeAll(String blacklistType, String host) { + HashMap blacklistMap = getBlacklistMap(blacklistType); blacklistMap.remove(host); } + + public void remove(String blacklistType, String host, String path) { + HashMap blacklistMap = getBlacklistMap(blacklistType); + ArrayList hostList = (ArrayList)blacklistMap.get(host); + hostList.remove(path); + if (hostList.size() == 0) + blacklistMap.remove(host); + } public void add(String blacklistType, String host, String path) { if (host == null) throw new NullPointerException(); @@ -163,8 +173,11 @@ public abstract class abstractURLPattern implements plasmaURLPattern { if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); - HashMap blacklistMap = geBlacklistMap(blacklistType); - blacklistMap.put(host.toLowerCase(), path); + HashMap blacklistMap = getBlacklistMap(blacklistType); + ArrayList hostList = (ArrayList)blacklistMap.get(host.toLowerCase()); + if (hostList == null) + blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList())); + hostList.add(path); } public int 
blacklistCacheSize() { diff --git a/source/de/anomic/plasma/urlPattern/defaultURLPattern.java b/source/de/anomic/plasma/urlPattern/defaultURLPattern.java index c1c61b5da..b0590382e 100644 --- a/source/de/anomic/plasma/urlPattern/defaultURLPattern.java +++ b/source/de/anomic/plasma/urlPattern/defaultURLPattern.java @@ -42,6 +42,7 @@ package de.anomic.plasma.urlPattern; import java.io.File; +import java.util.ArrayList; import java.util.HashMap; @@ -60,28 +61,44 @@ public class defaultURLPattern extends abstractURLPattern implements plasmaURLPa if (path == null) throw new NullPointerException(); // getting the proper blacklist - HashMap blacklistMap = super.geBlacklistMap(blacklistType); + HashMap blacklistMap = super.getBlacklistMap(blacklistType); if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); + ArrayList app; + boolean matched = false; String pp = ""; // path-pattern // first try to match the domain with wildcard '*' // [TL] While "." are found within the string int index = 0; while ((index = hostlow.indexOf('.', index + 1)) != -1) { - if ((pp = (String) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) { - return ((pp.equals("*")) || (path.matches(pp))); + if ((app = (ArrayList) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) { + for (int i=app.size()-1; !matched && i>-1; i--) { + pp = (String)app.get(i); + matched |= ((pp.equals("*")) || (path.matches(pp))); + } + return matched; } } index = hostlow.length(); while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) { - if ((pp = (String) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) { - return ((pp.equals("*")) || (path.matches(pp))); + if ((app = (ArrayList) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) { + for (int i=app.size()-1; !matched && i>-1; i--) { + pp = (String)app.get(i); + matched |= ((pp.equals("*")) || (path.matches(pp))); + } + return matched; } } // try to match without 
wildcard in domain - return (((pp = (String) blacklistMap.get(hostlow)) != null) && - ((pp.equals("*")) || (path.matches(pp)))); + if ((app = (ArrayList)blacklistMap.get(hostlow)) != null) { + for (int i=app.size()-1; !matched && i>-1; i--) { + pp = (String)app.get(i); + matched |= ((pp.equals("*")) || (path.matches(pp))); + } + return matched; + } + return false; } } diff --git a/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java b/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java index fcacb1e3f..b331605d4 100644 --- a/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java +++ b/source/de/anomic/plasma/urlPattern/plasmaURLPattern.java @@ -10,7 +10,20 @@ public interface plasmaURLPattern { public static final String BLACKLIST_CRAWLER = "crawler"; public static final String BLACKLIST_PROXY = "proxy"; public static final String BLACKLIST_SEARCH = "search"; - + + public static final class blacklistFile { + + private final String filename; + private final String type; + + public blacklistFile(String filename, String type) { + this.filename = filename; + this.type = type; + } + + public String getFileName() { return this.filename; } + public String getType() { return this.type; } + } public String getEngineInfo(); @@ -21,12 +34,13 @@ public interface plasmaURLPattern { public int size(); public void clear(); - public void remove(String blacklistType, String host); + public void removeAll(String blacklistType, String host); + public void remove(String blacklistType, String host, String path); public void add(String blacklistType, String host, String path); public void loadList(String blacklistType, String filenames, String sep); - public void loadList(String[][] filenames, String sep); + public void loadList(blacklistFile[] blFiles, String sep); public boolean hashInBlacklistedCache(String blacklistType, String urlHash); diff --git a/source/de/anomic/soap/services/BlacklistService.java b/source/de/anomic/soap/services/BlacklistService.java index 
39f716a3a..43261c022 100644 --- a/source/de/anomic/soap/services/BlacklistService.java +++ b/source/de/anomic/soap/services/BlacklistService.java @@ -466,7 +466,7 @@ public class BlacklistService extends AbstractService { // if the current blacklist is activated for the type, remove the item from the list if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + BLACKLISTS,blacklistName)) { - plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],itemParts[0]); + plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],itemParts[0], itemParts[1]); } } }