- added support for multiple paths per domain to default-blacklist

warning: an interface change was necessary:
- remove(String, String) has been renamed to removeAll(String, String), because it removes all path-entries for the specified host
- remove(String, String, String) has been added to delete only a single path-entry of a host
- geBlacklistMap(String) has been renamed to getBlacklistMap(String)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3391 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent 3d6ab19f7e
commit 26f5757b40

@ -249,7 +249,8 @@ public class BlacklistCleaner_p {
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) { if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],
(s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/"))); (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")),
(s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1));
} }
} }
} }
@ -284,7 +285,8 @@ public class BlacklistCleaner_p {
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) { if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists", blacklistToUse)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes], plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],
(s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/"))); (s.indexOf("/") == -1) ? s : s.substring(0, s.indexOf("/")),
(s.indexOf("/") == -1) ? ".*" : s.substring(s.indexOf("/") + 1));
plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], host, path); plasmaSwitchboard.urlBlacklist.add(supportedBlacklistTypes[blTypes], host, path);
} }
} }

@ -225,7 +225,7 @@ public class Blacklist_p {
} }
for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) { for (int blTypes=0; blTypes < supportedBlacklistTypes.length; blTypes++) {
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) { if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + ".BlackLists",blacklistToUse)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos)); plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],oldEntry.substring(0, pos), oldEntry.substring(pos + 1));
} }
} }
@ -307,7 +307,7 @@ public class Blacklist_p {
if (nextEntry.length() == 0) continue; if (nextEntry.length() == 0) continue;
if (nextEntry.startsWith("#")) continue; if (nextEntry.startsWith("#")) continue;
prop.put(DISABLED + "Itemlist_" + entryCount + "_item", de.anomic.data.wikiCode.replaceXMLEntities(nextEntry)); prop.put(DISABLED + "Itemlist_" + entryCount + "_item", nextEntry);
entryCount++; entryCount++;
} }
prop.put(DISABLED + "Itemlist", entryCount); prop.put(DISABLED + "Itemlist", entryCount);
@ -341,7 +341,7 @@ public class Blacklist_p {
int blacklistCount = 0; int blacklistCount = 0;
if (dirlist != null) { if (dirlist != null) {
for (int i = 0; i <= dirlist.length - 1; i++) { for (int i = 0; i <= dirlist.length - 1; i++) {
prop.put(DISABLED + BLACKLIST + blacklistCount + "_name", de.anomic.data.wikiCode.replaceXMLEntities(dirlist[i])); prop.put(DISABLED + BLACKLIST + blacklistCount + "_name", dirlist[i]);
prop.put(DISABLED + BLACKLIST + blacklistCount + "_selected", 0); prop.put(DISABLED + BLACKLIST + blacklistCount + "_selected", 0);
if (dirlist[i].equals(blacklistToUse)) { //current List if (dirlist[i].equals(blacklistToUse)) { //current List

@ -45,6 +45,7 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
@ -378,6 +379,30 @@ public class kelondroMSetTools {
return map; return map;
} }
/**
 * Reads a text file of <code>key&lt;sep&gt;value</code> lines into a map that keeps
 * every value found for a key.
 *
 * Each line is trimmed first. Empty lines, lines starting with '#', and lines in
 * which the separator is missing or sits at position 0 are skipped. The key is the
 * trimmed, lower-cased text before the first separator; the value is the trimmed
 * text after it. Values are collected per key in the order they are read.
 *
 * An IOException while opening or reading the file is not reported; whatever was
 * collected up to that point (possibly nothing) is returned.
 *
 * @param filename path of the file to read
 * @param sep      separator between key and value
 * @return a TreeMap mapping each key (String) to an ArrayList of its values (String)
 */
public static TreeMap /* <String,ArrayList<String>> */ loadMapMultiValsPerKey(String filename, String sep) {
    final TreeMap result = new TreeMap();
    BufferedReader reader = null;
    try {
        reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
        for (String raw = reader.readLine(); raw != null; raw = reader.readLine()) {
            final String entry = raw.trim();
            // skip blank lines and comments
            if (entry.length() == 0 || entry.startsWith("#")) continue;
            // a usable line needs a non-empty key in front of the separator
            final int sepAt = entry.indexOf(sep);
            if (sepAt <= 0) continue;

            final String key = entry.substring(0, sepAt).trim().toLowerCase();
            final String value = entry.substring(sepAt + sep.length()).trim();

            ArrayList values = (ArrayList) result.get(key);
            if (values == null) {
                values = new ArrayList();
                result.put(key, values);
            }
            values.add(value);
        }
    } catch (IOException e) {
        // unreadable file: return what has been collected so far
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                // nothing sensible can be done if closing fails
            }
        }
    }
    return result;
}
public static TreeSet loadList(File file, Comparator c) { public static TreeSet loadList(File file, Comparator c) {
TreeSet list = new TreeSet(c); TreeSet list = new TreeSet(c);
if (!(file.exists())) return list; if (!(file.exists())) return list;

@ -45,13 +45,19 @@
package de.anomic.plasma.urlPattern; package de.anomic.plasma.urlPattern;
import java.io.BufferedReader;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Set; import java.util.Set;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.net.URL; import de.anomic.net.URL;
@ -67,8 +73,7 @@ public abstract class abstractURLPattern implements plasmaURLPattern {
protected File blacklistRootPath = null; protected File blacklistRootPath = null;
protected HashMap cachedUrlHashs = null; protected HashMap cachedUrlHashs = null;
protected HashMap hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here protected HashMap /* <blacklistType,HashMap<host,ArrayList<path>>> */ hostpaths = null; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
public abstractURLPattern(File rootPath) { public abstractURLPattern(File rootPath) {
this.setRootPath(rootPath); this.setRootPath(rootPath);
@ -98,7 +103,7 @@ public abstract class abstractURLPattern implements plasmaURLPattern {
this.blacklistRootPath = rootPath; this.blacklistRootPath = rootPath;
} }
protected HashMap geBlacklistMap(String blacklistType) { protected HashMap getBlacklistMap(String blacklistType) {
if (blacklistType == null) throw new IllegalArgumentException(); if (blacklistType == null) throw new IllegalArgumentException();
if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type."); if (!BLACKLIST_TYPES.contains(blacklistType)) throw new IllegalArgumentException("Unknown backlist type.");
@ -124,36 +129,41 @@ public abstract class abstractURLPattern implements plasmaURLPattern {
int size = 0; int size = 0;
Iterator iter = this.hostpaths.keySet().iterator(); Iterator iter = this.hostpaths.keySet().iterator();
while (iter.hasNext()) { while (iter.hasNext()) {
HashMap blacklistMap = (HashMap) this.hostpaths.get(iter.next()); Iterator blIter = ((HashMap)this.hostpaths.get(iter.next())).values().iterator();
size += blacklistMap.size(); while (blIter.hasNext())
size += ((ArrayList)blIter.next()).size();
} }
return size; return size;
} }
public void loadList(String[][] filenames, String sep) { public void loadList(blacklistFile[] blFiles, String sep) {
for (int j = 0; j < filenames.length; j++) { for (int j = 0; j < blFiles.length; j++) {
String[] nextFile = filenames[j]; blacklistFile blf = blFiles[j];
String blacklistType = nextFile[0]; loadList(blf.getType(), blf.getFileName(), sep);
String fileName = nextFile[1];
this.loadList(blacklistType, fileName, sep);
} }
} }
public void loadList(String blacklistType, String filenames, String sep) { public void loadList(String blacklistType, String filenames, String sep) {
HashMap blacklistMap = getBlacklistMap(blacklistType);
HashMap blacklistMap = geBlacklistMap(blacklistType);
String[] filenamesarray = filenames.split(","); String[] filenamesarray = filenames.split(",");
if( filenamesarray.length > 0) { if (filenamesarray.length > 0) {
for (int i = 0; i < filenamesarray.length; i++) { for (int i = 0; i < filenamesarray.length; i++) {
blacklistMap.putAll(kelondroMSetTools.loadMap(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep)); blacklistMap.putAll(kelondroMSetTools.loadMapMultiValsPerKey(new File(this.blacklistRootPath, filenamesarray[i]).toString(), sep));
} }
} }
} }
public void remove(String blacklistType, String host) { public void removeAll(String blacklistType, String host) {
HashMap blacklistMap = getBlacklistMap(blacklistType);
blacklistMap.remove(host);
}
HashMap blacklistMap = geBlacklistMap(blacklistType); public void remove(String blacklistType, String host, String path) {
HashMap blacklistMap = getBlacklistMap(blacklistType);
ArrayList hostList = (ArrayList)blacklistMap.get(host);
hostList.remove(path);
if (hostList.size() == 0)
blacklistMap.remove(host); blacklistMap.remove(host);
} }
@ -163,8 +173,11 @@ public abstract class abstractURLPattern implements plasmaURLPattern {
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
HashMap blacklistMap = geBlacklistMap(blacklistType); HashMap blacklistMap = getBlacklistMap(blacklistType);
blacklistMap.put(host.toLowerCase(), path); ArrayList hostList = (ArrayList)blacklistMap.get(host.toLowerCase());
if (hostList == null)
blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList()));
hostList.add(path);
} }
public int blacklistCacheSize() { public int blacklistCacheSize() {

@ -42,6 +42,7 @@
package de.anomic.plasma.urlPattern; package de.anomic.plasma.urlPattern;
import java.io.File; import java.io.File;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
@ -60,28 +61,44 @@ public class defaultURLPattern extends abstractURLPattern implements plasmaURLPa
if (path == null) throw new NullPointerException(); if (path == null) throw new NullPointerException();
// getting the proper blacklist // getting the proper blacklist
HashMap blacklistMap = super.geBlacklistMap(blacklistType); HashMap blacklistMap = super.getBlacklistMap(blacklistType);
if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1);
ArrayList app;
boolean matched = false;
String pp = ""; // path-pattern String pp = ""; // path-pattern
// first try to match the domain with wildcard '*' // first try to match the domain with wildcard '*'
// [TL] While "." are found within the string // [TL] While "." are found within the string
int index = 0; int index = 0;
while ((index = hostlow.indexOf('.', index + 1)) != -1) { while ((index = hostlow.indexOf('.', index + 1)) != -1) {
if ((pp = (String) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) { if ((app = (ArrayList) blacklistMap.get(hostlow.substring(0, index + 1) + "*")) != null) {
return ((pp.equals("*")) || (path.matches(pp))); for (int i=app.size()-1; !matched && i>-1; i--) {
pp = (String)app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
return matched;
} }
} }
index = hostlow.length(); index = hostlow.length();
while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) { while ((index = hostlow.lastIndexOf('.', index - 1)) != -1) {
if ((pp = (String) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) { if ((app = (ArrayList) blacklistMap.get("*" + hostlow.substring(index, hostlow.length()))) != null) {
return ((pp.equals("*")) || (path.matches(pp))); for (int i=app.size()-1; !matched && i>-1; i--) {
pp = (String)app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
return matched;
} }
} }
// try to match without wildcard in domain // try to match without wildcard in domain
return (((pp = (String) blacklistMap.get(hostlow)) != null) && if ((app = (ArrayList)blacklistMap.get(hostlow)) != null) {
((pp.equals("*")) || (path.matches(pp)))); for (int i=app.size()-1; !matched && i>-1; i--) {
pp = (String)app.get(i);
matched |= ((pp.equals("*")) || (path.matches(pp)));
}
return matched;
}
return false;
} }
} }

@ -11,6 +11,19 @@ public interface plasmaURLPattern {
public static final String BLACKLIST_PROXY = "proxy"; public static final String BLACKLIST_PROXY = "proxy";
public static final String BLACKLIST_SEARCH = "search"; public static final String BLACKLIST_SEARCH = "search";
/**
 * Immutable pair of a blacklist file name and the blacklist type it belongs to.
 */
public static final class blacklistFile {

    private final String filename;
    private final String type;

    /**
     * @param filename the file name to store
     * @param type     the blacklist type to store
     */
    public blacklistFile(String filename, String type) {
        this.filename = filename;
        this.type = type;
    }

    /** @return the file name passed to the constructor */
    public String getFileName() {
        return this.filename;
    }

    /** @return the blacklist type passed to the constructor */
    public String getType() {
        return this.type;
    }
}
public String getEngineInfo(); public String getEngineInfo();
@ -21,12 +34,13 @@ public interface plasmaURLPattern {
public int size(); public int size();
public void clear(); public void clear();
public void remove(String blacklistType, String host); public void removeAll(String blacklistType, String host);
public void remove(String blacklistType, String host, String path);
public void add(String blacklistType, String host, String path); public void add(String blacklistType, String host, String path);
public void loadList(String blacklistType, String filenames, String sep); public void loadList(String blacklistType, String filenames, String sep);
public void loadList(String[][] filenames, String sep); public void loadList(blacklistFile[] blFiles, String sep);
public boolean hashInBlacklistedCache(String blacklistType, String urlHash); public boolean hashInBlacklistedCache(String blacklistType, String urlHash);

@ -466,7 +466,7 @@ public class BlacklistService extends AbstractService {
// if the current blacklist is activated for the type, remove the item from the list // if the current blacklist is activated for the type, remove the item from the list
if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + BLACKLISTS,blacklistName)) { if (listManager.ListInListslist(supportedBlacklistTypes[blTypes] + BLACKLISTS,blacklistName)) {
plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],itemParts[0]); plasmaSwitchboard.urlBlacklist.remove(supportedBlacklistTypes[blTypes],itemParts[0], itemParts[1]);
} }
} }
} }

Loading…
Cancel
Save