From a6a3090c3de2d57efbb512b197cbd9c9989b389c Mon Sep 17 00:00:00 2001 From: low012 Date: Tue, 29 Sep 2009 21:28:49 +0000 Subject: [PATCH] *) blacklist cleaner supports usage of regular expressions now *) refacored BlacklistCleaner_p.java for better readability *) moved check of validity of patterns to the Balcklist implementation since patterns might be valid in one implementation, but not in another *) added method to check validity to Blacklist interface *) fixed some minor issues like typos or wrong whitespaces *) set subversion properties for a whole bunch of files git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6359 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/BlacklistCleaner_p.html | 8 +- htroot/BlacklistCleaner_p.java | 299 ++++++++++--------- htroot/Blacklist_p.html | 2 +- htroot/Blacklist_p.java | 4 - htroot/IndexImportWikimedia_p.java | 6 +- htroot/IndexImport_p.java | 6 +- htroot/LogStatistics_p.java | 6 +- htroot/PerformanceGraph.java | 6 +- htroot/PerformanceSearch_p.java | 6 +- htroot/Ranking_p.java | 106 +++---- htroot/SearchEventPicture.java | 6 +- htroot/Supporter.java | 6 +- htroot/Surftips.java | 6 +- htroot/Threaddump_p.java | 6 +- source/de/anomic/data/AbstractBlacklist.java | 34 ++- source/de/anomic/data/Blacklist.java | 15 +- source/de/anomic/data/Coordinates.java | 6 +- source/de/anomic/data/DefaultBlacklist.java | 80 ++++- source/de/anomic/data/DidYouMeanLibrary.java | 6 +- source/de/anomic/data/LibraryProvider.java | 6 +- source/de/anomic/data/Location.java | 6 +- source/de/anomic/data/OpenGeoDB.java | 6 +- source/de/anomic/data/URLAnalysis.java | 6 +- source/de/anomic/data/URLLicense.java | 6 +- source/de/anomic/data/diff.java | 6 +- 25 files changed, 387 insertions(+), 263 deletions(-) diff --git a/htroot/BlacklistCleaner_p.html b/htroot/BlacklistCleaner_p.html index a91ab2a1c..5cbf99fd8 100644 --- a/htroot/BlacklistCleaner_p.html +++ b/htroot/BlacklistCleaner_p.html @@ -15,8 +15,12 @@
Check list
+

+ + Allow regular expressions in host part of blacklist entries. +

::

The blacklist-cleaner only works for the following blacklist-engines up to now:

@@ -40,11 +44,13 @@
diff --git a/htroot/BlacklistCleaner_p.java b/htroot/BlacklistCleaner_p.java index 49ce82fe5..c5e4fb8bb 100644 --- a/htroot/BlacklistCleaner_p.java +++ b/htroot/BlacklistCleaner_p.java @@ -37,13 +37,11 @@ import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; import de.anomic.data.AbstractBlacklist; import de.anomic.data.Blacklist; @@ -54,6 +52,7 @@ import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.logging.Log; +import java.util.Set; public class BlacklistCleaner_p { @@ -61,14 +60,7 @@ public class BlacklistCleaner_p { private static final String DISABLED = "disabled_"; private static final String BLACKLISTS = "blacklists_"; private static final String ENTRIES = "entries_"; - - private static final int ERR_TWO_WILDCARDS_IN_HOST = 0; - private static final int ERR_SUBDOMAIN_XOR_WILDCARD = 1; - private static final int ERR_PATH_REGEX = 2; - private static final int ERR_WILDCARD_BEGIN_OR_END = 3; - private static final int ERR_HOST_WRONG_CHARS = 4; - private static final int ERR_DOUBLE_OCCURANCE = 5; - + private final static String BLACKLIST_FILENAME_FILTER = "^.*\\.black$"; public static final Class[] supportedBLEngines = { @@ -82,61 +74,75 @@ public class BlacklistCleaner_p { listManager.switchboard = (Switchboard) env; listManager.listsPath = new File(env.getRootPath(), env.getConfig("listManager.listsPath", "DATA/LISTS")); String blacklistToUse = null; - + // getting the list of supported blacklist types final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); - - if (post == null) { - prop.put("results", "0"); - putBlacklists(prop, listManager.getDirListing(listManager.listsPath, BLACKLIST_FILENAME_FILTER), blacklistToUse); - return prop; - } - - if (post.containsKey("listNames")) { - blacklistToUse = post.get("listNames"); - if (blacklistToUse.length() == 0 || !listManager.listSetContains("listManager.listsPath", blacklistToUse)) - prop.put("results", "2"); - } - - putBlacklists(prop, listManager.getDirListing(listManager.listsPath, BLACKLIST_FILENAME_FILTER), blacklistToUse); - - if (blacklistToUse != null) { - prop.put("results", "1"); - - if (post.containsKey("delete")) { - prop.put(RESULTS + "modified", "1"); - prop.put(RESULTS + "modified_delCount", removeEntries(blacklistToUse, supportedBlacklistTypes, getByPrefix(post, "select", true, true))); - } else if (post.containsKey("alter")) { - prop.put(RESULTS + "modified", "2"); - prop.put(RESULTS + "modified_alterCount", alterEntries(blacklistToUse, supportedBlacklistTypes, getByPrefix(post, "select", true, false), getByPrefix(post, "entry", false, false))); + + prop.put(DISABLED+"checked", "1"); + + if (post != null) { + + final boolean allowRegex = post.get("allowRegex", "off").equalsIgnoreCase("on") ? true: false; + prop.put(DISABLED+"checked", (allowRegex) ? "1" : "0"); + + if (post.containsKey("listNames")) { + blacklistToUse = post.get("listNames"); + if (blacklistToUse.length() == 0 || !listManager.listSetContains("listManager.listsPath", blacklistToUse)) { + prop.put("results", "2"); + + } } - - // list illegal entries - final HashMap ies = getIllegalEntries(blacklistToUse, Switchboard.urlBlacklist); - prop.put(RESULTS + "blList", blacklistToUse); - prop.put(RESULTS + "entries", ies.size()); - prop.putHTML(RESULTS + "blEngine", Switchboard.urlBlacklist.getEngineInfo()); - prop.put(RESULTS + "disabled", (ies.size() == 0) ? "1" : "0"); - if (ies.size() > 0) { - prop.put(RESULTS + DISABLED + "entries", ies.size()); - int i = 0; - String s; - for (Entry entry: ies.entrySet()) { - s = entry.getKey(); - prop.put(RESULTS + DISABLED + ENTRIES + i + "_error", entry.getValue().longValue()); - prop.putHTML(RESULTS + DISABLED + ENTRIES + i + "_entry", s); - i++; + + putBlacklists(prop, listManager.getDirListing(listManager.listsPath, BLACKLIST_FILENAME_FILTER), blacklistToUse); + + if (blacklistToUse != null) { + prop.put("results", "1"); + + if (post.containsKey("delete")) { + prop.put(RESULTS + "modified", "1"); + prop.put(RESULTS + "modified_delCount", removeEntries(blacklistToUse, supportedBlacklistTypes, getKeysByPrefix(post, "select", true))); + } else if (post.containsKey("alter")) { + prop.put(RESULTS + "modified", "2"); + prop.put(RESULTS + "modified_alterCount", alterEntries(blacklistToUse, supportedBlacklistTypes, getKeysByPrefix(post, "select", false), getValuesByPrefix(post, "entry", false))); + } + + // list illegal entries + final Map illegalEntries = getIllegalEntries(blacklistToUse, Switchboard.urlBlacklist, allowRegex); + prop.put(RESULTS + "blList", blacklistToUse); + prop.put(RESULTS + "entries", illegalEntries.size()); + prop.putHTML(RESULTS + "blEngine", Switchboard.urlBlacklist.getEngineInfo()); + prop.put(RESULTS + "disabled", (illegalEntries.size() == 0) ? "1" : "0"); + if (illegalEntries.size() > 0) { + prop.put(RESULTS + DISABLED + "entries", illegalEntries.size()); + int i = 0; + String key; + for (Entry entry : illegalEntries.entrySet()) { + key = entry.getKey(); + prop.put(RESULTS + DISABLED + ENTRIES + i + "_error", entry.getValue().longValue()); + prop.putHTML(RESULTS + DISABLED + ENTRIES + i + "_entry", key); + i++; + } } } + } else { + prop.put("results", "0"); + putBlacklists(prop, listManager.getDirListing(listManager.listsPath, BLACKLIST_FILENAME_FILTER), blacklistToUse); } return prop; } - + + /** + * Adds a list of blacklist to the server objects properties which are used to + * display the blacklist in the HTML page belonging to this servlet. + * @param prop Server objects properties object. + * @param lists List of blacklists. + * @param selected Element in list of blacklists which will be preselected in HTML. + */ private static void putBlacklists(final serverObjects prop, final List lists, final String selected) { boolean supported = false; - for (int i=0; i r; if (useHashSet) { @@ -169,20 +210,15 @@ public class BlacklistCleaner_p { } else { r = new ArrayList(); } - - String s; + if (useKeys) { - final Iterator it = post.keySet().iterator(); - while (it.hasNext()) { - if ((s = it.next()).indexOf(prefix) == 0) { - r.add(s.substring(prefix.length())); + for (String entry : post.keySet()) { + if (entry.indexOf(prefix) == 0) { + r.add(entry.substring(prefix.length())); } } } else { - final Iterator> it = post.entrySet().iterator(); - Map.Entry entry; - while (it.hasNext()) { - entry = it.next(); + for (Map.Entry entry : post.entrySet()) { if (entry.getKey().indexOf(prefix) == 0) { r.add(entry.getValue()); } @@ -191,79 +227,56 @@ public class BlacklistCleaner_p { return r.toArray(new String[r.size()]); } - - private static HashMap/* entry, error-code */ getIllegalEntries(final String blacklistToUse, final Blacklist blEngine) { - final HashMap r = new HashMap(); - final HashSet ok = new HashSet(); - - final ArrayList list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse)); - final Iterator it = list.iterator(); - String s, host, path; + + /** + * Finds illegal entries in black list. + * @param blacklistToUse The blacklist to be checked. + * @param blEngine The blacklist engine which is used to check + * @param allowRegex Set to true to allow regular expressions in host part of blacklist entry. + * @return A map which contains all entries whoch have been identified as being + * illegal by the blacklistEngine with the entry as key and an error code as + * value. + */ + private static Map getIllegalEntries(final String blacklistToUse, final Blacklist blEngine, final boolean allowRegex) { + final Map illegalEntries = new HashMap(); + final Set legalEntries = new HashSet(); - if (blEngine instanceof DefaultBlacklist) { - int slashPos; - while (it.hasNext()) { - s = (it.next()).trim(); - - // check for double-occurance - if (ok.contains(s)) { - r.put(s, Integer.valueOf(ERR_DOUBLE_OCCURANCE)); - continue; - } - ok.add(s); - - if ((slashPos = s.indexOf("/")) == -1) { - host = s; - path = ".*"; - } else { - host = s.substring(0, slashPos); - path = s.substring(slashPos + 1); - } - - final int i = host.indexOf("*"); + final List list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse)); + final Map properties= new HashMap(); + properties.put("allowRegex", String.valueOf(allowRegex)); - // check whether host begins illegally - if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) { - if (i == 0 && host.length() > 1 && host.charAt(1) != '.') { - r.put(s, Integer.valueOf(ERR_SUBDOMAIN_XOR_WILDCARD)); - continue; - } - r.put(s, Integer.valueOf(ERR_HOST_WRONG_CHARS)); - continue; - } - - // in host-part only full sub-domains may be wildcards - if (host.length() > 0 && i > -1) { - if (!(i == 0 || i == host.length() - 1)) { - r.put(s, Integer.valueOf(ERR_WILDCARD_BEGIN_OR_END)); - continue; - } - - if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') { - r.put(s, Integer.valueOf(ERR_SUBDOMAIN_XOR_WILDCARD)); - continue; - } - } + if (blEngine instanceof AbstractBlacklist) { + + int err = 0; + + for (String element : list) { + element = element.trim(); - // check for double-occurences of "*" in host - if (host.indexOf("*", i + 1) > -1) { - r.put(s, Integer.valueOf(ERR_TWO_WILDCARDS_IN_HOST)); + // check for double-occurance + if (legalEntries.contains(element)) { + illegalEntries.put(element, Integer.valueOf(AbstractBlacklist.ERR_DOUBLE_OCCURANCE)); continue; } - - // check for errors on regex-compiling path - try { - Pattern.compile(path); - } catch (final PatternSyntaxException e) { - r.put(s, Integer.valueOf(ERR_PATH_REGEX)); - continue; + legalEntries.add(element); + + err = blEngine.checkError(element, properties); + + if (err > 0) { + illegalEntries.put(element, err); } } } - return r; + return illegalEntries; } - + + /** + * Removes existing entries from a blacklist. + * @param blacklistToUse The blacklist which contains the + * @param supportedBlacklistTypes Types of blacklists which the entry is to changed in. + * @param entries Array of entries to be deleted. + * @return Length of the list of entries to be removed. + */ private static int removeEntries(final String blacklistToUse, final String[] supportedBlacklistTypes, final String[] entries) { // load blacklist data from file final ArrayList list = listManager.getListArray(new File(listManager.listsPath, blacklistToUse)); @@ -309,28 +322,36 @@ public class BlacklistCleaner_p { } return entries.length; } - + + /** + * Changes existing entry in a blacklist. + * @param blacklistToUse The blacklist which contains the entry. + * @param supportedBlacklistTypes Types of blacklists which the entry is to changed in. + * @param oldEntry Entry to be changed. + * @param newEntry Changed entry. + * @return The length of the new entry. + */ private static int alterEntries( final String blacklistToUse, final String[] supportedBlacklistTypes, - final String[] oldE, - final String[] newE) { - removeEntries(blacklistToUse, supportedBlacklistTypes, oldE); + final String[] oldEntry, + final String[] newEntry) { + removeEntries(blacklistToUse, supportedBlacklistTypes, oldEntry); PrintWriter pw = null; try { pw = new PrintWriter(new FileWriter(new File(listManager.listsPath, blacklistToUse), true)); String host, path; - for (int i=0, pos; i -

The right '*', after the '/', can replaced by a regex.

+

The right '*', after the '/', can be replaced by a regex.

  • domain.net/fullpath
  • domain.net/*
  • diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index 742428fb0..179e6b42d 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -43,7 +43,6 @@ import de.anomic.data.AbstractBlacklist; import de.anomic.data.Blacklist; import de.anomic.data.listManager; import de.anomic.http.metadata.RequestHeader; -import de.anomic.search.SearchEventCache; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -65,9 +64,6 @@ public class Blacklist_p { listManager.switchboard = (Switchboard) env; listManager.listsPath = new File(listManager.switchboard.getRootPath(),listManager.switchboard.getConfig("listManager.listsPath", "DATA/LISTS")); - // clean up all search events in case that a (new) blacklist entry denies previously returned results - SearchEventCache.cleanupEvents(true); - // getting the list of supported blacklist types final String supportedBlacklistTypesStr = AbstractBlacklist.BLACKLIST_TYPES_STRING; final String[] supportedBlacklistTypes = supportedBlacklistTypesStr.split(","); diff --git a/htroot/IndexImportWikimedia_p.java b/htroot/IndexImportWikimedia_p.java index 8b6cac6bf..5b30ce13e 100644 --- a/htroot/IndexImportWikimedia_p.java +++ b/htroot/IndexImportWikimedia_p.java @@ -4,9 +4,9 @@ // first published 04.05.2009 on http://yacy.net // Frankfurt, Germany // -// $LastChangedDate: 2009-04-16 17:29:00 +0200 (Do, 16 Apr 2009) $ -// $LastChangedRevision: 5812 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 26455b792..32dddb5ee 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -7,9 +7,9 @@ // //This file is contributed by Martin Thelian // -// $LastChangedDate: 2005-10-17 17:46:12 +0200 (Mo, 17 Okt 2005) $ -// $LastChangedRevision: 947 $ -// $LastChangedBy: borg-0300 $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // //This program is free software; you can redistribute it and/or modify //it under the terms of the GNU General Public License as published by diff --git a/htroot/LogStatistics_p.java b/htroot/LogStatistics_p.java index d451ac0a4..29f7a5eba 100644 --- a/htroot/LogStatistics_p.java +++ b/htroot/LogStatistics_p.java @@ -8,9 +8,9 @@ // // This File is contributed by Franz Brausze // -// $LastChangedDate: 2007-01-17 12:00:00 +0100 (Di, 17 Jan 2007) $ -// $LastChangedRevision: 3216 $ -// $LastChangedBy: karlchenofhell $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by diff --git a/htroot/PerformanceGraph.java b/htroot/PerformanceGraph.java index bbd25b32d..bb2effb5e 100644 --- a/htroot/PerformanceGraph.java +++ b/htroot/PerformanceGraph.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/htroot/PerformanceSearch_p.java b/htroot/PerformanceSearch_p.java index f5cbecb84..435a8ffd9 100644 --- a/htroot/PerformanceSearch_p.java +++ b/htroot/PerformanceSearch_p.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java index 9dae44262..7c8b83d2d 100644 --- a/htroot/Ranking_p.java +++ b/htroot/Ranking_p.java @@ -5,9 +5,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2007-07-19 22:11:48 +0000 (Do, 19 Jul 2007) $ -// $LastChangedRevision: 3995 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // @@ -44,38 +44,38 @@ public class Ranking_p { private static final HashMap rankingParameters = new HashMap(); static { - rankingParameters.put(RankingProfile.APP_DC_CREATOR, "Appearance In Author"); - rankingParameters.put(RankingProfile.APP_DC_TITLE, "Appearance In Title"); - rankingParameters.put(RankingProfile.APPEMPH, "Appearance In Emphasized Text"); - rankingParameters.put(RankingProfile.APP_DC_DESCRIPTION, "Appearance In Reference/Anchor Name"); - rankingParameters.put(RankingProfile.APP_DC_SUBJECT, "Appearance In Tags"); - rankingParameters.put(RankingProfile.APPURL, "Appearance In URL"); - rankingParameters.put(RankingProfile.AUTHORITY, "Authority of Domain"); - rankingParameters.put(RankingProfile.CATHASAPP, "Category App, Appearance"); - rankingParameters.put(RankingProfile.CATHASAUDIO, "Category Audio Appearance"); - rankingParameters.put(RankingProfile.CATHASIMAGE, "Category Image Appearance"); - rankingParameters.put(RankingProfile.CATHASVIDEO, "Category Video Appearance"); - rankingParameters.put(RankingProfile.CATINDEXOF, "Category Index Page"); - rankingParameters.put(RankingProfile.DATE, "Date"); - rankingParameters.put(RankingProfile.DESCRCOMPINTOPLIST, "Description Comp. Appears In Toplist"); - rankingParameters.put(RankingProfile.DOMLENGTH, "Domain Length"); - rankingParameters.put(RankingProfile.HITCOUNT, "Hit Count"); - rankingParameters.put(RankingProfile.LLOCAL, "Links To Local Domain"); - rankingParameters.put(RankingProfile.LOTHER, "Links To Other Domain"); - rankingParameters.put(RankingProfile.PHRASESINTEXT, "Phrases In Text"); - rankingParameters.put(RankingProfile.POSINTEXT, "Position In Text"); - rankingParameters.put(RankingProfile.POSOFPHRASE, "Position Of Phrase"); - rankingParameters.put(RankingProfile.POSINPHRASE, "Position In Phrase"); - rankingParameters.put(RankingProfile.PREFER, "Application Of Prefer Pattern"); - rankingParameters.put(RankingProfile.TERMFREQUENCY, "Term Frequency"); - rankingParameters.put(RankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist"); - rankingParameters.put(RankingProfile.URLCOMPS, "URL Components"); - rankingParameters.put(RankingProfile.URLLENGTH, "URL Length"); - rankingParameters.put(RankingProfile.WORDDISTANCE, "Word Distance"); - rankingParameters.put(RankingProfile.WORDSINTEXT, "Words In Text"); - rankingParameters.put(RankingProfile.WORDSINTITLE, "Words In Title"); - rankingParameters.put(RankingProfile.YBR, "YaCy Block Rank"); - rankingParameters.put(RankingProfile.LANGUAGE, "Preferred Language"); + rankingParameters.put(RankingProfile.APP_DC_CREATOR, "Appearance In Author"); + rankingParameters.put(RankingProfile.APP_DC_TITLE, "Appearance In Title"); + rankingParameters.put(RankingProfile.APPEMPH, "Appearance In Emphasized Text"); + rankingParameters.put(RankingProfile.APP_DC_DESCRIPTION, "Appearance In Reference/Anchor Name"); + rankingParameters.put(RankingProfile.APP_DC_SUBJECT, "Appearance In Tags"); + rankingParameters.put(RankingProfile.APPURL, "Appearance In URL"); + rankingParameters.put(RankingProfile.AUTHORITY, "Authority of Domain"); + rankingParameters.put(RankingProfile.CATHASAPP, "Category App, Appearance"); + rankingParameters.put(RankingProfile.CATHASAUDIO, "Category Audio Appearance"); + rankingParameters.put(RankingProfile.CATHASIMAGE, "Category Image Appearance"); + rankingParameters.put(RankingProfile.CATHASVIDEO, "Category Video Appearance"); + rankingParameters.put(RankingProfile.CATINDEXOF, "Category Index Page"); + rankingParameters.put(RankingProfile.DATE, "Date"); + rankingParameters.put(RankingProfile.DESCRCOMPINTOPLIST, "Description Comp. Appears In Toplist"); + rankingParameters.put(RankingProfile.DOMLENGTH, "Domain Length"); + rankingParameters.put(RankingProfile.HITCOUNT, "Hit Count"); + rankingParameters.put(RankingProfile.LLOCAL, "Links To Local Domain"); + rankingParameters.put(RankingProfile.LOTHER, "Links To Other Domain"); + rankingParameters.put(RankingProfile.PHRASESINTEXT, "Phrases In Text"); + rankingParameters.put(RankingProfile.POSINTEXT, "Position In Text"); + rankingParameters.put(RankingProfile.POSOFPHRASE, "Position Of Phrase"); + rankingParameters.put(RankingProfile.POSINPHRASE, "Position In Phrase"); + rankingParameters.put(RankingProfile.PREFER, "Application Of Prefer Pattern"); + rankingParameters.put(RankingProfile.TERMFREQUENCY, "Term Frequency"); + rankingParameters.put(RankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist"); + rankingParameters.put(RankingProfile.URLCOMPS, "URL Components"); + rankingParameters.put(RankingProfile.URLLENGTH, "URL Length"); + rankingParameters.put(RankingProfile.WORDDISTANCE, "Word Distance"); + rankingParameters.put(RankingProfile.WORDSINTEXT, "Words In Text"); + rankingParameters.put(RankingProfile.WORDSINTITLE, "Words In Title"); + rankingParameters.put(RankingProfile.YBR, "YaCy Block Rank"); + rankingParameters.put(RankingProfile.LANGUAGE, "Preferred Language"); } private static serverObjects defaultValues() { @@ -108,24 +108,24 @@ public class Ranking_p { String key; int i, j = 0; for (final Entry entry: map.entrySet()) { - key = entry.getKey(); - prop.put("attr" + attrExtension + "_" + j + "_name", rankingParameters.get(key.substring(prefix.length()))); - prop.put("attr" + attrExtension + "_" + j + "_nameorg", key); - prop.put("attr" + attrExtension + "_" + j + "_select", maxRankingRange); - for (i=0; i BLACKLIST_TYPES = new HashSet(Arrays.asList(new String[]{ Blacklist.BLACKLIST_CRAWLER, Blacklist.BLACKLIST_PROXY, @@ -117,6 +126,9 @@ public abstract class AbstractBlacklist implements Blacklist { for(final Set entry: this.cachedUrlHashs.values()) { entry.clear(); } + + // clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore + SearchEventCache.cleanupEvents(true); } public int size() { @@ -178,6 +190,8 @@ public abstract class AbstractBlacklist implements Blacklist { } } } + // clean up all search events in case that a (new) blacklist entry denies previously returned results + SearchEventCache.cleanupEvents(true); } } @@ -191,6 +205,9 @@ public abstract class AbstractBlacklist implements Blacklist { public void removeAll(final String blacklistType, final String host) { getBlacklistMap(blacklistType,true).remove(host); getBlacklistMap(blacklistType,false).remove(host); + + // clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore + SearchEventCache.cleanupEvents(true); } public void remove(final String blacklistType, final String host, final String path) { @@ -209,7 +226,10 @@ public abstract class AbstractBlacklist implements Blacklist { if (hostList.size() == 0) blacklistMapNotMatch.remove(host); } -} + + // clean up all search events in case that an old blacklist entry denied previously returned results, but does not anymore + SearchEventCache.cleanupEvents(true); + } public void add(final String blacklistType, String host, String path) { if (host == null) throw new NullPointerException(); @@ -227,6 +247,9 @@ public abstract class AbstractBlacklist implements Blacklist { ArrayList hostList = blacklistMap.get(host.toLowerCase()); if (hostList == null) blacklistMap.put(host.toLowerCase(), (hostList = new ArrayList())); hostList.add(path); + + // clean up all search events in case that a (new) blacklist entry denies previously returned results + SearchEventCache.cleanupEvents(true); } public int blacklistCacheSize() { @@ -273,6 +296,7 @@ public abstract class AbstractBlacklist implements Blacklist { } return true; } + public static boolean isMatchable (final String host) { try { if(Pattern.matches("^[a-z0-9.-]*$", host)) // simple Domain (yacy.net or www.yacy.net) diff --git a/source/de/anomic/data/Blacklist.java b/source/de/anomic/data/Blacklist.java index f85e6f87d..1e08d4337 100644 --- a/source/de/anomic/data/Blacklist.java +++ b/source/de/anomic/data/Blacklist.java @@ -1,12 +1,12 @@ -// indexReferenceBlacklist.java +// Blacklist.java // (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 26.03.2008 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // @@ -29,6 +29,7 @@ package de.anomic.data; import java.io.File; import java.util.Arrays; import java.util.HashSet; +import java.util.Map; import de.anomic.yacy.yacyURL; @@ -93,6 +94,8 @@ public interface Blacklist { public boolean isListed(String blacklistType, yacyURL url); - public boolean isListed(String blacklistType, String hostlow, String path); - + public boolean isListed(String blacklistType, String hostlow, String path); + + public int checkError(String entry, Map properties); + } diff --git a/source/de/anomic/data/Coordinates.java b/source/de/anomic/data/Coordinates.java index 1c0681658..0560dc4f4 100644 --- a/source/de/anomic/data/Coordinates.java +++ b/source/de/anomic/data/Coordinates.java @@ -4,9 +4,9 @@ // // This is a part of YaCy // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/de/anomic/data/DefaultBlacklist.java b/source/de/anomic/data/DefaultBlacklist.java index 98b7d76fa..c5b88845e 100644 --- a/source/de/anomic/data/DefaultBlacklist.java +++ b/source/de/anomic/data/DefaultBlacklist.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2008-08-20 09:54:56 +0200 (Mi, 20 Aug 2008) $ -// $LastChangedRevision: 5063 $ -// $LastChangedBy: danielr $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // @@ -29,6 +29,7 @@ package de.anomic.data; import java.io.File; import java.util.ArrayList; import java.util.HashMap; +import java.util.Map; import java.util.Map.Entry; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -117,4 +118,77 @@ public class DefaultBlacklist extends AbstractBlacklist implements Blacklist { } return matched; } + + public int checkError(String element, Map properties) { + + boolean allowRegex = true; + int slashPos; + String host, path; + + if (properties != null) { + allowRegex = properties.get("allowRegex").equalsIgnoreCase("true") ? true : false; + } + + if ((slashPos = element.indexOf("/")) == -1) { + host = element; + path = ".*"; + } else { + host = element.substring(0, slashPos); + path = element.substring(slashPos + 1); + } + + if (!allowRegex || !isValidRegex(host)) { + final int i = host.indexOf("*"); + + // check whether host begins illegally + if (!host.matches("([A-Za-z0-9_-]+|\\*)(\\.([A-Za-z0-9_-]+|\\*))*")) { + if (i == 0 && host.length() > 1 && host.charAt(1) != '.') { + return ERR_SUBDOMAIN_XOR_WILDCARD; + } + return ERR_HOST_WRONG_CHARS; + } + + // in host-part only full sub-domains may be wildcards + if (host.length() > 0 && i > -1) { + if (!(i == 0 || i == host.length() - 1)) { + return ERR_WILDCARD_BEGIN_OR_END; + } + + if (i == host.length() - 1 && host.length() > 1 && host.charAt(i - 1) != '.') { + return ERR_SUBDOMAIN_XOR_WILDCARD; + } + } + + // check for double-occurences of "*" in host + if (host.indexOf("*", i + 1) > -1) { + return ERR_TWO_WILDCARDS_IN_HOST; + } + } else if (allowRegex && !isValidRegex(host)) { + return ERR_HOST_REGEX; + } + + // check for errors on regex-compiling path + if (!isValidRegex(path) && !path.equals("*")) { + return ERR_PATH_REGEX; + } + + return 0; + } + + /** + * Checks if a given expression is a valid regular expression. + * @param expression The expression to be checked. + * @return True if the expression is a valid regular expression, else false. + */ + private static boolean isValidRegex(String expression) { + boolean ret = true; + try { + Pattern.compile(expression); + } catch (final PatternSyntaxException e) { + + ret = false; + } + return ret; + } + } diff --git a/source/de/anomic/data/DidYouMeanLibrary.java b/source/de/anomic/data/DidYouMeanLibrary.java index 451797259..9fbe13b51 100644 --- a/source/de/anomic/data/DidYouMeanLibrary.java +++ b/source/de/anomic/data/DidYouMeanLibrary.java @@ -4,9 +4,9 @@ // // This is a part of YaCy // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/de/anomic/data/LibraryProvider.java b/source/de/anomic/data/LibraryProvider.java index d5b072d3a..f3564de1c 100644 --- a/source/de/anomic/data/LibraryProvider.java +++ b/source/de/anomic/data/LibraryProvider.java @@ -4,9 +4,9 @@ // // This is a part of YaCy // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/de/anomic/data/Location.java b/source/de/anomic/data/Location.java index 5fcc61fd8..509bf2295 100644 --- a/source/de/anomic/data/Location.java +++ b/source/de/anomic/data/Location.java @@ -4,9 +4,9 @@ // // This is a part of YaCy // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/de/anomic/data/OpenGeoDB.java b/source/de/anomic/data/OpenGeoDB.java index 9f62c3de4..ab6aaedea 100644 --- a/source/de/anomic/data/OpenGeoDB.java +++ b/source/de/anomic/data/OpenGeoDB.java @@ -4,9 +4,9 @@ // // This is a part of YaCy // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index 685f47565..74f98a1b1 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2009-01-02 12:38:20 +0100 (Fr, 02 Jan 2009) $ -// $LastChangedRevision: 5432 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/de/anomic/data/URLLicense.java b/source/de/anomic/data/URLLicense.java index 54bdc60e3..0f2671685 100644 --- a/source/de/anomic/data/URLLicense.java +++ b/source/de/anomic/data/URLLicense.java @@ -4,9 +4,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // LICENSE // diff --git a/source/de/anomic/data/diff.java b/source/de/anomic/data/diff.java index af904c2d6..c7687b537 100644 --- a/source/de/anomic/data/diff.java +++ b/source/de/anomic/data/diff.java @@ -8,9 +8,9 @@ // // This file is contributed by Franz Brausze // -// $LastChangedDate: $ -// $LastChangedRevision: $ -// $LastChangedBy: $ +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by