@@ -4,7 +4,10 @@
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 31.08.2010
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@@ -63,6 +66,26 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
    private Pattern mustmatch = null, mustnotmatch = null;
    /**
     * Constructor which creates a CrawlProfile from parameters.
     * @param name name of the crawl profile
     * @param startURL root URL of the crawl
     * @param mustmatch URLs which do not match this regex will be ignored
     * @param mustnotmatch URLs which match this regex will be ignored
     * @param depth height of the tree which will be created by the crawler
     * @param recrawlIfOlder documents which have been indexed in the past will
     *        be indexed again if they are older than the time (ms) in this parameter
     * @param domMaxPages maximum number of pages from one domain which will be indexed
     * @param crawlingQ true if URLs containing question marks shall be indexed
     * @param indexText true if text content of the URL shall be indexed
     * @param indexMedia true if media content of the URL shall be indexed
     * @param storeHTCache true if content shall be kept in the cache after indexing
     * @param remoteIndexing true if part of the crawl job shall be distributed
     * @param xsstopw true if static stop words shall be ignored
     * @param xdstopw true if dynamic stop words shall be ignored
     * @param xpstopw true if parent stop words shall be ignored
     * @param cacheStrategy determines if and how the cache is used when loading content
     */
    public CrawlProfile(
                 final String name,
                 final DigestURI startURL,
@@ -81,8 +104,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                 final boolean xpstopw,
                 final CacheStrategy cacheStrategy) {
        super(40);
        if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
        final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : ASCII.String(startURL.hash());
        if (name == null || name.isEmpty()) {
            throw new NullPointerException("name must not be null or empty");
        }
        final String handle = (startURL == null)
                ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength)
                : ASCII.String(startURL.hash());
        put(HANDLE, handle);
        put(NAME, name);
        put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
@@ -102,37 +129,75 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        put(CACHE_STRAGEGY, cacheStrategy.toString());
    }
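    // A minimal usage sketch (illustrative, not part of this change): building a
    // profile for a depth-2 crawl. The middle parameters are elided by this hunk,
    // so the exact order shown here is an assumption based on the Javadoc above.
    //
    //   CrawlProfile profile = new CrawlProfile(
    //           "example-crawl",          // name
    //           startUrl,                 // DigestURI of the crawl root
    //           ".*",                     // mustmatch: accept every URL
    //           "",                       // mustnotmatch: exclude nothing
    //           2,                        // depth
    //           24L * 60 * 60 * 1000,     // recrawlIfOlder: one day in ms
    //           -1,                       // domMaxPages: no per-domain limit
    //           false,                    // crawlingQ
    //           true, true,               // indexText, indexMedia
    //           false, false,             // storeHTCache, remoteIndexing
    //           false, false, false,      // xsstopw, xdstopw, xpstopw
    //           CacheStrategy.IFEXIST);   // cacheStrategy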
    /**
     * Constructor which creates a CrawlProfile from values in a Map.
     * @param ext contains values
     */
    public CrawlProfile(final Map<String, String> ext) {
        super(ext == null ? 1 : ext.size());
        if (ext != null) putAll(ext);
    }
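    // A short sketch of the Map-based constructor, e.g. when a profile is restored
    // from its serialized key/value form (the concrete key names are assumptions):
    //
    //   Map<String, String> ext = new HashMap<String, String>();
    //   ext.put("name", "restored-crawl");
    //   ext.put("generalDepth", "2");
    //   CrawlProfile restored = new CrawlProfile(ext);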
    public void put(final String key, final boolean value) {
    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    public final void put(final String key, final boolean value) {
        super.put(key, Boolean.toString(value));
    }
    public void put(final String key, final int value) {
    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    public final void put(final String key, final int value) {
        super.put(key, Integer.toString(value));
    }
    public void put(final String key, final long value) {
    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    public final void put(final String key, final long value) {
        super.put(key, Long.toString(value));
    }
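    // Note that the typed put() overloads above only convert the value to a String
    // and delegate to the inherited ConcurrentHashMap.put(), so every parameter is
    // stored uniformly as a string. A brief illustration:
    //
    //   profile.put(CRAWLING_Q, false);   // stored as "false"
    //   profile.put(DEPTH, 2);            // stored as "2"
    //   profile.get(DEPTH);               // yields the String "2"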
    /**
     * Gets the handle of the CrawlProfile.
     * @return handle of the profile
     */
    public String handle() {
        final String r = get(HANDLE);
        //if (r == null) return null;
        return r;
    }
    /**
     * Gets the name of the CrawlProfile.
     * @return name of the profile
     */
    public String name() {
        final String r = get(NAME);
        if (r == null) return "";
        return r;
    }
    /**
     * Gets the root URL of the crawl job.
     * @return root URL
     */
    public String startURL() {
        final String r = get(START_URL);
        return r;
    }
    /**
     * Gets the regex which must be matched by URLs in order to be crawled.
     * @return regex which must be matched
     */
    public Pattern mustMatchPattern() {
        if (this.mustmatch == null) {
            String r = get(FILTER_MUSTMATCH);
@@ -141,6 +206,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        }
        return this.mustmatch;
    }
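    // The must-match pattern is compiled lazily on first access and cached in the
    // mustmatch field. A sketch of how a caller might filter a candidate URL with
    // it (url being a DigestURI; purely illustrative):
    //
    //   if (profile.mustMatchPattern().matcher(url.toNormalform(true, false)).matches()) {
    //       // URL passes the must-match filter and may be crawled
    //   }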
    /**
     * Gets the regex which must not be matched by URLs in order to be crawled.
     * @return regex which must not be matched
     */
    public Pattern mustNotMatchPattern() {
        if (this.mustnotmatch == null) {
            String r = get(FILTER_MUSTNOTMATCH);
@@ -149,6 +219,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        }
        return this.mustnotmatch;
    }
    /**
     * Gets the depth of the crawl job (i.e. the height of the tree which will
     * be created by the crawler).
     * @return depth of the crawl job
     */
    public int depth() {
        final String r = get(DEPTH);
        if (r == null) return 0;
@@ -159,6 +235,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return 0;
        }
    }
    public CacheStrategy cacheStrategy() {
        final String r = get(CACHE_STRAGEGY);
        if (r == null) return CacheStrategy.IFEXIST;
@@ -169,9 +246,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return CacheStrategy.IFEXIST;
        }
    }
    public void setCacheStrategy(final CacheStrategy newStrategy) {
        put(CACHE_STRAGEGY, newStrategy.toString());
    }
    /**
     * Gets the minimum age that an entry must have to be re-crawled.
     * @return time in ms
     */
    public long recrawlIfOlder() {
        // returns a long (millis) that is the minimum age that
        // an entry must have to be re-crawled
@@ -185,6 +268,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return 0L;
        }
    }
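    // Since recrawlIfOlder() is an age in milliseconds, a caller would typically
    // derive a cutoff time and re-index documents indexed before it; a sketch
    // (getIndexDate() is a hypothetical accessor on an indexed document):
    //
    //   long cutoff = System.currentTimeMillis() - profile.recrawlIfOlder();
    //   boolean recrawl = doc.getIndexDate().getTime() < cutoff;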
    public int domMaxPages() {
        // this is the maximum number of pages that are crawled for a single domain
        // if -1, this means no limit
@@ -199,26 +283,31 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return Integer.MAX_VALUE;
        }
    }
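    // The comment above says -1 means "no limit", and the visible fallback path
    // returns Integer.MAX_VALUE, so callers can compare a per-domain page counter
    // against domMaxPages() without a special no-limit case:
    //
    //   if (pagesLoadedForDomain < profile.domMaxPages()) { /* keep crawling this domain */ }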
    public boolean crawlingQ() {
        final String r = get(CRAWLING_Q);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean pushSolr() {
        final String r = get(PUSH_SOLR);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean indexText() {
        final String r = get(INDEX_TEXT);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean indexMedia() {
        final String r = get(INDEX_MEDIA);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean storeHTCache() {
        final String r = get(STORE_HTCACHE);
        if (r == null) return false;
@@ -229,16 +318,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean excludeStaticStopwords() {
        final String r = get(XSSTOPW);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean excludeDynamicStopwords() {
        final String r = get(XDSTOPW);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean excludeParentStopwords() {
        final String r = get(XPSTOPW);
        if (r == null) return false;