added crawl settings for three new filters for each crawl:

must-match for IPs (the IPs are known after DNS resolution of each URL in the crawl queue)
must-not-match for IPs
must-match against a list of country codes (allows loading only from hosts that are hosted in the given countries)

note: the settings and the input environment are included with this commit, but the values are not yet evaluated (a sketch of how they could be evaluated follows below the commit metadata)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7976 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 47a8c69745
commit 5ad7f9612b
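As a minimal sketch of how the two new IP filters could be evaluated once their values are used (this is not part of the commit; the class and method names are assumptions for illustration only), an address would pass only if it matches the must-match regex and does not match the must-not-match regex, mirroring the URL filter semantics:

import java.net.InetAddress;
import java.util.regex.Pattern;

// Hypothetical helper, not part of this commit: applies the two IP filters
// to a host address that has already been resolved via DNS.
public class IPFilterSketch {

    public static boolean ipAllowed(final InetAddress address,
                                    final String ipMustMatch,
                                    final String ipMustNotMatch) {
        final String ip = address.getHostAddress();
        // the IP must match the must-match pattern (default ".*" accepts everything) ...
        if (!Pattern.compile(ipMustMatch).matcher(ip).matches()) return false;
        // ... and must not match the must-not-match pattern (empty by default)
        return !Pattern.compile(ipMustNotMatch).matcher(ip).matches();
    }

    public static void main(final String[] args) throws Exception {
        final InetAddress address = InetAddress.getByName("192.0.2.10");
        System.out.println(ipAllowed(address, "192\\.0\\.2\\..*", "")); // true
        System.out.println(ipAllowed(address, ".*", "192\\.0\\..*"));   // false
    }
}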

@ -557,6 +557,12 @@ xpstopw=true
# Change to false if requesting hits from peers with modified stopwords-file and using the unchanged client-version
filterOutStopwordsFromTopwords=true
# crawling steering: must-match/must-not-match
crawlingIPMustMatch=.*
crawlingIPMustNotMatch=
# the default country codes are all codes for countries in Europe
crawlingCountryMustMatch=AD,AL,AT,BA,BE,BG,BY,CH,CY,CZ,DE,DK,EE,ES,FI,FO,FR,GG,GI,GR,HR,HU,IE,IM,IS,IT,JE,LI,LT,LU,LV,MC,MD,MK,MT,NL,NO,PL,PT,RO,RU,SE,SI,SJ,SK,SM,TR,UA,UK,VA,YU
# performance-settings
# delay-times for permanent loops (milliseconds)
# the idlesleep is the pause that a process sleeps if the last call to the
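For illustration only (a hypothetical configuration, not part of the commit), a peer that keeps the default IP must-match, excludes private address ranges, and restricts crawling to hosts located in Germany, Austria, or Switzerland would set the three new keys like this:

crawlingIPMustMatch=.*
crawlingIPMustNotMatch=(10\..*|127\..*|192\.168\..*)
crawlingCountryMustMatch=DE,AT,CH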

@ -86,8 +86,8 @@ public class CrawlProfileEditor_p {
static {
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@ -159,8 +159,8 @@ public class CrawlProfileEditor_p {
if ((post != null) && (selentry != null)) {
if (post.containsKey("submit")) {
try {
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
final Iterator<eentry> lit = labels.iterator();
eentry tee;
while (lit.hasNext()) {

@ -136,7 +136,7 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter for URLs</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
@ -151,7 +151,7 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
@ -162,6 +162,37 @@
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustmatch" id="ipMustmatch" type="text" size="60" maxlength="100" value="#[ipMustmatch]#" />
</td>
<td>
Like the Must-Match Filter for URLs, this filter must match, but it is applied to the IP of the host.
YaCy performs a DNS lookup for each host, and this filter restricts the crawl to specific IPs.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="ipMustnotmatch">Must-Not-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="60" maxlength="100" value="#[ipMustnotmatch]#" />
</td>
<td>
This filter must not match on the IP of the crawled host.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input name="crawlingCountryMustMatch" id="crawlingCountryMustMatch" type="text" size="60" maxlength="100" value="#[crawlingCountryMustMatch]#" />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
</td>
<td>
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by commas.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Maximum Pages per Domain:</td>
<td>
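A minimal sketch of how the country-code list described above could be checked once the values are evaluated (not part of this commit; the IP-to-country lookup is only assumed to exist and is represented here by a plain parameter):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Hypothetical helper, not part of this commit: checks a host's country code
// (as derived from its IP by some geolocation lookup) against the
// comma-separated crawlingCountryMustMatch list.
public class CountryFilterSketch {

    public static boolean countryAllowed(final String countryMustMatch, final String hostCountryCode) {
        if (countryMustMatch == null || countryMustMatch.isEmpty()) return true; // no restriction configured
        if (hostCountryCode == null) return false; // unknown country: reject while a restriction is active
        final Set<String> allowed = new HashSet<String>(
                Arrays.asList(countryMustMatch.toUpperCase().split("\\s*,\\s*")));
        return allowed.contains(hostCountryCode.toUpperCase());
    }

    public static void main(final String[] args) {
        final String allowedCountries = "DE,AT,CH";
        System.out.println(countryAllowed(allowedCountries, "DE")); // true
        System.out.println(countryAllowed(allowedCountries, "US")); // false
        System.out.println(countryAllowed("", "US"));               // true: empty list means no restriction
    }
}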

@ -9,7 +9,7 @@
// $LastChangedBy: orbiter $
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -25,32 +25,36 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import de.anomic.crawler.CrawlProfile;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class CrawlStartExpert_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
//final Switchboard sb = (Switchboard) env;
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
// define visible variables
prop.put("starturl", /*(intranet) ? repository :*/ "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER));
prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");
prop.put("crawlingIfOlderUnitMonthCheck", "0");
prop.put("crawlingIfOlderUnitDayCheck", "1");
prop.put("crawlingIfOlderUnitHourCheck", "0");
prop.put("crawlingIfOlderNumber", "7");
final int crawlingDomFilterDepth = env.getConfigInt("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
@ -62,18 +66,18 @@ public class CrawlStartExpert_p {
prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0");
prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0");
prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", true) ? "1" : "0");
final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L);
final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0");
prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0");
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : "");
prop.put("xsstopwChecked", env.getConfigBool("xsstopw", true) ? "1" : "0");
prop.put("xdstopwChecked", env.getConfigBool("xdstopw", true) ? "1" : "0");
prop.put("xpstopwChecked", env.getConfigBool("xpstopw", true) ? "1" : "0");
// return rewrite properties
return prop;
}

@ -156,6 +156,14 @@ public class Crawler_p {
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all URLs are filtered out if a bad value was submitted
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER);
if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL;
final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("crawlingCountryMustMatch", "") : "";
sb.setConfig("crawlingIPMustMatch", ipMustMatch);
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
// special cases:
if (crawlingStartURL!= null && fullDomain) {
if (crawlingStartURL.isFile()) {
@ -249,7 +257,10 @@ public class Crawler_p {
crawlingStart,
crawlingStartURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -306,6 +317,9 @@ public class Crawler_p {
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -426,6 +440,9 @@ public class Crawler_p {
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -463,6 +480,9 @@ public class Crawler_p {
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
0,
crawlingIfOlder,
crawlingDomMaxPages,
@ -504,6 +524,9 @@ public class Crawler_p {
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,

@ -149,6 +149,9 @@ public class QuickCrawlLink_p {
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingMustMatch,
crawlingMustNotMatch,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
"",
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month

@ -48,8 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
@ -63,6 +61,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
public static final String FILTER_URL_MUSTMATCH = "generalFilter"; // for URLs
public static final String FILTER_URL_MUSTNOTMATCH = "nevermatch"; // for URLs
public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch";
public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch";
public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch";
private Pattern mustmatch = null, mustnotmatch = null;
@ -70,8 +73,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* Constructor which creates CrawlPofile from parameters.
* @param name name of the crawl profile
* @param startURL root URL of the crawl
* @param mustmatch URLs which do not match this regex will be ignored
* @param mustnotmatch URLs which match this regex will be ignored
* @param urlMustMatch URLs which do not match this regex will be ignored
* @param urlMustNotMatch URLs which match this regex will be ignored
* @param depth height of the tree which will be created by the crawler
* @param recrawlIfOlder documents which have been indexed in the past will
* be indexed again if they are older than the time (ms) in this parameter
@ -89,8 +92,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final String urlMustMatch,
final String urlMustNotMatch,
final String ipMustMatch,
final String ipMustNotMatch,
final String countryMustMatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
@ -107,14 +113,17 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty");
}
final String handle = (startURL == null)
final String handle = (startURL == null)
? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength)
: ASCII.String(startURL.hash());
put(HANDLE, handle);
put(NAME, name);
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch);
put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch);
put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch);
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
@ -137,7 +146,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
}
/**
* Adds a parameter to CrawlProfile.
* @param key name of the parameter
@ -174,7 +183,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
//if (r == null) return null;
return r;
}
/**
* Gets the name of the CrawlProfile.
* @return name of the profile
@ -184,7 +193,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return "";
return r;
}
/**
* Gets the root URL of the crawl job.
* @return root URL
@ -193,35 +202,35 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String r = get(START_URL);
return r;
}
/**
* Gets the regex which must be matched by URLs in order to be crawled.
* @return regex which must be matched
*/
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = get(FILTER_MUSTMATCH);
String r = get(FILTER_URL_MUSTMATCH);
if (r == null) r = CrawlProfile.MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
return this.mustmatch;
}
/**
* Gets the regex which must not be matched by URLs in order to be crawled.
* @return regex which must not be matched
*/
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = get(FILTER_MUSTNOTMATCH);
String r = get(FILTER_URL_MUSTNOTMATCH);
if (r == null) r = CrawlProfile.MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
return this.mustnotmatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
* @return depth of crawl job
*/
@ -235,7 +244,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0;
}
}
public CacheStrategy cacheStrategy() {
final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFEXIST;
@ -246,11 +255,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return CacheStrategy.IFEXIST;
}
}
public void setCacheStrategy(final CacheStrategy newStrategy) {
put(CACHE_STRAGEGY, newStrategy.toString());
}
/**
* Gets the minimum age that an entry must have to be re-crawled.
* @return time in ms
@ -268,7 +277,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0L;
}
}
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
@ -283,31 +292,31 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return Integer.MAX_VALUE;
}
}
public boolean crawlingQ() {
final String r = get(CRAWLING_Q);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean pushSolr() {
final String r = get(PUSH_SOLR);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexText() {
final String r = get(INDEX_TEXT);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexMedia() {
final String r = get(INDEX_MEDIA);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
final String r = get(STORE_HTCACHE);
if (r == null) return false;
@ -318,19 +327,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeStaticStopwords() {
final String r = get(XSSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeDynamicStopwords() {
final String r = get(XDSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeParentStopwords() {
final String r = get(XPSTOPW);
if (r == null) return false;

@ -63,7 +63,8 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
private Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls, profilesInvalidCrawls;
private Map<byte[], Map<String, String>> profilesActiveCrawls;
private final Map<byte[], Map<String, String>> profilesPassiveCrawls, profilesInvalidCrawls;
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
@ -91,28 +92,28 @@ public final class CrawlSwitchboard {
final File profilesInvalidFile = new File(queuesRoot, DBFILE_INVALID_CRAWL_PROFILES);
this.profilesInvalidCrawls = loadFromDB(profilesInvalidFile);
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
final CrawlProfile p;
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTMATCH))) {
this.removeActive(handle);
this.putInvalid(handle, p);
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
removeActive(handle);
putInvalid(handle, p);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTNOTMATCH))) {
this.putInvalid(handle, p);
this.removeActive(handle);
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
putInvalid(handle, p);
removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTNOTMATCH));
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
} else {
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
}
initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries");
@ -134,7 +135,7 @@ public final class CrawlSwitchboard {
if (m == null) return null;
return new CrawlProfile(m);
}
public CrawlProfile getInvalid(final byte[] profileKey) {
if (profileKey == null) return null;
final Map<String, String> m = this.profilesInvalidCrawls.get(profileKey);
@ -152,7 +153,7 @@ public final class CrawlSwitchboard {
public Set<byte[]> getActive() {
return this.profilesActiveCrawls.keySet();
}
public Set<byte[]> getInvalid() {
return this.profilesInvalidCrawls.keySet();
}
@ -165,7 +166,7 @@ public final class CrawlSwitchboard {
if (profileKey == null) return;
this.profilesActiveCrawls.remove(profileKey);
}
public void removeInvalid(final byte[] profileKey) {
if (profileKey == null) return;
this.profilesInvalidCrawls.remove(profileKey);
@ -179,7 +180,7 @@ public final class CrawlSwitchboard {
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile);
}
public void putInvalid(final byte[] profileKey, final CrawlProfile profile) {
this.profilesInvalidCrawls.put(profileKey, profile);
}
@ -227,7 +228,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile(
"proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"proxy", null,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@ -239,38 +243,38 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
}
@ -324,8 +328,8 @@ public final class CrawlSwitchboard {
((MapHeap) this.profilesInvalidCrawls).close();
((MapHeap) this.profilesPassiveCrawls).close();
}
/**
* Loads crawl profiles from a DB file.
* @param file DB file
