added crawl settings for three new filters for each crawl:

must-match for IPs (the IPs are known after DNS resolution of each URL in the crawl queue)
must-not-match for IPs
must-match against a list of country codes (allows loading only from hosts that are hosted in the given countries)

note: the settings and input environment are included with this commit, but the values are not yet evaluated

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7976 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 47a8c69745
commit 5ad7f9612b
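
The note above says the new values are not yet evaluated in this commit. As a rough, hypothetical sketch of how the IP filters could later be applied once the crawler wires them in (all class and method names below are invented for illustration and are not YaCy API):

// Hypothetical evaluation sketch, not part of this commit; names are invented.
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.regex.Pattern;

public class IPFilterSketch {
    private final Pattern ipMustMatch;    // compiled from crawlingIPMustMatch
    private final Pattern ipMustNotMatch; // compiled from crawlingIPMustNotMatch

    public IPFilterSketch(final String mustMatch, final String mustNotMatch) {
        this.ipMustMatch = Pattern.compile(mustMatch);
        this.ipMustNotMatch = Pattern.compile(mustNotMatch);
    }

    // resolve the host of a queued URL and test its IP against both filters
    public boolean ipAllowed(final String host) {
        try {
            final String ip = InetAddress.getByName(host).getHostAddress();
            return this.ipMustMatch.matcher(ip).matches()
                && !this.ipMustNotMatch.matcher(ip).matches();
        } catch (final UnknownHostException e) {
            return false; // an unresolvable host cannot pass an IP filter
        }
    }
}

With the defaults introduced below (must-match ".*", must-not-match empty), such a check would leave the crawl unrestricted.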

@ -557,6 +557,12 @@ xpstopw=true
# Change to false if requesting hits from peers with modified stopwords-file and using the unchanged client-version
filterOutStopwordsFromTopwords=true
# crawling steering: must-match/must-not-match
crawlingIPMustMatch=.*
crawlingIPMustNotMatch=
# the default country codes are all codes for countries in Europe
crawlingCountryMustMatch=AD,AL,AT,BA,BE,BG,BY,CH,CY,CZ,DE,DK,EE,ES,FI,FO,FR,GG,GI,GR,HR,HU,IE,IM,IS,IT,JE,LI,LT,LU,LV,MC,MD,MK,MT,NL,NO,PL,PT,RO,RU,SE,SI,SJ,SK,SM,TR,UA,UK,VA,YU
# performance-settings
# delay-times for permanent loops (milliseconds)
# the idlesleep is the pause that a process sleeps if the last call to the
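
For illustration only, hypothetical override values for the new keys (not the shipped defaults above) could exclude hosts resolving into 10.0.0.0/8 and allow only servers located in Germany, Austria, or Switzerland:

# hypothetical example values, not the shipped defaults
crawlingIPMustMatch=.*
crawlingIPMustNotMatch=10\..*
crawlingCountryMustMatch=DE,AT,CH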

@ -86,8 +86,8 @@ public class CrawlProfileEditor_p {
static {
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@ -159,8 +159,8 @@ public class CrawlProfileEditor_p {
if ((post != null) && (selentry != null)) {
if (post.containsKey("submit")) {
try {
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
final Iterator<eentry> lit = labels.iterator();
eentry tee;
while (lit.hasNext()) {
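
The Pattern.compile calls in the hunk above validate the submitted filter strings before they are stored. A minimal standalone sketch of that validation idea (hypothetical helper, not YaCy code), assuming invalid input should fall back to a safe default:

import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

final class RegexValidationSketch {
    // return the user-supplied regex if it compiles, otherwise a safe default
    static String validRegexOrDefault(final String userRegex, final String fallback) {
        try {
            Pattern.compile(userRegex);
            return userRegex;
        } catch (final PatternSyntaxException e) {
            return fallback; // e.g. ".*" for must-match, "" for must-not-match
        }
    }
}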

@ -136,7 +136,7 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter for URLs</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
@ -151,7 +151,7 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
@ -162,6 +162,37 @@
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustmatch" id="ipMustmatch" type="text" size="60" maxlength="100" value="#[ipMustmatch]#" />
</td>
<td>
Like the Must-Match Filter for URLs, this filter must match, but it is applied to the IP of the host.
YaCy performs a DNS lookup for each host, and this filter restricts the crawl to specific IPs.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="ipMustnotmatch">Must-Not-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="60" maxlength="100" value="#[ipMustnotmatch]#" />
</td>
<td>
This filter must not match on the IP of the crawled host.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input name="crawlingCountryMustMatch" id="crawlingCountryMustMatch" type="text" size="60" maxlength="100" value="#[crawlingCountryMustMatch]#" />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
</td>
<td>
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expression but a comma-separated list of country codes.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Maximum Pages per Domain:</td>
<td>

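The help text above stresses that the country filter is a plain comma-separated list rather than a regular expression. A minimal, hypothetical sketch (not YaCy code) of parsing such a list and checking a country code against it:

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

// Hypothetical helper illustrating the comma-separated country-code list:
// parse the list once, then test the code derived from the host's IP against it.
final class CountryListSketch {
    private final Set<String> codes = new HashSet<String>();

    CountryListSketch(final String commaSeparated) {
        for (final String cc : commaSeparated.split(",")) {
            final String t = cc.trim().toUpperCase(Locale.ROOT);
            if (!t.isEmpty()) this.codes.add(t);
        }
    }

    // an empty list means "no restriction", matching the default in this commit
    boolean allows(final String countryCode) {
        return this.codes.isEmpty() || this.codes.contains(countryCode.toUpperCase(Locale.ROOT));
    }
}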
@ -25,6 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import de.anomic.crawler.CrawlProfile;
import de.anomic.server.serverObjects;
@ -34,7 +35,7 @@ public class CrawlStartExpert_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
//final Switchboard sb = (Switchboard) env;
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
// define visible variables
@ -43,6 +44,9 @@ public class CrawlStartExpert_p {
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER));
prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");

@ -156,6 +156,14 @@ public class Crawler_p {
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER);
if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL;
final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("countryMustMatchList", "") : "";
sb.setConfig("crawlingIPMustMatch", ipMustMatch);
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
// special cases:
if (crawlingStartURL!= null && fullDomain) {
if (crawlingStartURL.isFile()) {
@ -249,7 +257,10 @@ public class Crawler_p {
crawlingStart,
crawlingStartURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -306,6 +317,9 @@ public class Crawler_p {
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -426,6 +440,9 @@ public class Crawler_p {
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -463,6 +480,9 @@ public class Crawler_p {
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
0,
crawlingIfOlder,
crawlingDomMaxPages,
@ -504,6 +524,9 @@ public class Crawler_p {
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,

@ -149,6 +149,9 @@ public class QuickCrawlLink_p {
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingMustMatch,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
"",
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month

@ -48,8 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
@ -63,6 +61,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
public static final String FILTER_URL_MUSTMATCH = "generalFilter"; // for URLs
public static final String FILTER_URL_MUSTNOTMATCH = "nevermatch"; // for URLs
public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch";
public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch";
public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch";
private Pattern mustmatch = null, mustnotmatch = null;
@ -70,8 +73,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* Constructor which creates a CrawlProfile from parameters.
* @param name name of the crawl profile
* @param startURL root URL of the crawl
* @param mustmatch URLs which do not match this regex will be ignored
* @param mustnotmatch URLs which match this regex will be ignored
* @param urlMustMatch URLs which do not match this regex will be ignored
* @param urlMustNotMatch URLs which match this regex will be ignored
* @param depth height of the tree which will be created by the crawler
* @param recrawlIfOlder documents which have been indexed in the past will
* be indexed again if they are older than the time (ms) in this parameter
@ -89,8 +92,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final String urlMustMatch,
final String urlMustNotMatch,
final String ipMustMatch,
final String ipMustNotMatch,
final String countryMustMatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
@ -113,8 +119,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(HANDLE, handle);
put(NAME, name);
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch);
put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch);
put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch);
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
@ -200,7 +209,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
*/
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = get(FILTER_MUSTMATCH);
String r = get(FILTER_URL_MUSTMATCH);
if (r == null) r = CrawlProfile.MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
@ -213,7 +222,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
*/
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = get(FILTER_MUSTNOTMATCH);
String r = get(FILTER_URL_MUSTNOTMATCH);
if (r == null) r = CrawlProfile.MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
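
This commit stores the new keys but adds no accessors for them. By analogy with mustMatchPattern() and mustNotMatchPattern() shown above, a follow-up could add lazily compiled getters to CrawlProfile; the following is a hypothetical sketch, not part of this commit:

// hypothetical follow-up, mirroring the existing lazily compiled URL patterns
private Pattern ipmustmatch = null, ipmustnotmatch = null;

public Pattern ipMustMatchPattern() {
    if (this.ipmustmatch == null) {
        String r = get(FILTER_IP_MUSTMATCH);
        if (r == null) r = CrawlProfile.MATCH_ALL;
        this.ipmustmatch = Pattern.compile(r);
    }
    return this.ipmustmatch;
}

public Pattern ipMustNotMatchPattern() {
    if (this.ipmustnotmatch == null) {
        String r = get(FILTER_IP_MUSTNOTMATCH);
        if (r == null) r = CrawlProfile.MATCH_NEVER;
        this.ipmustnotmatch = Pattern.compile(r);
    }
    return this.ipmustnotmatch;
}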

@ -63,7 +63,8 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
private Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls, profilesInvalidCrawls;
private Map<byte[], Map<String, String>> profilesActiveCrawls;
private final Map<byte[], Map<String, String>> profilesPassiveCrawls, profilesInvalidCrawls;
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
@ -97,18 +98,18 @@ public final class CrawlSwitchboard {
for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
final CrawlProfile p;
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTMATCH))) {
this.removeActive(handle);
this.putInvalid(handle, p);
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
removeActive(handle);
putInvalid(handle, p);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTNOTMATCH))) {
this.putInvalid(handle, p);
this.removeActive(handle);
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
putInvalid(handle, p);
removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTNOTMATCH));
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
} else {
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
@ -227,7 +228,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile(
"proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"proxy", null,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@ -239,38 +243,38 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", CrawlProfile.MATCH_NEVER, 0,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
}
