extended crawling constraints:

- removed the never-used secondary crawl depth
- added a must-not-match filter that can be used to exclude URLs from a crawl
- added a stub for crawl tags, which will be used to identify search results produced by specific crawls

Please update the yacybar: replace the property name 'crawlFilter' with 'mustmatch'.
Additionally, a new parameter named 'mustnotmatch' can be used; by default it should be the empty string (match-never).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5342 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 96174b2b56
commit dba7ef5144
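
For illustration only (not part of this commit; class and method names are hypothetical), a minimal sketch of how the two filters are meant to combine, assuming the defaults introduced here: the must-match filter defaults to ".*" (MATCH_ALL) and the must-not-match filter defaults to the empty string (MATCH_NEVER), so a URL is stacked for crawling only if it matches the first pattern and does not match the second.

import java.util.regex.Pattern;

// Hypothetical sketch of the combined must-match / must-not-match check.
public class FilterSketch {

    static final String MATCH_ALL = ".*";  // default must-match: accept every URL
    static final String MATCH_NEVER = "";  // default must-not-match: exclude nothing

    // A URL is accepted only if it matches the must-match pattern
    // and does not match the must-not-match pattern.
    static boolean accept(String url, String mustMatch, String mustNotMatch) {
        Pattern mm = Pattern.compile(mustMatch == null ? MATCH_ALL : mustMatch);
        Pattern mnm = Pattern.compile(mustNotMatch == null ? MATCH_NEVER : mustNotMatch);
        return mm.matcher(url).matches() && !mnm.matcher(url).matches();
    }

    public static void main(String[] args) {
        // with the defaults every URL is accepted
        System.out.println(accept("http://example.org/a.html", MATCH_ALL, MATCH_NEVER)); // true
        // a must-not-match filter can exclude parts of a site, e.g. PDF files
        System.out.println(accept("http://example.org/doc.pdf", MATCH_ALL, ".*\\.pdf")); // false
    }
}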

@@ -30,7 +30,8 @@
<td><strong>Status</strong></td>
<td><strong>Start URL</strong></td>
<td><strong>Depth</strong></td>
<td><strong>Filter</strong></td>
<td><strong>Must Match</strong></td>
<td><strong>Must Not Match</strong></td>
<td><strong>MaxAge</strong></td>
<td><strong>Auto Filter Depth</strong></td>
<td><strong>Auto Filter Content</strong></td>
@@ -48,7 +49,8 @@
<td>#(status)#terminated::active#(/status)#</td>
<td><a href="#[startURL]#">#[startURL]#</a></td>
<td>#[depth]#</td>
<td>#[filter]#</td>
<td>#[mustmatch]#</td>
<td>#[mustnotmatch]#</td>
<td>#[crawlingIfOlder]#</td>
<td>#[crawlingDomFilterDepth]#</td>
<td>#{crawlingDomFilterContent}##[item]#<br />#{/crawlingDomFilterContent}#</td>

@@ -62,10 +62,8 @@ public class CrawlProfileEditor_p {
static {
labels.add(new eentry(entry.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(entry.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(entry.GENERAL_FILTER, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.SPECIFIC_FILTER, "Specific Filter", false, eentry.STRING));
labels.add(new eentry(entry.GENERAL_DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.SPECIFIC_DEPTH, "Specific Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.FILTER_MUSTMATCH, "General Filter", false, eentry.STRING));
labels.add(new eentry(entry.DEPTH, "General Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(entry.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@@ -214,8 +212,9 @@ public class CrawlProfileEditor_p {
prop.put("crawlProfiles_" + count + "_name", profile.name());
prop.putXML("crawlProfiles_" + count + "_startURL", profile.startURL());
prop.put("crawlProfiles_" + count + "_handle", profile.handle());
prop.put("crawlProfiles_" + count + "_depth", profile.generalDepth());
prop.put("crawlProfiles_" + count + "_filter", profile.generalFilter());
prop.put("crawlProfiles_" + count + "_depth", profile.depth());
prop.put("crawlProfiles_" + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put("crawlProfiles_" + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : ""+ SimpleDateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
prop.put("crawlProfiles_" + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));

@@ -6,7 +6,8 @@
<status>#(status)#terminated::active#(/status)#</status>
<starturl>#[startURL]#</starturl>
<depth>#[depth]#</depth>
<filter>#[filter]#</filter>
<mustmatch>#[mustmatch]#</mustmatch>
<mustnotmatch>#[mustnotmatch]#</mustnotmatch>
<crawlingIfOlder>#[crawlingIfOlder]#</crawlingIfOlder>
<crawlingDomFilterDepth>#[crawlingDomFilterDepth]#</crawlingDomFilterDepth>
<crawlingDomFilterContent>

@@ -100,18 +100,30 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="crawlingFilter">Crawling Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<input type="radio" name="range" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="crawlingFilter" id="crawlingFilter" type="text" size="20" maxlength="100" value="#[crawlingFilter]#" /><br />
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
<input type="radio" name="range" value="domain" />Restrict to start domain<br />
<input type="radio" name="range" value="subpath" />Restrict to sub-path
</td>
<td>
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled; default is 'catch all'.
The filter is an emacs-like regular expression that must match with the URLs which are used to be crawled;
default is 'catch all'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="80" maxlength="100" value="#[mustnotmatch]#" />
</td>
<td>
This filter must not match to allow that the page is accepted for crawling.
The empty string is a never-match filter which should do well for most cases.
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Re-crawl known URLs:</td>
<td>

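As a hypothetical usage example (values not taken from this commit): to restrict a crawl to the host example.org while skipping PDF documents, the two new form fields could be filled in as

    mustmatch    = .*example\.org.*
    mustnotmatch = .*\.pdf

With the defaults ('.*' for must-match, the empty string for must-not-match) the crawl behaves exactly as before this change.
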
@@ -24,6 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
@@ -44,7 +45,8 @@ public class CrawlStart_p {
prop.put("starturl", (intranet) ? repository : "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
prop.put("crawlingFilter", (intranet) ? repository + ".*" : ".*");
prop.put("mustmatch", (intranet) ? repository + ".*" : CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");

@@ -91,7 +91,8 @@ public class QuickCrawlLink_p {
final String title = post.get("title",null);
// getting other parameters if set
final String crawlingFilter = post.get("crawlingFilter", ".*");
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
final boolean indexText = post.get("indexText", "on").equals("on");
@@ -129,11 +130,11 @@ public class QuickCrawlLink_p {
try {
pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingFilter,
crawlingFilter,
CrawlingDepth,
CrawlingDepth,
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
crawlingMustMatch,
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction

@@ -123,16 +123,16 @@ public class WatchCrawler_p {
crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
// set the crawling filter
String newcrawlingfilter = post.get("crawlingFilter", ".*");
if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
// special cases:
if (crawlingStartURL!= null && fullDomain) {
newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*";
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*";
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
env.setConfig("crawlingFilter", newcrawlingfilter);
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
@@ -183,12 +183,12 @@ public class WatchCrawler_p {
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_crawlingStart", crawlingStart);
} else try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);
// stack request
// first delete old entry, if exists
@@ -201,8 +201,12 @@ public class WatchCrawler_p {
// stack url
sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
crawlingStartURL.getHost(), crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
crawlingStartURL.getHost(),
crawlingStartURL,
CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
@@ -270,7 +274,7 @@ public class WatchCrawler_p {
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -286,7 +290,7 @@ public class WatchCrawler_p {
final String fileName = post.get("crawlingFile");
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);
// loading the file content
final File file = new File(fileName);
@@ -306,7 +310,21 @@
// creating a crawler profile
final yacyURL crawlURL = new yacyURL("file://" + file.toString(), null);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(fileName, crawlURL, newcrawlingfilter, newcrawlingfilter, newcrawlingdepth, newcrawlingdepth, crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final CrawlProfile.entry profile = sb.webIndex.profilesActiveCrawls.newEntry(
fileName, crawlURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomFilterDepth,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw);
// pause local crawl here
sb.pauseCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@@ -333,7 +351,7 @@ public class WatchCrawler_p {
} catch (final PatternSyntaxException e) {
// print error message
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingfilter);
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
@@ -353,8 +371,10 @@ public class WatchCrawler_p {
// create a new profile
final CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
sitemapURLStr, sitemapURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
sitemapURLStr, sitemapURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,

@@ -104,7 +104,6 @@ public class sharedBlacklist_p {
final String Hash = post.get("hash");
// generate the download URL
String downloadURL = null;
String downloadURLOld = null;
if( sb.webIndex.seedDB != null ){ //no nullpointer error..
final yacySeed seed = sb.webIndex.seedDB.getConnected(Hash);
@@ -113,8 +112,6 @@ public class sharedBlacklist_p {
final String Port = seed.get(yacySeed.PORT, "8080");
final String peerName = seed.get(yacySeed.NAME, "<" + IP + ":" + Port + ">");
prop.putHTML("page_source", peerName);
downloadURL = "http://" + IP + ":" + Port + "/xml/blacklists.xml";
downloadURLOld = "http://" + IP + ":" + Port + "/yacy/list.html?col=black";
} else {
prop.put("status", STATUS_PEER_UNKNOWN);//YaCy-Peer not found

@@ -28,6 +28,8 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import de.anomic.kelondro.kelondroBLOB;
import de.anomic.kelondro.kelondroBLOBHeap;
@@ -43,6 +45,20 @@ import de.anomic.yacy.yacyURL;
public class CrawlProfile {
public static final String MATCH_ALL = ".*";
public static final String MATCH_NEVER = "";
public static final HashSet<String> NO_KEYWORDS = new HashSet<String>(0);
public static final HashSet<String> KEYWORDS_PROXY = word2set("xproxy");
public static final HashSet<String> KEYWORDS_REMOTE = word2set("xremote");
public static final HashSet<String> KEYWORDS_USER = word2set("xuser");
public static final HashSet<String> KEYWORDS_SNIPPET = word2set("xsnippet");
private static final HashSet<String> word2set(String word) {
HashSet<String> s = new HashSet<String>(1);
s.add(word);
return s;
}
static HashMap<String, Map<String, DomProfile>> domsCache = new HashMap<String, Map<String, DomProfile>>();
kelondroMap profileTable;
@@ -145,8 +161,11 @@ public class CrawlProfile {
return ne;
}
public entry newEntry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth,
public entry newEntry( final String name,
final yacyURL startURL,
final Set<String> keywords,
final String mustmatch, final String mustnotmatch,
final int generalDepth,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
@@ -154,8 +173,11 @@ public class CrawlProfile {
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw) {
final entry ne = new entry(name, startURL, generalFilter, specificFilter,
generalDepth, specificDepth,
final entry ne = new entry(
name, startURL,
keywords,
mustmatch, mustnotmatch,
generalDepth,
recrawlIfOlder, domFilterDepth, domMaxPages,
crawlingQ,
indexText, indexMedia,
@@ -235,10 +257,9 @@ public class CrawlProfile {
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String GENERAL_FILTER = "generalFilter";
public static final String SPECIFIC_FILTER = "specificFilter";
public static final String GENERAL_DEPTH = "generalDepth";
public static final String SPECIFIC_DEPTH = "specificDepth";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages";
@@ -254,10 +275,16 @@ public class CrawlProfile {
Map<String, String> mem;
private Map<String, DomProfile> doms;
private Pattern mustmatch = null, mustnotmatch = null;
public entry(final String name, final yacyURL startURL, final String generalFilter, final String specificFilter,
final int generalDepth, final int specificDepth,
final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
public entry(final String name, final yacyURL startURL,
final Set<String> keywords,
final String mustmatch,
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domFilterDepth, final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
@@ -269,10 +296,9 @@ public class CrawlProfile {
mem.put(HANDLE, handle);
mem.put(NAME, name);
mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
mem.put(GENERAL_FILTER, (generalFilter == null) ? ".*" : generalFilter);
mem.put(SPECIFIC_FILTER, (specificFilter == null) ? ".*" : specificFilter);
mem.put(GENERAL_DEPTH, Integer.toString(generalDepth));
mem.put(SPECIFIC_DEPTH, Integer.toString(specificDepth));
mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch);
mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch);
mem.put(DEPTH, Integer.toString(depth));
mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder));
mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
@@ -322,27 +348,24 @@ public class CrawlProfile {
final String r = mem.get(START_URL);
return r;
}
public String generalFilter() {
final String r = mem.get(GENERAL_FILTER);
if (r == null) return ".*";
return r;
}
public String specificFilter() {
final String r = mem.get(SPECIFIC_FILTER);
if (r == null) return ".*";
return r;
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = mem.get(FILTER_MUSTMATCH);
if (r == null) r = MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
return this.mustmatch;
}
public int generalDepth() {
final String r = mem.get(GENERAL_DEPTH);
if (r == null) return 0;
try {
return Integer.parseInt(r);
} catch (final NumberFormatException e) {
return 0;
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = mem.get(FILTER_MUSTNOTMATCH);
if (r == null) r = MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
return this.mustnotmatch;
}
public int specificDepth() {
final String r = mem.get(SPECIFIC_DEPTH);
public int depth() {
final String r = mem.get(DEPTH);
if (r == null) return 0;
try {
return Integer.parseInt(r);
@@ -497,4 +520,5 @@ public class CrawlProfile {
return domname;
}
}
}

@@ -232,8 +232,9 @@ public class CrawlQueues {
+ ", initiator=" + urlEntry.initiator()
+ ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
+ ", depth=" + urlEntry.depth()
+ ", crawlDepth=" + profile.generalDepth()
+ ", filter=" + profile.generalFilter()
+ ", crawlDepth=" + profile.depth()
+ ", must-match=" + profile.mustMatchPattern().toString()
+ ", must-not-match=" + profile.mustNotMatchPattern().toString()
+ ", permission=" + ((sb.webIndex.seedDB == null) ? "undefined" : (((sb.webIndex.seedDB.mySeed().isSenior()) || (sb.webIndex.seedDB.mySeed().isPrincipal())) ? "true" : "false")));
processLocalCrawling(urlEntry, stats);

@@ -409,14 +409,22 @@ public final class CrawlStacker extends Thread {
return errorMsg;
}
// filter deny
if ((entry.depth() > 0) && (!(entry.url().toString().matches(profile.generalFilter())))) {
reason = "url does not match general filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match crawling filter '" + profile.generalFilter() + "'. " +
// filter with must-match
if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url does not match must-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// filter with must-not-match
if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url matches must-not-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// deny cgi
if (entry.url().isCGI()) {
reason = "cgi url not allowed";
@@ -486,7 +494,7 @@ public final class CrawlStacker extends Thread {
final boolean remote = profile.handle().equals(this.sb.webIndex.defaultRemoteProfile.handle());
final boolean global =
(profile.remoteIndexing()) /* granted */ &&
(entry.depth() == profile.generalDepth()) /* leaf node */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
(sb.webIndex.seedDB.mySeed().isSenior()) ||

@@ -330,26 +330,27 @@ public class SitemapParser extends DefaultHandler {
}
private CrawlProfile.entry createProfile(final String domainName, final yacyURL sitemapURL) {
return this.sb.webIndex.profilesActiveCrawls.newEntry(domainName, sitemapURL,
// crawlingFilter
".*", ".*",
// Depth
0, 0,
// force recrawling
0,
// disable Auto-Dom-Filter
-1, -1,
// allow crawling of dynamic URLs
true,
// index text + media
true, true,
// don't store downloaded pages to Web Cache
false,
// store to TX cache
true,
// remote Indexing disabled
false,
// exclude stop-words
true, true, true);
return this.sb.webIndex.profilesActiveCrawls.newEntry(
domainName, sitemapURL, CrawlProfile.KEYWORDS_USER,
// crawling Filter
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
// Depth
0,
// force recrawling
0,
// disable Auto-Dom-Filter
-1, -1,
// allow crawling of dynamic URLs
true,
// index text + media
true, true,
// don't store downloaded pages to Web Cache
false,
// store to TX cache
true,
// remote Indexing disabled
false,
// exclude stop-words
true, true, true);
}
}

@@ -226,22 +226,22 @@ public class bookmarksDB {
int pos = 0;
// set crawlingStart to BookmarkUrl
String crawlingStart = bm.getUrl();
String newcrawlingfilter = crawlingfilter;
String newcrawlingMustMatch = crawlingfilter;
yacyURL crawlingStartURL = new yacyURL(crawlingStart, null);
// set the crawling filter
if (newcrawlingfilter.length() < 2) newcrawlingfilter = ".*"; // avoid that all urls are filtered out if bad value was submitted
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = ".*"; // avoid that all urls are filtered out if bad value was submitted
if (crawlingStartURL!= null && newcrawlingfilter.equals("dom")) {
newcrawlingfilter = ".*" + crawlingStartURL.getHost() + ".*";
if (crawlingStartURL!= null && newcrawlingMustMatch.equals("dom")) {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
if (crawlingStart!= null && newcrawlingfilter.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingfilter = crawlingStart.substring(0, pos + 1) + ".*";
if (crawlingStart!= null && newcrawlingMustMatch.equals("sub") && (pos = crawlingStart.lastIndexOf("/")) > 0) {
newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
}
// check if the crawl filter works correctly
Pattern.compile(newcrawlingfilter);
Pattern.compile(newcrawlingMustMatch);
String urlhash = crawlingStartURL.hash();
sb.webIndex.removeURL(urlhash);
@@ -251,8 +251,10 @@ public class bookmarksDB {
// stack url
sb.webIndex.profilesPassiveCrawls.removeEntry(crawlingStartURL.hash()); // if there is an old entry, delete it
CrawlProfile.entry pe = sb.webIndex.profilesActiveCrawls.newEntry(
folder+"/"+crawlingStartURL, crawlingStartURL, newcrawlingfilter, newcrawlingfilter,
newcrawlingdepth, newcrawlingdepth,
folder+"/"+crawlingStartURL, crawlingStartURL, CrawlProfile.KEYWORDS_USER,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
sb.webIndex.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,

@@ -1558,8 +1558,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
", depth=" + entry.depth() +
", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().generalDepth())) +
", filter=" + ((entry.profile() == null) ? "null" : entry.profile().generalFilter()) +
", maxDepth=" + ((entry.profile() == null) ? "null" : Integer.toString(entry.profile().depth())) +
", must-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustMatchPattern().toString()) +
", must-not-match=" + ((entry.profile() == null) ? "null" : entry.profile().mustNotMatchPattern().toString()) +
", initiatorHash=" + entry.initiator() +
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url=" + entry.url()); // DEBUG
@@ -1591,7 +1592,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final long stackStartTime = System.currentTimeMillis();
if (
((processCase == plasmaSwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) || (processCase == plasmaSwitchboardConstants.PROCESSCASE_5_LOCAL_CRAWLING)) &&
((entry.profile() == null) || (entry.depth() < entry.profile().generalDepth()))
((entry.profile() == null) || (entry.depth() < entry.profile().depth()))
) {
final Map<yacyURL, String> hl = document.getHyperlinks();
final Iterator<Map.Entry<yacyURL, String>> i = hl.entrySet().iterator();

@@ -279,8 +279,7 @@ public final class plasmaWordIndex implements indexRI {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, ".*", ".*",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.KEYWORDS_PROXY, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@@ -290,27 +289,27 @@ public final class plasmaWordIndex implements indexRI {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, ".*", ".*", 0, 0,
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.KEYWORDS_REMOTE, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, ".*", ".*", 0, 0,
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, ".*", ".*", 0, 0,
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false);
}
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, ".*", ".*", 0, 0,
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, ".*", ".*", 0, 0,
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.KEYWORDS_SNIPPET, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false);
}
}

@@ -37,12 +37,12 @@ public class urlRedirectord implements serverHandler, Cloneable {
// name
"URL Redirector",
// start URL
null,
null,
// keywords
CrawlProfile.KEYWORDS_USER,
// crawling filter
".*",
".*",
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
// depth
0,
0,
// recrawlIfOlder (minutes), if negative: do not re-crawl
-1,
