fixed default must-match filter for full domain crawls - the old filter was too restrictive and did not allow intranet crawls
pull/1/head
Michael Christen 13 years ago
parent 3e61287326
commit 22f05c83ff

@@ -482,16 +482,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (crawlingStartURL.isFile()) {
             return "file://" + crawlingStartURL.getPath() + ".*";
         } else if (crawlingStartURL.isSMB()) {
-            return "smb://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+            return "smb://" + crawlingStartURL.getHost() + ".*";
         } else if (crawlingStartURL.isFTP()) {
-            return "ftp://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+            return "ftp://" + crawlingStartURL.getHost() + ".*";
         } else {
             final String host = crawlingStartURL.getHost();
             if (host.startsWith("www.")) {
-                return "https?://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+                return "https?://" + crawlingStartURL.getHost() + ".*";
             } else {
                 // if the www is not given we accept that also
-                return "https?://(?:www.)?" + crawlingStartURL.getHost() + "(?:/|$)+.*";
+                return "https?://(?:www.)?" + crawlingStartURL.getHost() + ".*";
             }
         }
     }
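
A minimal standalone sketch (not from the YaCy sources; the host name "intranet" and the URLs are made-up examples) of why the old default suffix was too strict: with "(?:/|$)+.*" the filter only accepts URLs where the host is followed directly by a slash or the end of the string, so an intranet start point with an explicit port never matches, while the plain ".*" suffix accepts it.

    import java.util.regex.Pattern;

    // Illustration only: compares the old and new default must-match filters
    // for a hypothetical intranet start host that listens on a non-default port.
    public class MustMatchFilterDemo {
        public static void main(String[] args) {
            String url = "http://intranet:8080/wiki/Start";           // assumed intranet URL

            Pattern oldFilter = Pattern.compile("https?://(?:www.)?intranet(?:/|$)+.*");
            Pattern newFilter = Pattern.compile("https?://(?:www.)?intranet.*");

            System.out.println(oldFilter.matcher(url).matches());     // false - the ':8080' breaks the match
            System.out.println(newFilter.matcher(url).matches());     // true  - the full domain crawl proceeds
        }
    }

The looser ".*" suffix also admits URLs such as "http://intranet.example.com/...", which is the trade-off accepted here in favour of making intranet crawls work at all.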

@@ -414,13 +414,13 @@ public final class CrawlStacker {
         // filter with must-match for URLs
         if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
             if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return "url does not match must-match filter";
+            return "url does not match must-match filter " + profile.urlMustMatchPattern().toString();
         }
         // filter with must-not-match for URLs
         if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
             if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
-            return "url matches must-not-match filter";
+            return "url matches must-not-match filter " + profile.urlMustNotMatchPattern().toString();
         }
         // deny cgi
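
A hedged illustration of what the extended rejection reason buys (the helper, the profile pattern and the URL below are assumptions for the sketch, not code from CrawlStacker): because the returned string now carries the offending pattern, the reason shown for a rejected URL identifies the responsible filter even when fine logging is disabled.

    import java.util.regex.Pattern;

    // Hypothetical stand-in for the must-match check; it only mimics the shape
    // of the reason string returned after this commit.
    public class RejectReasonDemo {
        static String checkMustMatch(String urlstring, Pattern mustMatch, int depth) {
            if ((depth > 0) && !mustMatch.matcher(urlstring).matches()) {
                // the reason now includes the pattern itself
                return "url does not match must-match filter " + mustMatch.toString();
            }
            return null; // accepted
        }

        public static void main(String[] args) {
            Pattern filter = Pattern.compile("https?://(?:www.)?intranet.*"); // assumed profile filter
            System.out.println(checkMustMatch("http://other-host/page", filter, 1));
            // -> url does not match must-match filter https?://(?:www.)?intranet.*
        }
    }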
