From ee89cf5ae58bb5afbcef2ed8f207da5a6f247049 Mon Sep 17 00:00:00 2001
From: Lotus <lotus@localhost>
Date: Tue, 7 Feb 2012 16:13:13 +0100
Subject: [PATCH] Fix must-match filter for full-domain crawl

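The host in the generated must-match pattern now has to be followed by
"/" or by the end of the URL.

allow: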
http://www.example.com
http://www.example.com/
http://www.example.com/abc.html?xyz=q
block (URLs whose host only starts with the crawl start host):
http://www.example.com.cn
http://www.example.com.cn/dsf
---
 source/de/anomic/crawler/CrawlProfile.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 5b667dd6c..7e92a52c7 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -482,16 +482,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (crawlingStartURL.isFile()) {
             return "file://" + crawlingStartURL.getPath() + ".*";
         } else if (crawlingStartURL.isSMB()) {
-            return "smb://" + crawlingStartURL.getHost() + ".*";
+            return "smb://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
         } else if (crawlingStartURL.isFTP()) {
-            return "ftp://" + crawlingStartURL.getHost() + ".*";
+            return "ftp://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
         } else {
             final String host = crawlingStartURL.getHost();
             if (host.startsWith("www.")) {
-                return "https?://" + crawlingStartURL.getHost() + ".*";
+                return "https?://" + crawlingStartURL.getHost() + "(?:/|$)+.*";
             } else {
                 // if the www is not given we accept that also
-                return "https?://(www.)?" + crawlingStartURL.getHost() + ".*";
+                return "https?://(?:www.)?" + crawlingStartURL.getHost() + "(?:/|$)+.*";
             }
         }
     }