From ee89cf5ae58bb5afbcef2ed8f207da5a6f247049 Mon Sep 17 00:00:00 2001 From: Lotus Date: Tue, 7 Feb 2012 16:13:13 +0100 Subject: [PATCH] Fix the must-match filter for full-domain crawls so that it no longer matches hosts that merely begin with the start host. Allowed: http://www.example.com http://www.example.com/ http://www.example.com/abc.html?xyz=q Blocked: http://www.example.com.cn http://www.example.com.cn/dsf --- source/de/anomic/crawler/CrawlProfile.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 5b667dd6c..7e92a52c7 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -482,16 +482,16 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (crawlingStartURL.isFile()) { return "file://" + crawlingStartURL.getPath() + ".*"; } else if (crawlingStartURL.isSMB()) { - return "smb://" + crawlingStartURL.getHost() + ".*"; + return "smb://" + crawlingStartURL.getHost() + "(?:/|$)+.*"; } else if (crawlingStartURL.isFTP()) { - return "ftp://" + crawlingStartURL.getHost() + ".*"; + return "ftp://" + crawlingStartURL.getHost() + "(?:/|$)+.*"; } else { final String host = crawlingStartURL.getHost(); if (host.startsWith("www.")) { - return "https?://" + crawlingStartURL.getHost() + ".*"; + return "https?://" + crawlingStartURL.getHost() + "(?:/|$)+.*"; } else { // if the www is not given we accept that also - return "https?://(www.)?" + crawlingStartURL.getHost() + ".*"; + return "https?://(?:www.)?" + crawlingStartURL.getHost() + "(?:/|$)+.*"; } } }