From 0cc0290978a613338c41bb17819bbdc6608b88e1 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 27 Feb 2012 00:52:44 +0100 Subject: [PATCH] bugfix for a must-not-match pattern check. This bug did not make the check semantically wrong, but a trick that prevented an IP lookup in case that the filter was not used did not work. That bugfix causes that crawling gets a huge speed boost for noload urls! --- source/de/anomic/crawler/CrawlStacker.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 57aa22143..ea1515e1a 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -495,8 +495,8 @@ public final class CrawlStacker { } // filter with must-not-match for IPs - if ((depth > 0) && profile.ipMustMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && url.getHost() != null && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { - if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustMatchPattern().toString() + "'."); + if ((depth > 0) && profile.ipMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && url.getHost() != null && profile.ipMustNotMatchPattern().matcher(url.getInetAddress().getHostAddress()).matches()) { + if (this.log.isFine()) this.log.logFine("IP " + url.getInetAddress().getHostAddress() + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.ipMustNotMatchPattern().toString() + "'."); return "ip " + url.getInetAddress().getHostAddress() + " of url matches must-not-match filter"; }