diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 9d79efefd..54908e1df 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -303,7 +303,9 @@ public final class CrawlStacker {
             return error;
         }
 
-        error = checkAcceptance(entry.url(), profile, entry.depth());
+        error = checkAcceptanceChangeable(entry.url(), profile, entry.depth());
+        if (error != null) return error;
+        error = checkAcceptanceInitially(entry.url(), profile);
         if (error != null) return error;
 
         // store information
@@ -367,53 +369,16 @@ public final class CrawlStacker {
         return null;
     }
 
-    public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) {
+    /**
+     * Test if a URL shall be accepted for crawling, using attributes that are consistent for the whole crawl.
+     * These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
+     * @param url
+     * @param profile
+     * @return null if the URL is accepted, or an error string describing why it is not accepted
+     */
+    public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
 
-        // check if the protocol is supported
-        final String urlProtocol = url.getProtocol();
         final String urlstring = url.toString();
-        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
-            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
-            return "unsupported protocol";
-        }
-
-        // check if ip is local ip address
-        final String urlRejectReason = urlInAcceptedDomain(url);
-        if (urlRejectReason != null) {
-            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
-            return "denied_(" + urlRejectReason + ")";
-        }
-
-        // check blacklist
-        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
-            this.log.fine("URL '" + urlstring + "' is in blacklist.");
-            return "url in blacklist";
-        }
-
-        // filter with must-match for URLs
-        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
-        }
-
-        // filter with must-not-match for URLs
-        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
-            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
-        }
-
-        // deny cgi
-        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
-            return "individual url (sessionid etc) not wanted";
-        }
-
-        // deny post properties
-        if (url.isPOST() && !profile.crawlingQ()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
-            return "post url not allowed";
-        }
-
         // check if the url is double registered
         final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
         final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
@@ -452,13 +417,72 @@ public final class CrawlStacker {
             final AtomicInteger dp = profile.getCount(url.getHost());
             if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "crawl stack domain counter exceeded";
+                return "crawl stack domain counter exceeded (test by profile)";
             }
 
+            /*
             if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "result stack domain counter exceeded";
+                return "result stack domain counter exceeded (test by domainCount)";
             }
+            */
+        }
+
+        return null;
+    }
+
+    /**
+     * Test if a URL shall be accepted using attributes that are defined at crawl start but can be changed during a crawl.
+     * @param url
+     * @param profile
+     * @param depth
+     * @return null if the URL is accepted, or an error string describing why it is not accepted
+     */
+    public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
+
+        // check if the protocol is supported
+        final String urlProtocol = url.getProtocol();
+        final String urlstring = url.toString();
+        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
+            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
+            return "unsupported protocol";
+        }
+
+        // check if ip is local ip address
+        final String urlRejectReason = urlInAcceptedDomain(url);
+        if (urlRejectReason != null) {
+            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
+            return "denied_(" + urlRejectReason + ")";
+        }
+
+        // check blacklist
+        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
+            this.log.fine("URL '" + urlstring + "' is in blacklist.");
+            return "url in blacklist";
+        }
+
+        // filter with must-match for URLs
+        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+        }
+
+        // filter with must-not-match for URLs
+        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
+            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
+        }
+
+        // deny cgi
+        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
+            return "individual url (sessionid etc) not wanted";
+        }
+
+        // deny post properties
+        if (url.isPOST() && !profile.crawlingQ()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
+            return "post url not allowed";
         }
 
         // the following filters use a DNS lookup to check if the url matches with IP filter
@@ -499,7 +523,6 @@ public final class CrawlStacker {
         return null;
     }
 
-
     /**
      * Test a url if it can be used for crawling/indexing
      * This mainly checks if the url is in the declared domain (local/global)
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index 3c67410fd..b5053ee52 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -107,8 +107,9 @@ public final class CrawlSwitchboard {
 
     public CrawlSwitchboard(final String networkName, Switchboard switchboard) {
 
-        this.log = switchboard.log;
-        this.queuesRoot = switchboard.queuesRoot;
+        this.switchboard = switchboard;
+        this.log = this.switchboard.log;
+        this.queuesRoot = this.switchboard.queuesRoot;
         this.log.info("Initializing Word Index for the network '" + networkName + "'.");
 
         if ( networkName == null || networkName.isEmpty() ) {
@@ -595,6 +596,7 @@ public final class CrawlSwitchboard {
                 deletionCandidate.remove(request.profileHandle());
             }
         } catch (final Throwable e) {
+            ConcurrentLog.logException(e);
            return new HashSet<String>(0);
         }
         return deletionCandidate;
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index f5f6db1e5..86294f2ae 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2508,7 +2508,7 @@ public final class Switchboard extends serverSwitch {
         if (response.profile() != null) {
             ArrayList<Document> newDocs = new ArrayList<Document>();
             for (Document doc: documents) {
-                String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
+                String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
                 if (rejectReason == null) {
                     newDocs.add(doc);
                 } else {
@@ -3003,7 +3003,7 @@ public final class Switchboard extends serverSwitch {
             }
             final Request request = this.loader.request(e.getValue(), true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
-            final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
+            final String acceptedError = this.crawlStacker.checkAcceptanceChangeable(e.getValue(), profile, 0);
             if (acceptedError != null) {
                 this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
                 continue;
             }
@@ -3076,7 +3076,8 @@ public final class Switchboard extends serverSwitch {
             DigestURL url = e.getValue();
             final Request request = this.loader.request(url, true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
-            final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
+            String acceptedError = this.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
+            if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
             if (acceptedError != null) {
                 this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
                 return;
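
Note on call order: the patch splits the old checkAcceptance into two stages and expects callers to run checkAcceptanceChangeable first and, only if that returns null, checkAcceptanceInitially (as CrawlStacker.stackCrawl and Switchboard.addToCrawler do above). A minimal sketch of that pattern follows, assuming an initialized CrawlStacker plus the DigestURL and CrawlProfile of a candidate entry; the helper name acceptanceError is hypothetical and not part of the patch:

    // Sketch only: mirrors the two-stage acceptance check introduced by this patch.
    String acceptanceError(CrawlStacker stacker, DigestURL url, CrawlProfile profile, int depth) {
        // 1) attributes that may change while the crawl is running:
        //    protocol support, domain/blacklist checks, must-match/must-not-match filters, CGI/POST rules
        String error = stacker.checkAcceptanceChangeable(url, profile, depth);
        if (error != null) return error;
        // 2) attributes that are fixed for the whole crawl:
        //    double registration, re-crawl age, per-domain page limit
        return stacker.checkAcceptanceInitially(url, profile);
    }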