fix for crawl domain counter limitation (limit was reached too early)
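
The per-domain page limit was checked against two counters: the crawl profile's own per-host counter and the global result stack (ResultURLs.domainCount). The result stack is fed by all local crawling events, not only by the current profile, so the limit could be reached too early; that test is now commented out, and both error messages name the test that produced them. To support this, checkAcceptance is split into checkAcceptanceInitially (tests on attributes that stay constant for the whole crawl, such as double registration and the domain counters) and checkAcceptanceChangeable (tests on attributes a user can edit while the crawl runs, such as the blacklist and the must-match/must-not-match filters). The CrawlSwitchboard constructor now keeps its Switchboard reference as a field, and exceptions during the deletion-candidate scan are logged instead of being silently swallowed.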

pull/1/head
Michael Peter Christen 11 years ago
parent 82bfd9e00a
commit 3bf0104199

source/net/yacy/crawler/CrawlStacker.java

@@ -303,7 +303,9 @@ public final class CrawlStacker {
             return error;
         }
 
-        error = checkAcceptance(entry.url(), profile, entry.depth());
+        error = checkAcceptanceChangeable(entry.url(), profile, entry.depth());
         if (error != null) return error;
+        error = checkAcceptanceInitially(entry.url(), profile);
+        if (error != null) return error;
 
         // store information
@@ -367,53 +369,16 @@ public final class CrawlStacker {
         return null;
     }
 
-    public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) {
+    /**
+     * Test if a url shall be accepted for crawl using attributes that are consistent for the whole crawl.
+     * These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
+     * @param url
+     * @param profile
+     * @return null if the url is accepted, otherwise an error string describing the reason for rejection
+     */
+    public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
 
-        // check if the protocol is supported
-        final String urlProtocol = url.getProtocol();
-        final String urlstring = url.toString();
-        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
-            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
-            return "unsupported protocol";
-        }
-
-        // check if ip is local ip address
-        final String urlRejectReason = urlInAcceptedDomain(url);
-        if (urlRejectReason != null) {
-            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
-            return "denied_(" + urlRejectReason + ")";
-        }
-
-        // check blacklist
-        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
-            this.log.fine("URL '" + urlstring + "' is in blacklist.");
-            return "url in blacklist";
-        }
-
-        // filter with must-match for URLs
-        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
-        }
-
-        // filter with must-not-match for URLs
-        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
-            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
-        }
-
-        // deny cgi
-        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
-            return "individual url (sessionid etc) not wanted";
-        }
-
-        // deny post properties
-        if (url.isPOST() && !profile.crawlingQ()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
-            return "post url not allowed";
-        }
-
         // check if the url is double registered
         final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
         final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
@@ -452,13 +417,72 @@ public final class CrawlStacker {
             final AtomicInteger dp = profile.getCount(url.getHost());
             if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "crawl stack domain counter exceeded";
+                return "crawl stack domain counter exceeded (test by profile)";
             }
+            /*
             if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "result stack domain counter exceeded";
+                return "result stack domain counter exceeded (test by domainCount)";
             }
+            */
         }
+
+        return null;
+    }
+
+    /**
+     * Test if a url shall be accepted using attributes that are defined by a crawl start but can be changed during a crawl.
+     * @param url
+     * @param profile
+     * @param depth
+     * @return null if the url is accepted, otherwise an error string describing the reason for rejection
+     */
+    public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
+
+        // check if the protocol is supported
+        final String urlProtocol = url.getProtocol();
+        final String urlstring = url.toString();
+        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
+            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
+            return "unsupported protocol";
+        }
+
+        // check if ip is local ip address
+        final String urlRejectReason = urlInAcceptedDomain(url);
+        if (urlRejectReason != null) {
+            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
+            return "denied_(" + urlRejectReason + ")";
+        }
+
+        // check blacklist
+        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
+            this.log.fine("URL '" + urlstring + "' is in blacklist.");
+            return "url in blacklist";
+        }
+
+        // filter with must-match for URLs
+        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+        }
+
+        // filter with must-not-match for URLs
+        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
+            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
+        }
+
+        // deny cgi
+        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
+            return "individual url (sessionid etc) not wanted";
+        }
+
+        // deny post properties
+        if (url.isPOST() && !profile.crawlingQ()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
+            return "post url not allowed";
+        }
 
         // the following filters use a DNS lookup to check if the url matches with IP filter
@@ -499,7 +523,6 @@ public final class CrawlStacker {
         return null;
     }
 
-
     /**
     * Test a url if it can be used for crawling/indexing
     * This mainly checks if the url is in the declared domain (local/global)
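
The two methods together form a two-phase acceptance test: a URL is stacked only if both return null, as the hunk at -303 above shows. A minimal caller sketch of that contract, using only the method signatures visible in this diff (the acceptanceError helper is hypothetical):

    // Hypothetical helper illustrating the two-phase acceptance contract:
    // run the changeable tests first, then the tests that are constant
    // for the whole crawl; null means the url is accepted.
    public String acceptanceError(final CrawlStacker stacker, final DigestURL url,
                                  final CrawlProfile profile, final int depth) {
        // attributes a user may edit during a running crawl
        // (protocol, blacklist, must-match/must-not-match filters, cgi/post)
        String error = stacker.checkAcceptanceChangeable(url, profile, depth);
        if (error != null) return error;
        // attributes fixed at crawl start (double registration, domain counters)
        return stacker.checkAcceptanceInitially(url, profile);
    }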

source/net/yacy/crawler/CrawlSwitchboard.java

@@ -107,8 +107,9 @@ public final class CrawlSwitchboard {
     public CrawlSwitchboard(final String networkName, Switchboard switchboard) {
 
-        this.log = switchboard.log;
-        this.queuesRoot = switchboard.queuesRoot;
+        this.switchboard = switchboard;
+        this.log = this.switchboard.log;
+        this.queuesRoot = this.switchboard.queuesRoot;
         this.log.info("Initializing Word Index for the network '" + networkName + "'.");
 
         if ( networkName == null || networkName.isEmpty() ) {
@@ -595,6 +596,7 @@ public final class CrawlSwitchboard {
                 deletionCandidate.remove(request.profileHandle());
             }
         } catch (final Throwable e) {
+            ConcurrentLog.logException(e);
             return new HashSet<String>(0);
         }
         return deletionCandidate;

source/net/yacy/search/Switchboard.java

@@ -2508,7 +2508,7 @@ public final class Switchboard extends serverSwitch {
             if (response.profile() != null) {
                 ArrayList<Document> newDocs = new ArrayList<Document>();
                 for (Document doc: documents) {
-                    String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
+                    String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
                     if (rejectReason == null) {
                         newDocs.add(doc);
                     } else {
@@ -3003,7 +3003,7 @@ public final class Switchboard extends serverSwitch {
             }
             final Request request = this.loader.request(e.getValue(), true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
-            final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
+            final String acceptedError = this.crawlStacker.checkAcceptanceChangeable(e.getValue(), profile, 0);
             if (acceptedError != null) {
                 this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
                 continue;
@@ -3076,7 +3076,8 @@ public final class Switchboard extends serverSwitch {
             DigestURL url = e.getValue();
             final Request request = this.loader.request(url, true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
-            final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
+            String acceptedError = this.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
+            if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
             if (acceptedError != null) {
                 this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
                 return;
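
With the result stack test disabled, the domain limit rests solely on the per-profile counter visible in the -452 hunk: the profile keeps one AtomicInteger per host and rejects a URL once that counter reaches maxAllowedPagesPerDomain. A minimal sketch of such a counting scheme, assuming a ConcurrentHashMap-backed table; the DomainCounter class and checkAndCount method are hypothetical illustrations, not YaCy's CrawlProfile code:

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.atomic.AtomicInteger;

    // Hypothetical per-host page counter illustrating the scheme behind
    // profile.getCount(host) and the maxAllowedPagesPerDomain test above.
    public class DomainCounter {

        private final ConcurrentHashMap<String, AtomicInteger> counts =
                new ConcurrentHashMap<String, AtomicInteger>();
        private final int maxAllowedPagesPerDomain;

        public DomainCounter(final int maxAllowedPagesPerDomain) {
            this.maxAllowedPagesPerDomain = maxAllowedPagesPerDomain;
        }

        /** @return null if the host is still below the limit, otherwise an error string */
        public String checkAndCount(final String host) {
            AtomicInteger dp = this.counts.get(host);
            if (dp == null) {
                final AtomicInteger fresh = new AtomicInteger(0);
                dp = this.counts.putIfAbsent(host, fresh); // race-safe initialization
                if (dp == null) dp = fresh;
            }
            if (dp.get() >= this.maxAllowedPagesPerDomain) {
                return "crawl stack domain counter exceeded (test by profile)";
            }
            dp.incrementAndGet(); // count this page against its host
            return null;
        }
    }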
