Fixed exceeding the max size of the failreason_s Solr field on large link lists

When using 'From Link-List of URL' as the crawl start, with lists on the
order of a thousand links or more, the string representation of the URL
must-match filter exceeded the maximum size (32 KB) of the failreason_s
Solr field whenever a crawl URL was rejected for not matching the filter.
pull/186/head
luccioman 6 years ago
parent f467601561
commit dcad393fe5
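
To see why the filter string gets so large: a link-list crawl start needs a must-match filter that covers every listed URL. A minimal sketch, assuming the filter is assembled by OR-ing the quoted URLs together (the class name and construction below are hypothetical, for illustration only):

import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

// Hypothetical demo: a must-match regex assembled from a large link list.
public class MustMatchSizeDemo {
    public static void main(String[] args) {
        String mustMatch = IntStream.range(0, 3000)
                .mapToObj(i -> Pattern.quote("https://example.org/page/" + i))
                .collect(Collectors.joining("|"));
        // Each quoted URL contributes over 30 characters, so 3000 links yield
        // a pattern far past the 32 KB limit of the failreason_s Solr field.
        System.out.println("pattern length: " + mustMatch.length());
    }
}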

@@ -496,8 +496,11 @@ public final class CrawlStacker implements WorkflowTask<Request>{
         // filter with must-match for URLs
         if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+            final String patternStr = profile.formattedUrlMustMatchPattern();
+            if (CrawlStacker.log.isFine()) {
+                CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + patternStr + "'.");
+            }
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + patternStr;
         }
         // filter with must-not-match for URLs

@@ -467,6 +467,24 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         }
         return this.crawlerurlmustmatch;
     }
+
+    /**
+     * Render the urlMustMatchPattern as a String of limited size, suffixing it with
+     * "..." when it is truncated. Used to prevent unnecessary growth of the logs,
+     * and to prevent exceeding the field size limit for
+     * CollectionSchema.failreason_s (32k) when the pattern is present in a fail doc
+     * added to the Solr index.
+     *
+     * @return the urlMustMatchPattern formatted as a String of limited size
+     */
+    public String formattedUrlMustMatchPattern() {
+        String patternStr = urlMustMatchPattern().toString();
+        if (patternStr.length() > 1000) {
+            /* The pattern may be quite large when using the 'From Link-List of URL' crawl start point. */
+            patternStr = patternStr.substring(0, 1000) + "...";
+        }
+        return patternStr;
+    }
     /**
      * Gets the regex which must not be matched by URLs in order to be crawled.
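
As a quick sanity check of the truncation rule, here is a standalone sketch; the helper below merely re-creates the logic of formattedUrlMustMatchPattern for illustration and is not part of the commit:

// Illustration only: standalone re-creation of the truncation rule above.
public class TruncationDemo {
    static String formatPattern(String patternStr) {
        if (patternStr.length() > 1000) {
            return patternStr.substring(0, 1000) + "...";
        }
        return patternStr;
    }

    public static void main(String[] args) {
        StringBuilder huge = new StringBuilder();
        for (int i = 0; i < 50000; i++) {
            huge.append('a'); // stand-in for a giant must-match regex
        }
        String formatted = formatPattern(huge.toString());
        System.out.println(formatted.length());        // 1003 = 1000 chars + "..."
        System.out.println(formatted.endsWith("..."));  // true
    }
}

Capping the rendered pattern at 1000 characters keeps it well under the 32k failreason_s limit while leaving enough of the regex to remain recognizable in the logs and in fail documents.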

@@ -371,7 +371,7 @@ public class CrawlQueues {
                 + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
                 + ", depth=" + urlEntry.depth()
                 + ", crawlDepth=" + profile.depth()
-                + ", must-match=" + profile.urlMustMatchPattern().toString()
+                + ", must-match=" + profile.formattedUrlMustMatchPattern()
                 + ", must-not-match=" + profile.urlMustNotMatchPattern().toString()
                 + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false")));
         }

@@ -2992,7 +2992,7 @@ public final class Switchboard extends serverSwitch {
                 "processResourceStack processCase=" + processCase
                 + ", depth=" + response.depth()
                 + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
-                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().formattedUrlMustMatchPattern())
                 + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
                 + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
                 + ", url=" + response.url()); // DEBUG
