Fixed exceeding the max size of the failreason_s Solr field on large link lists

When using 'From Link-List of URL' as the crawl start, with lists on the
order of a thousand links or more, the string representation of the URL
must-match filter exceeded the maximum size (32 KB) of the failreason_s
Solr field whenever a crawl URL was rejected for not matching the filter.
pull/186/head
luccioman 6 years ago
parent f467601561
commit dcad393fe5
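
To see why the filter string gets so large: a link-list crawl start needs a must-match filter that covers every listed URL. A minimal sketch, assuming the filter is assembled by OR-ing the quoted URLs together (the class name and construction below are hypothetical, for illustration only):

import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

// Hypothetical demo: a must-match regex assembled from a large link list.
public class MustMatchSizeDemo {
    public static void main(String[] args) {
        String mustMatch = IntStream.range(0, 3000)
                .mapToObj(i -> Pattern.quote("https://example.org/page/" + i))
                .collect(Collectors.joining("|"));
        // Each quoted URL contributes over 30 characters, so 3000 links yield
        // a pattern far past the 32 KB limit of the failreason_s Solr field.
        System.out.println("pattern length: " + mustMatch.length());
    }
}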

@@ -496,8 +496,11 @@ public final class CrawlStacker implements WorkflowTask<Request>{
         // filter with must-match for URLs
         if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+            final String patternStr = profile.formattedUrlMustMatchPattern();
+            if (CrawlStacker.log.isFine()) {
+                CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + patternStr + "'.");
+            }
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + patternStr;
         }
         // filter with must-not-match for URLs

@@ -467,6 +467,24 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         }
         return this.crawlerurlmustmatch;
     }
+
+    /**
+     * Render the urlMustMatchPattern as a String of limited size, suffixing it with
+     * "..." when it is truncated. Used to prevent unnecessary growth of the logs,
+     * and to prevent exceeding the field size limit for
+     * CollectionSchema.failreason_s (32k) when the pattern is present in a fail doc
+     * added to the Solr index.
+     *
+     * @return the urlMustMatchPattern formatted as a String of limited size
+     */
+    public String formattedUrlMustMatchPattern() {
+        String patternStr = urlMustMatchPattern().toString();
+        if (patternStr.length() > 1000) {
+            /* The pattern may be quite large when using the 'From Link-List of URL' crawl start point. */
+            patternStr = patternStr.substring(0, 1000) + "...";
+        }
+        return patternStr;
+    }
     /**
      * Gets the regex which must not be matched by URLs in order to be crawled.
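
As a quick sanity check of the truncation rule, here is a standalone sketch; the helper below merely re-creates the logic of formattedUrlMustMatchPattern for illustration and is not part of the commit:

// Illustration only: standalone re-creation of the truncation rule above.
public class TruncationDemo {
    static String formatPattern(String patternStr) {
        if (patternStr.length() > 1000) {
            return patternStr.substring(0, 1000) + "...";
        }
        return patternStr;
    }

    public static void main(String[] args) {
        StringBuilder huge = new StringBuilder();
        for (int i = 0; i < 50000; i++) {
            huge.append('a'); // stand-in for a giant must-match regex
        }
        String formatted = formatPattern(huge.toString());
        System.out.println(formatted.length());        // 1003 = 1000 chars + "..."
        System.out.println(formatted.endsWith("..."));  // true
    }
}

Capping the rendered pattern at 1000 characters keeps it well under the 32k failreason_s limit while leaving enough of the regex to remain recognizable in the logs and in fail documents.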

@@ -371,7 +371,7 @@ public class CrawlQueues {
                 + ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
                 + ", depth=" + urlEntry.depth()
                 + ", crawlDepth=" + profile.depth()
-                + ", must-match=" + profile.urlMustMatchPattern().toString()
+                + ", must-match=" + profile.formattedUrlMustMatchPattern()
                 + ", must-not-match=" + profile.urlMustNotMatchPattern().toString()
                 + ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false")));
         }

@@ -2992,7 +2992,7 @@ public final class Switchboard extends serverSwitch {
                 "processResourceStack processCase=" + processCase
                 + ", depth=" + response.depth()
                 + ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
-                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+                + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().formattedUrlMustMatchPattern())
                 + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
                 + ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
                 + ", url=" + response.url()); // DEBUG
