fix for crawl domain counter limitation (limit was reached too early)
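
The per-domain page limit was checked against two counters: the crawl profile's own per-host counter and the global result stack (ResultURLs.domainCount). The result stack is fed by all local crawling events, not only by the current profile, so the limit could be reached too early; that test is now commented out, and both error messages name the test that produced them. To support this, checkAcceptance is split into checkAcceptanceInitially (tests on attributes that stay constant for the whole crawl, such as double registration and the domain counters) and checkAcceptanceChangeable (tests on attributes a user can edit while the crawl runs, such as the blacklist and the must-match/must-not-match filters). The CrawlSwitchboard constructor now keeps its Switchboard reference as a field, and exceptions during the deletion-candidate scan are logged instead of being silently swallowed.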

pull/1/head
Michael Peter Christen 11 years ago
parent 82bfd9e00a
commit 3bf0104199

source/net/yacy/crawler/CrawlStacker.java

@@ -303,7 +303,9 @@ public final class CrawlStacker {
             return error;
         }
 
-        error = checkAcceptance(entry.url(), profile, entry.depth());
+        error = checkAcceptanceChangeable(entry.url(), profile, entry.depth());
         if (error != null) return error;
+        error = checkAcceptanceInitially(entry.url(), profile);
+        if (error != null) return error;
 
         // store information
@@ -367,53 +369,16 @@ public final class CrawlStacker {
         return null;
     }
 
-    public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) {
+    /**
+     * Test if a url shall be accepted for crawl using attributes that are consistent for the whole crawl.
+     * These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
+     * @param url
+     * @param profile
+     * @return null if the url is accepted, otherwise an error string describing the reason for rejection
+     */
+    public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
 
-        // check if the protocol is supported
-        final String urlProtocol = url.getProtocol();
-        final String urlstring = url.toString();
-        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
-            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
-            return "unsupported protocol";
-        }
-
-        // check if ip is local ip address
-        final String urlRejectReason = urlInAcceptedDomain(url);
-        if (urlRejectReason != null) {
-            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
-            return "denied_(" + urlRejectReason + ")";
-        }
-
-        // check blacklist
-        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
-            this.log.fine("URL '" + urlstring + "' is in blacklist.");
-            return "url in blacklist";
-        }
-
-        // filter with must-match for URLs
-        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
-        }
-
-        // filter with must-not-match for URLs
-        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
-            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
-        }
-
-        // deny cgi
-        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
-            return "individual url (sessionid etc) not wanted";
-        }
-
-        // deny post properties
-        if (url.isPOST() && !profile.crawlingQ()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
-            return "post url not allowed";
-        }
-
         // check if the url is double registered
         final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
         final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
@@ -452,13 +417,72 @@ public final class CrawlStacker {
             final AtomicInteger dp = profile.getCount(url.getHost());
             if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "crawl stack domain counter exceeded";
+                return "crawl stack domain counter exceeded (test by profile)";
             }
+            /*
             if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "result stack domain counter exceeded";
+                return "result stack domain counter exceeded (test by domainCount)";
             }
+            */
         }
+
+        return null;
+    }
+
+    /**
+     * Test if a url shall be accepted using attributes that are defined by a crawl start but can be changed during a crawl.
+     * @param url
+     * @param profile
+     * @param depth
+     * @return null if the url is accepted, otherwise an error string describing the reason for rejection
+     */
+    public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
+
+        // check if the protocol is supported
+        final String urlProtocol = url.getProtocol();
+        final String urlstring = url.toString();
+        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
+            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
+            return "unsupported protocol";
+        }
+
+        // check if ip is local ip address
+        final String urlRejectReason = urlInAcceptedDomain(url);
+        if (urlRejectReason != null) {
+            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
+            return "denied_(" + urlRejectReason + ")";
+        }
+
+        // check blacklist
+        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
+            this.log.fine("URL '" + urlstring + "' is in blacklist.");
+            return "url in blacklist";
+        }
+
+        // filter with must-match for URLs
+        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+        }
+
+        // filter with must-not-match for URLs
+        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
+            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
+        }
+
+        // deny cgi
+        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
+            return "individual url (sessionid etc) not wanted";
+        }
+
+        // deny post properties
+        if (url.isPOST() && !profile.crawlingQ()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
+            return "post url not allowed";
+        }
 
         // the following filters use a DNS lookup to check if the url matches with IP filter
@@ -499,7 +523,6 @@ public final class CrawlStacker {
         return null;
     }
 
-
     /**
     * Test a url if it can be used for crawling/indexing
     * This mainly checks if the url is in the declared domain (local/global)
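
The two methods together form a two-phase acceptance test: a URL is stacked only if both return null, as the hunk at -303 above shows. A minimal caller sketch of that contract, using only the method signatures visible in this diff (the acceptanceError helper is hypothetical):

    // Hypothetical helper illustrating the two-phase acceptance contract:
    // run the changeable tests first, then the tests that are constant
    // for the whole crawl; null means the url is accepted.
    public String acceptanceError(final CrawlStacker stacker, final DigestURL url,
                                  final CrawlProfile profile, final int depth) {
        // attributes a user may edit during a running crawl
        // (protocol, blacklist, must-match/must-not-match filters, cgi/post)
        String error = stacker.checkAcceptanceChangeable(url, profile, depth);
        if (error != null) return error;
        // attributes fixed at crawl start (double registration, domain counters)
        return stacker.checkAcceptanceInitially(url, profile);
    }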

source/net/yacy/crawler/CrawlSwitchboard.java

@@ -107,8 +107,9 @@ public final class CrawlSwitchboard {
     public CrawlSwitchboard(final String networkName, Switchboard switchboard) {
 
-        this.log = switchboard.log;
-        this.queuesRoot = switchboard.queuesRoot;
+        this.switchboard = switchboard;
+        this.log = this.switchboard.log;
+        this.queuesRoot = this.switchboard.queuesRoot;
         this.log.info("Initializing Word Index for the network '" + networkName + "'.");
 
         if ( networkName == null || networkName.isEmpty() ) {
@@ -595,6 +596,7 @@ public final class CrawlSwitchboard {
                 deletionCandidate.remove(request.profileHandle());
             }
         } catch (final Throwable e) {
+            ConcurrentLog.logException(e);
             return new HashSet<String>(0);
         }
         return deletionCandidate;

source/net/yacy/search/Switchboard.java

@@ -2508,7 +2508,7 @@ public final class Switchboard extends serverSwitch {
             if (response.profile() != null) {
                 ArrayList<Document> newDocs = new ArrayList<Document>();
                 for (Document doc: documents) {
-                    String rejectReason = this.crawlStacker.checkAcceptance(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
+                    String rejectReason = this.crawlStacker.checkAcceptanceChangeable(doc.dc_source(), response.profile(), 1 /*depth is irrelevant here, we just make clear its not the start url*/);
                     if (rejectReason == null) {
                         newDocs.add(doc);
                     } else {
@@ -3003,7 +3003,7 @@ public final class Switchboard extends serverSwitch {
             }
             final Request request = this.loader.request(e.getValue(), true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
-            final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
+            final String acceptedError = this.crawlStacker.checkAcceptanceChangeable(e.getValue(), profile, 0);
             if (acceptedError != null) {
                 this.log.warn("addToIndex: cannot load " + urlName + ": " + acceptedError);
                 continue;
@@ -3076,7 +3076,8 @@ public final class Switchboard extends serverSwitch {
             DigestURL url = e.getValue();
             final Request request = this.loader.request(url, true, true);
             final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));
-            final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
+            String acceptedError = this.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
+            if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
             if (acceptedError != null) {
                 this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
                 return;
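
With the result stack test disabled, the domain limit rests solely on the per-profile counter visible in the -452 hunk: the profile keeps one AtomicInteger per host and rejects a URL once that counter reaches maxAllowedPagesPerDomain. A minimal sketch of such a counting scheme, assuming a ConcurrentHashMap-backed table; the DomainCounter class and checkAndCount method are hypothetical illustrations, not YaCy's CrawlProfile code:

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.atomic.AtomicInteger;

    // Hypothetical per-host page counter illustrating the scheme behind
    // profile.getCount(host) and the maxAllowedPagesPerDomain test above.
    public class DomainCounter {

        private final ConcurrentHashMap<String, AtomicInteger> counts =
                new ConcurrentHashMap<String, AtomicInteger>();
        private final int maxAllowedPagesPerDomain;

        public DomainCounter(final int maxAllowedPagesPerDomain) {
            this.maxAllowedPagesPerDomain = maxAllowedPagesPerDomain;
        }

        /** @return null if the host is still below the limit, otherwise an error string */
        public String checkAndCount(final String host) {
            AtomicInteger dp = this.counts.get(host);
            if (dp == null) {
                final AtomicInteger fresh = new AtomicInteger(0);
                dp = this.counts.putIfAbsent(host, fresh); // race-safe initialization
                if (dp == null) dp = fresh;
            }
            if (dp.get() >= this.maxAllowedPagesPerDomain) {
                return "crawl stack domain counter exceeded (test by profile)";
            }
            dp.incrementAndGet(); // count this page against its host
            return null;
        }
    }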
