@@ -303,7 +303,9 @@ public final class CrawlStacker {
             return error;
         }

-        error = checkAcceptance(entry.url(), profile, entry.depth());
+        error = checkAcceptanceChangeable(entry.url(), profile, entry.depth());
+        if (error != null) return error;
+        error = checkAcceptanceInitially(entry.url(), profile);
         if (error != null) return error;

         // store information
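The acceptance test is now split in two: checkAcceptanceChangeable() covers attributes that an operator may edit while a crawl is running, and checkAcceptanceInitially() covers attributes that stay fixed for the whole crawl. A minimal sketch of what the split enables, inside CrawlStacker and using the same Request/CrawlProfile types as above (the helper name is hypothetical, not part of this patch):

    // Hypothetical helper, not in this patch: re-validate an entry that is
    // already queued after its profile was edited. Only the changeable
    // attributes need re-testing; the result of checkAcceptanceInitially()
    // still holds because those attributes cannot change during the crawl.
    private boolean stillAccepted(final Request entry, final CrawlProfile profile) {
        return checkAcceptanceChangeable(entry.url(), profile, entry.depth()) == null;
    }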
@@ -367,53 +369,16 @@ public final class CrawlStacker {
         return null;
     }

-    public String checkAcceptance(final DigestURL url, final CrawlProfile profile, final int depth) {
+    /**
+     * Test whether a URL shall be accepted for crawling, using attributes that are consistent for the whole crawl.
+     * These tests are incomplete and must be followed by a checkAcceptanceChangeable test.
+     * @param url
+     * @param profile
+     * @return null if the URL is accepted, or an error string describing why it was rejected
+     */
+    public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
-        // check if the protocol is supported
-        final String urlProtocol = url.getProtocol();
-        final String urlstring = url.toString();
-        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
-            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
-            return "unsupported protocol";
-        }
-
-        // check if ip is local ip address
-        final String urlRejectReason = urlInAcceptedDomain(url);
-        if (urlRejectReason != null) {
-            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
-            return "denied_(" + urlRejectReason + ")";
-        }
-
-        // check blacklist
-        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
-            this.log.fine("URL '" + urlstring + "' is in blacklist.");
-            return "url in blacklist";
-        }
-
-        // filter with must-match for URLs
-        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
-        }
-
-        // filter with must-not-match for URLs
-        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
-            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
-        }
-
-        // deny cgi
-        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
-            return "individual url (sessionid etc) not wanted";
-        }
-
-        // deny post properties
-        if (url.isPOST() && !profile.crawlingQ()) {
-            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
-            return "post url not allowed";
-        }
-
         // check if the url is double registered
         final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
         final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
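The hunk is cut off here; the remainder of checkAcceptanceInitially() weighs dbocc and oldDate against each other. Roughly, as a sketch only (recrawlIfOlder() is assumed to return the cut-off time in milliseconds, mirroring the CrawlProfile accessors used above; the real continuation is not shown in this hunk):

    // Sketch of the recency decision, not the literal continuation of the
    // hunk: a URL that is already indexed is only accepted again when its
    // last load date lies before the profile's recrawl cut-off time.
    if (dbocc == null && oldDate != null && oldDate.getTime() > profile.recrawlIfOlder()) {
        return "double in index, last load date = " + oldDate; // still fresh, reject
    }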
@@ -452,13 +417,72 @@ public final class CrawlStacker {
             final AtomicInteger dp = profile.getCount(url.getHost());
             if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "crawl stack domain counter exceeded";
+                return "crawl stack domain counter exceeded (test by profile)";
             }
             /*
             if (ResultURLs.domainCount(EventOrigin.LOCAL_CRAWLING, url.getHost()) >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' appeared too often in result stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
-                return "result stack domain counter exceeded";
+                return "result stack domain counter exceeded (test by domainCount)";
             }
             */
         }

         return null;
     }
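The new checkAcceptanceChangeable() below shares the contract of checkAcceptanceInitially(): null means the URL is accepted, any non-null string is the rejection reason. A hypothetical JUnit-style check illustrates it (crawlStacker and profile are made-up fixtures, not part of this patch):

    // Hypothetical test sketch: acceptance is signalled by null,
    // rejection by a descriptive error string.
    @Test
    public void blacklistedUrlIsRejected() throws MalformedURLException {
        final DigestURL url = new DigestURL("http://blacklisted.example/page.html");
        assertNotNull(crawlStacker.checkAcceptanceChangeable(url, profile, 1));
    }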
+    /**
+     * Test whether a URL shall be accepted, using attributes that are defined by a crawl start but can be changed during a crawl.
+     * @param url
+     * @param profile
+     * @param depth
+     * @return null if the URL is accepted, or an error string describing why it was rejected
+     */
+    public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile profile, final int depth) {
+
+        // check if the protocol is supported
+        final String urlProtocol = url.getProtocol();
+        final String urlstring = url.toString();
+        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
+            this.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
+            return "unsupported protocol";
+        }
+
+        // check if ip is local ip address
+        final String urlRejectReason = urlInAcceptedDomain(url);
+        if (urlRejectReason != null) {
+            if (this.log.isFine()) this.log.fine("denied_(" + urlRejectReason + ")");
+            return "denied_(" + urlRejectReason + ")";
+        }
+
+        // check blacklist
+        if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
+            this.log.fine("URL '" + urlstring + "' is in blacklist.");
+            return "url in blacklist";
+        }
+
+        // filter with must-match for URLs
+        if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+        }
+
+        // filter with must-not-match for URLs
+        if ((depth > 0) && profile.urlMustNotMatchPattern().matcher(urlstring).matches()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile.urlMustNotMatchPattern().toString() + "'.");
+            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile.urlMustNotMatchPattern().toString();
+        }
+
+        // deny cgi
+        if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is CGI URL.");
+            return "individual url (sessionid etc) not wanted";
+        }
+
+        // deny post properties
+        if (url.isPOST() && !profile.crawlingQ()) {
+            if (this.log.isFine()) this.log.fine("URL '" + urlstring + "' is post URL.");
+            return "post url not allowed";
+        }
+
+        // the following filters use a DNS lookup to check if the url matches with IP filter
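The hunk ends at the DNS comment; the IP-based filters themselves are not shown. Conceptually they resolve the host once and then apply the profile's IP patterns, along these lines (both ipMustMatchPattern() and the getInetAddress() call are assumptions here, mirroring the urlMustMatchPattern() filter above, and are not confirmed by this hunk):

    // Sketch only: the DNS lookup happens in getInetAddress(), which is why
    // these checks run last, after all cheap string-based filters above.
    final InetAddress ip = url.getInetAddress(); // may block on a DNS query
    if (ip != null && !profile.ipMustMatchPattern().matcher(ip.getHostAddress()).matches()) {
        return "IP " + ip.getHostAddress() + " does not match the IP must-match filter";
    }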
@@ -499,7 +523,6 @@ public final class CrawlStacker {
         return null;
     }

     /**
      * Test a url if it can be used for crawling/indexing.
      * This mainly checks if the url is in the declared domain (local/global)