@@ -42,7 +42,7 @@ import de.anomic.yacy.yacyURL;
 
 public final class CrawlStacker {
 
-    final Log log = new Log("STACKCRAWL");
+    private Log log = new Log("STACKCRAWL");
 
     private serverProcessor<CrawlEntry> fastQueue, slowQueue;
     private long dnsHit, dnsMiss;
@@ -162,31 +162,27 @@ public final class CrawlStacker {
         //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
 
         final long startTime = System.currentTimeMillis();
-        String reason = null; // failure reason
 
         // check if the protocol is supported
         final String urlProtocol = entry.url().getProtocol();
         if (!nextQueue.isSupportedProtocol(urlProtocol)) {
-            reason = "unsupported protocol";
             this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
                                "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "unsupported protocol";
         }
 
         // check if ip is local ip address
         final String urlRejectReason = urlInAcceptedDomain(entry.url());
         if (urlRejectReason != null) {
-            reason = "denied_(" + urlRejectReason + ")";
-            if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "denied_(" + urlRejectReason + ")";
         }
 
         // check blacklist
         if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) {
-            reason = "url in blacklist";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
                                 "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "url in blacklist";
         }
 
         final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
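Review note: the pattern this hunk (and every hunk below) applies is the same throughout — the shared mutable 'reason' local is dropped and each rejection branch returns its reason string directly, while a null return still means the URL is accepted for stacking. A minimal sketch of the before/after shape; the class name, method, and the http/https whitelist are invented for illustration and are not the YaCy API:

    public final class ReasonInlineSketch {
        // After the patch: no shared 'reason' local; each branch returns its literal.
        static String check(final String protocol) {
            if (!"http".equals(protocol) && !"https".equals(protocol)) {
                // was: reason = "unsupported protocol"; ...; return reason;
                return "unsupported protocol";
            }
            return null; // null = no rejection reason, URL may be stacked
        }
        public static void main(final String[] args) {
            System.out.println(check("ftp"));  // -> unsupported protocol
            System.out.println(check("http")); // -> null
        }
    }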
@@ -198,36 +194,30 @@ public final class CrawlStacker {
         // filter with must-match
         if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
-            reason = "url does not match must-match filter";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
                                 "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "url does not match must-match filter";
         }
 
         // filter with must-not-match
         if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
-            reason = "url matches must-not-match filter";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
                                 "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "url matches must-not-match filter";
         }
 
         // deny cgi
         if (entry.url().isCGI()) {
-            reason = "cgi url not allowed";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
                                 "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "cgi url not allowed";
        }
 
         // deny post properties
         if (entry.url().isPOST() && !(profile.crawlingQ())) {
-            reason = "post url not allowed";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
                                 "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "post url not allowed";
         }
 
         final yacyURL referrerURL = (entry.referrerhash() == null) ? null : nextQueue.getURL(entry.referrerhash());
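Reviewer note on the two regex gates above: both are guarded by entry.depth() > 0, so the crawl start URL itself bypasses the must-match and must-not-match filters; only followed links are tested. A self-contained sketch of that gate, with illustrative patterns and URLs that are assumptions, not taken from any YaCy profile:

    import java.util.regex.Pattern;

    public final class MatchFilterSketch {
        static String check(final String url, final int depth,
                            final Pattern mustMatch, final Pattern mustNotMatch) {
            // depth 0 is the crawl start URL; the filters only apply to followed links
            if (depth > 0 && !mustMatch.matcher(url).matches()) return "url does not match must-match filter";
            if (depth > 0 && mustNotMatch.matcher(url).matches()) return "url matches must-not-match filter";
            return null; // passed both filters
        }
        public static void main(final String[] args) {
            final Pattern must = Pattern.compile(".*example\\.org.*");
            final Pattern mustNot = Pattern.compile(".*\\.pdf");
            System.out.println(check("http://example.org/page.html", 1, must, mustNot)); // null
            System.out.println(check("http://example.org/doc.pdf", 1, must, mustNot));   // must-not-match hit
            System.out.println(check("http://other.net/", 0, must, mustNot));            // null (start URL)
        }
    }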
@@ -239,18 +229,16 @@ public final class CrawlStacker {
         // deny urls that do not match with the profile domain list
         if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
-            reason = "url does not match domain filter";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
                                 "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "url does not match domain filter";
         }
 
         // deny urls that exceed allowed number of occurrences
         if (!(profile.grantedDomCount(entry.url().getHost()))) {
-            reason = "domain counter exceeded";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
                                 "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "domain counter exceeded";
         }
 
         // check if the url is double registered
@@ -260,14 +248,12 @@
         final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
         // do double-check
         if ((dbocc != null) && (!recrawl)) {
-            reason = "double " + dbocc;
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "double " + dbocc;
         }
         if ((oldEntry != null) && (!recrawl)) {
-            reason = "double LURL";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            return "double LURL";
         }
 
         // show potential re-crawl
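The double-check above only restacks a known URL when recrawl is true, i.e. when the profile's recrawlIfOlder() threshold is larger than the stored load date. Reading the comparison, the threshold appears to be an absolute epoch-milliseconds value; that interpretation is an assumption, as is everything in this sketch except the predicate itself:

    import java.util.Date;

    public final class RecrawlSketch {
        // Mirrors: recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime())
        static boolean recrawl(final Date loadDate, final long recrawlIfOlderMillis) {
            return (loadDate != null) && (recrawlIfOlderMillis > loadDate.getTime());
        }
        public static void main(final String[] args) {
            final long now = System.currentTimeMillis();
            final long oneHourAgo = now - 60L * 60 * 1000; // assumed threshold: anything loaded earlier is stale
            System.out.println(recrawl(new Date(now - 24L * 60 * 60 * 1000), oneHourAgo)); // true: restack
            System.out.println(recrawl(new Date(now), oneHourAgo));                        // false: reject as double
        }
    }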
@@ -313,7 +299,6 @@
         return null;
     }
 
-
     /**
     * Test a url if it can be used for crawling/indexing
    * This mainly checks if the url is in the declared domain (local/global)