@ -42,7 +42,7 @@ import de.anomic.yacy.yacyURL;
public final class CrawlStacker {
final Log log = new Log ( "STACKCRAWL" ) ;
private Log log = new Log ( "STACKCRAWL" ) ;
private serverProcessor < CrawlEntry > fastQueue , slowQueue ;
private long dnsHit , dnsMiss ;
@ -137,7 +137,7 @@ public final class CrawlStacker {
public void enqueueEntry ( final CrawlEntry entry ) {
// DEBUG
if ( log . isFinest ( ) ) log . logFinest ( "ENQUEUE " + entry . url ( ) + ", referer=" + entry . referrerhash ( ) + ", initiator=" + entry . initiator ( ) + ", name=" + entry . name ( ) + ", load=" + entry . loaddate ( ) + ", depth=" + entry . depth ( ) ) ;
if ( log . isFinest ( ) ) log . logFinest ( "ENQUEUE " + entry . url ( ) + ", referer=" + entry . referrerhash ( ) + ", initiator=" + entry . initiator ( ) + ", name=" + entry . name ( ) + ", load=" + entry . loaddate ( ) + ", depth=" + entry . depth ( ) ) ;
if ( prefetchHost ( entry . url ( ) . getHost ( ) ) ) {
try {
@ -162,31 +162,27 @@ public final class CrawlStacker {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final long startTime = System . currentTimeMillis ( ) ;
String reason = null ; // failure reason
// check if the protocol is supported
final String urlProtocol = entry . url ( ) . getProtocol ( ) ;
if ( ! nextQueue . isSupportedProtocol ( urlProtocol ) ) {
reason = "unsupported protocol" ;
this . log . logSevere ( "Unsupported protocol in URL '" + entry . url ( ) . toString ( ) + "'. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "unsupported protocol" ;
}
// check if the url is in an accepted domain (e.g. not a local ip address where that is disallowed)
final String urlRejectReason = urlInAcceptedDomain ( entry . url ( ) ) ;
if ( urlRejectReason ! = null ) {
reason = "denied_(" + urlRejectReason + ")" ;
if ( this . log . isFine ( ) ) this . log . logFine ( reason + "Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
if ( this . log . isFine ( ) ) this . log . logFine ( "denied_(" + urlRejectReason + ") Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "denied_(" + urlRejectReason + ")" ;
}
// check blacklist
if ( plasmaSwitchboard . urlBlacklist . isListed ( Blacklist . BLACKLIST_CRAWLER , entry . url ( ) ) ) {
reason = "url in blacklist" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is in blacklist. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "url in blacklist" ;
}
final CrawlProfile . entry profile = wordIndex . profilesActiveCrawls . getEntry ( entry . profileHandle ( ) ) ;
@ -198,36 +194,30 @@ public final class CrawlStacker {
// filter with must-match
if ( ( entry . depth ( ) > 0 ) & & ! profile . mustMatchPattern ( ) . matcher ( entry . url ( ) . toString ( ) ) . matches ( ) ) {
reason = "url does not match must-match filter" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' does not match must-match crawling filter '" + profile . mustMatchPattern ( ) . toString ( ) + "'. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "url does not match must-match filter" ;
}
// filter with must-not-match
if ( ( entry . depth ( ) > 0 ) & & profile . mustNotMatchPattern ( ) . matcher ( entry . url ( ) . toString ( ) ) . matches ( ) ) {
reason = "url matches must-not-match filter" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' does matches do-not-match crawling filter '" + profile . mustNotMatchPattern ( ) . toString ( ) + "'. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "url matches must-not-match filter" ;
}
// deny cgi
if ( entry . url ( ) . isCGI ( ) ) {
reason = "cgi url not allowed" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is CGI URL. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "cgi url not allowed" ;
}
// deny post properties
if ( entry . url ( ) . isPOST ( ) & & ! ( profile . crawlingQ ( ) ) ) {
reason = "post url not allowed" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is post URL. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "post url not allowed" ;
}
final yacyURL referrerURL = ( entry . referrerhash ( ) = = null ) ? null : nextQueue . getURL ( entry . referrerhash ( ) ) ;
@ -239,18 +229,16 @@ public final class CrawlStacker {
// deny urls whose host is not in the profile's granted domain list
if ( ! ( profile . grantedDomAppearance ( entry . url ( ) . getHost ( ) ) ) ) {
reason = "url does not match domain filter" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is not listed in granted domains. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "url does not match domain filter" ;
}
// deny urls that exceed allowed number of occurrences
if ( ! ( profile . grantedDomCount ( entry . url ( ) . getHost ( ) ) ) ) {
reason = "domain counter exceeded" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' appeared too often, a maximum of " + profile . domMaxPages ( ) + " is allowed. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' appeared too often, a maximum of " + profile . domMaxPages ( ) + " is allowed. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "domain counter exceeded" ;
}
// check if the url is double registered
@ -260,14 +248,12 @@ public final class CrawlStacker {
final boolean recrawl = ( oldEntry ! = null ) & & ( profile . recrawlIfOlder ( ) > oldEntry . loaddate ( ) . getTime ( ) ) ;
// do double-check
if ( ( dbocc ! = null ) & & ( ! recrawl ) ) {
reason = "double " + dbocc ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "double " + dbocc ;
}
if ( ( oldEntry ! = null ) & & ( ! recrawl ) ) {
reason = "double LURL" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is double registered in 'LURL'. " + "Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is double registered in 'LURL'. " + "Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "double LURL" ;
}
// show potential re-crawl
@ -313,7 +299,6 @@ public final class CrawlStacker {
return null ;
}
/ * *
* Test whether a url can be used for crawling / indexing
* This mainly checks if the url is in the declared domain ( local / global )