@@ -41,17 +41,17 @@ import de.anomic.server.serverProcessor;
 import de.anomic.yacy.yacyURL;
 
 public final class CrawlStacker {
 
-    final Log log = new Log("STACKCRAWL");
+    private Log log = new Log("STACKCRAWL");
 
     private serverProcessor<CrawlEntry> fastQueue, slowQueue;
     private long dnsHit, dnsMiss;
     private CrawlQueues nextQueue;
     private plasmaWordIndex wordIndex;
     private boolean acceptLocalURLs, acceptGlobalURLs;
 
     // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
 
     public CrawlStacker(CrawlQueues cq, plasmaWordIndex wordIndex, boolean acceptLocalURLs, boolean acceptGlobalURLs) {
         this.nextQueue = cq;
         this.wordIndex = wordIndex;
@@ -59,10 +59,10 @@ public final class CrawlStacker {
         this.dnsMiss = 0;
         this.acceptLocalURLs = acceptLocalURLs;
         this.acceptGlobalURLs = acceptGlobalURLs;
 
         this.fastQueue = new serverProcessor<CrawlEntry>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
         this.slowQueue = new serverProcessor<CrawlEntry>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionally a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
 
         this.log.logInfo("STACKCRAWL thread initialized.");
     }
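The constructor splits stacking across two serverProcessor queues: a small fast pool for entries whose host is already in the DNS cache, and a larger slow pool that may block on a lookup. A minimal, self-contained sketch of that dispatch idea using only JDK types (names such as DnsGatedDispatcher and submit are illustrative assumptions, not YaCy API):

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class DnsGatedDispatcher {
    private final ExecutorService fastPool = Executors.newFixedThreadPool(2); // cheap checks, no lookups
    private final ExecutorService slowPool = Executors.newFixedThreadPool(5); // may block on DNS
    private final Set<String> resolvedHosts = ConcurrentHashMap.newKeySet(); // stand-in for the DNS cache

    public void submit(final String host, final Runnable check) {
        if (resolvedHosts.contains(host)) {
            fastPool.submit(check); // cache hit: run the checks immediately
        } else {
            slowPool.submit(() -> { // cache miss: resolve first, then run the checks
                try {
                    InetAddress.getByName(host);
                    resolvedHosts.add(host);
                    check.run();
                } catch (UnknownHostException e) {
                    // unresolvable host: drop the job, the crawler could not fetch it anyway
                }
            });
        }
    }
}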
@@ -74,20 +74,20 @@ public final class CrawlStacker {
         this.fastQueue.clear();
         this.slowQueue.clear();
     }
 
     public void announceClose() {
         this.log.logInfo("Flushing remaining " + size() + " crawl stacker job entries.");
         this.fastQueue.announceShutdown();
         this.slowQueue.announceShutdown();
     }
 
     public void close() {
         this.log.logInfo("Shutdown. waiting for remaining " + size() + " crawl stacker job entries. please wait.");
         this.fastQueue.announceShutdown();
         this.slowQueue.announceShutdown();
         this.fastQueue.awaitShutdown(2000);
         this.slowQueue.awaitShutdown(2000);
         this.log.logInfo("Shutdown. Closing stackCrawl queue.");
         clear();
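close() follows the usual two-phase shutdown: announce to both queues first, then wait on each, so the two queues drain concurrently instead of one after the other. A JDK analogue of the same sequence, assuming ExecutorService in place of YaCy's serverProcessor:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class TwoPhaseShutdown {
    public static void main(final String[] args) throws InterruptedException {
        final ExecutorService fast = Executors.newFixedThreadPool(2);
        final ExecutorService slow = Executors.newFixedThreadPool(5);

        fast.shutdown(); // phase 1: stop accepting new work, keep draining queued jobs
        slow.shutdown();
        fast.awaitTermination(2, TimeUnit.SECONDS); // phase 2: bounded wait, like awaitShutdown(2000)
        slow.awaitTermination(2, TimeUnit.SECONDS);
    }
}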
@@ -105,7 +105,7 @@ public final class CrawlStacker {
         // we just don't know anything about that host
         return false;
     }
 
     /*
     public boolean job() {
         if (this.fastQueue.queueSize() > 0 && job(this.fastQueue)) return true;
@@ -113,7 +113,7 @@ public final class CrawlStacker {
         return job(this.slowQueue);
     }
     */
 
     public CrawlEntry job(CrawlEntry entry) {
         // this is the method that is called by the busy thread from outside
         if (entry == null) return null;
@@ -133,11 +133,11 @@ public final class CrawlStacker {
         }
         return null;
     }
 
     public void enqueueEntry(final CrawlEntry entry) {
         // DEBUG
-        if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + entry.initiator() + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth());
+        if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + entry.initiator() + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth());
 
         if (prefetchHost(entry.url().getHost())) {
             try {
@@ -149,89 +149,79 @@ public final class CrawlStacker {
         } else {
             try {
                 this.slowQueue.enQueue(entry);
-                this.dnsMiss++;
+                this.dnsMiss++;
             } catch (InterruptedException e) {
                 e.printStackTrace();
             }
         }
     }
 
     public String stackCrawl(final CrawlEntry entry) {
         // stacks a crawl item. The position can also be remote
         // returns null if successful, a reason string if not successful
         //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
 
         final long startTime = System.currentTimeMillis();
-        String reason = null; // failure reason
 
         // check if the protocol is supported
         final String urlProtocol = entry.url().getProtocol();
         if (!nextQueue.isSupportedProtocol(urlProtocol)) {
-            reason = "unsupported protocol";
-            this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
-                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
+                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "unsupported protocol";
         }
 
         // check if ip is local ip address
         final String urlRejectReason = urlInAcceptedDomain(entry.url());
         if (urlRejectReason != null) {
-            reason = "denied_(" + urlRejectReason + ")";
-            if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "denied_(" + urlRejectReason + ")";
         }
 
         // check blacklist
         if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) {
-            reason = "url in blacklist";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
-                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "url in blacklist";
         }
 
         final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
         if (profile == null) {
             final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
             log.logWarning(errorMsg);
             return errorMsg;
         }
 
         // filter with must-match
         if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
-            reason = "url does not match must-match filter";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
-                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "url does not match must-match filter";
         }
 
         // filter with must-not-match
         if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
-            reason = "url matches must-not-match filter";
             if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' matches must-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "url matches must-not-match filter" ;
}
// deny cgi
if ( entry . url ( ) . isCGI ( ) ) {
reason = "cgi url not allowed" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is CGI URL. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is CGI URL. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "cgi url not allowed" ;
}
// deny post properties
if ( entry . url ( ) . isPOST ( ) & & ! ( profile . crawlingQ ( ) ) ) {
reason = "post url not allowed" ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is post URL. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return reason ;
if ( this . log . isFine ( ) ) this . log . logFine ( "URL '" + entry . url ( ) . toString ( ) + "' is post URL. " +
"Stack processing time: " + ( System . currentTimeMillis ( ) - startTime ) + "ms" ) ;
return "post url not allowed" ;
}
final yacyURL referrerURL = ( entry . referrerhash ( ) = = null ) ? null : nextQueue . getURL ( entry . referrerhash ( ) ) ;
// add domain to profile domain list
if ( ( profile . domFilterDepth ( ) ! = Integer . MAX_VALUE ) | | ( profile . domMaxPages ( ) ! = Integer . MAX_VALUE ) ) {
profile . domInc ( entry . url ( ) . getHost ( ) , ( referrerURL = = null ) ? null : referrerURL . getHost ( ) . toLowerCase ( ) , entry . depth ( ) ) ;
@@ -239,18 +229,16 @@ public final class CrawlStacker {
 
         // deny urls that do not match with the profile domain list
         if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
-            reason = "url does not match domain filter";
-            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
-                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
+                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "url does not match domain filter";
         }
 
         // deny urls that exceed allowed number of occurrences
         if (!(profile.grantedDomCount(entry.url().getHost()))) {
-            reason = "domain counter exceeded";
-            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
-                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
+                    "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "domain counter exceeded";
         }
 
         // check if the url is double registered
@@ -260,36 +248,34 @@ public final class CrawlStacker {
 
         final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
 
         // do double-check
         if ((dbocc != null) && (!recrawl)) {
-            reason = "double " + dbocc;
-            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "double " + dbocc;
         }
         if ((oldEntry != null) && (!recrawl)) {
-            reason = "double LURL";
-            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
-            return reason;
+            if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
+            return "double LURL";
         }
 
         // show potential re-crawl
         if (recrawl && oldEntry != null) {
             if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
                     ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
         }
         }
 
         // store information
         final boolean local = entry.initiator().equals(wordIndex.peers().mySeed().hash);
         final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(wordIndex.defaultProxyProfile.handle());
         final boolean remote = profile.handle().equals(wordIndex.defaultRemoteProfile.handle());
-        final boolean global =
+        final boolean global =
             (profile.remoteIndexing()) /* granted */ &&
-            (entry.depth() == profile.depth()) /* leaf node */ &&
+            (entry.depth() == profile.depth()) /* leaf node */ &&
             //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
             (
                 (wordIndex.peers().mySeed().isSenior()) ||
                 (wordIndex.peers().mySeed().isPrincipal())
             ) /* qualified */;
 
         if (!local && !global && !remote && !proxy) {
             this.log.logSevere("URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
         } else {
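stackCrawl() follows a reason-string contract: every check either falls through or returns a short rejection reason, and null means the entry was accepted; the refactoring in these hunks drops the intermediate reason variable and returns the literals directly. A minimal sketch of the same contract, with illustrative names only (not YaCy code):

import java.util.regex.Pattern;

public class ReasonContractDemo {
    // Mirrors the shape of stackCrawl(): run the checks in order and return the
    // first rejection reason; null means every check passed and the URL is stacked.
    static String check(final String url, final Pattern mustMatch) {
        if (!url.startsWith("http")) return "unsupported protocol";
        if (!mustMatch.matcher(url).matches()) return "url does not match must-match filter";
        return null; // accepted
    }

    public static void main(final String[] args) {
        final String reason = check("ftp://example.org/", Pattern.compile(".*"));
        System.out.println(reason == null ? "stacked" : "rejected: " + reason);
    }
}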
@@ -309,10 +295,9 @@ public final class CrawlStacker {
                 nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
             }
         }
 
         return null;
     }
 
     /**
      * Test a url if it can be used for crawling / indexing
@@ -344,11 +329,11 @@
                 ("the host '" + host + "' is local, but local addresses are not accepted") :
                 ("the host '" + host + "' is global, but global addresses are not accepted");
     }
 
     public boolean acceptLocalURLs() {
         return this.acceptLocalURLs;
     }
 
     public boolean acceptGlobalURLs() {
         return this.acceptGlobalURLs;
     }
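The two getters expose the acceptance flags consulted by urlInAcceptedDomain(). A sketch of the rule implied by the error messages above; the boolean hostIsLocal parameter is an assumption standing in for the real address classification, and this omits whatever the real method does for hosts that cannot be classified:

public class DomainAcceptance {
    // Returns null when the address class of the host matches what the stacker
    // accepts, otherwise a rejection reason in the style of urlInAcceptedDomain().
    static String check(final boolean hostIsLocal, final boolean acceptLocal, final boolean acceptGlobal, final String host) {
        if (hostIsLocal ? acceptLocal : acceptGlobal) return null; // accepted
        return hostIsLocal
            ? "the host '" + host + "' is local, but local addresses are not accepted"
            : "the host '" + host + "' is global, but global addresses are not accepted";
    }

    public static void main(final String[] args) {
        // a peer configured for global crawling only rejects intranet hosts
        System.out.println(check(true, false, true, "192.168.1.10"));
    }
}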