@@ -49,13 +49,13 @@ public final class CrawlStacker {
    private final Log log = new Log("STACKCRAWL");
    private final WorkflowProcessor<Request> fastQueue, slowQueue;
    //private long dnsHit;
    private long dnsMiss;
    private final CrawlQueues nextQueue;
    private final CrawlSwitchboard crawler;
    private final Segment indexSegment;
    private final yacySeedDB peers;
    private final boolean acceptLocalURLs, acceptGlobalURLs;
    // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
@@ -178,96 +178,142 @@ public final class CrawlStacker {
        // stacks a crawl item. The position can also be remote
        // returns null if successful, a reason string if not successful
        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

        final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle());
        String error;
        if (profile == null) {
            error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
            log.logWarning(error);
            return error;
        }

        error = checkAcceptance(entry.url(), profile, entry.depth());
        if (error != null) return error;
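        // resolve the referrer url from its hash, if one was given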
        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
        final long startTime = System.currentTimeMillis();

        // add domain to profile domain list
        if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
            profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
        }
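        // the per-domain counters updated above presumably feed the grantedDomAppearance
        // and grantedDomCount checks in checkAcceptance on later stacking calls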
        // store information
        final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), peers.mySeed().hash.getBytes());
        final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || new String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
        final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle());
        final boolean global =
            (profile.remoteIndexing()) /* granted */ &&
            (entry.depth() == profile.depth()) /* leaf node */ &&
            //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
            (
                (peers.mySeed().isSenior()) ||
                (peers.mySeed().isPrincipal())
            ) /* qualified */;

        if (!local && !global && !remote && !proxy) {
            error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle();
            this.log.logSevere(error);
            return error;
        }
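        // route the entry to a crawl stack; the stack types are assumed to mean:
        //   STACK_TYPE_LIMIT  - depth limit reached, candidate for global (remote) crawling
        //   STACK_TYPE_CORE   - local and proxy crawl jobs handled by this peer
        //   STACK_TYPE_REMOTE - crawl jobs delegated to this peer by a remote initiator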
        if (global) {
            // it may be possible that global == true and local == true, so do not check an error case against it
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
            //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT));
        } else if (local) {
            if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
            //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
        } else if (proxy) {
            if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
            //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
        } else if (remote) {
            //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
            nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
            //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
            //this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
        }

        return null;
    }
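    // follows the same contract as stackCrawl: returns null if the url passes all
    // checks, otherwise a reason string describing the rejection. A minimal usage
    // sketch (hypothetical url; assuming DigestURI offers a plain string constructor):
    //   final String reason = checkAcceptance(new DigestURI("http://example.org/"), profile, 0);
    //   if (reason != null) log.logFine("rejected: " + reason);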
    public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) {
        // check if the protocol is supported
        final String urlProtocol = url.getProtocol();
        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
            this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'.");
            return "unsupported protocol";
        }
        // check if ip is local ip address
        final String urlRejectReason = urlInAcceptedDomain(url);
        if (urlRejectReason != null) {
            if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ")");
            return "denied_(" + urlRejectReason + ")";
        }
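        // urlInAcceptedDomain (used above) presumably tests whether the url falls into
        // the declared local/global crawl domains (acceptLocalURLs/acceptGlobalURLs)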
        // check blacklist
        if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist.");
            return "url in blacklist";
        }
        // filter with must-match
        if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'.");
            return "url does not match must-match filter";
        }
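        // note: this filter and the must-not-match filter below are guarded by depth > 0,
        // so the crawl start url itself is never rejected by the profile patterns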
        // filter with must-not-match
        if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' matches must-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'.");
            return "url matches must-not-match filter";
        }
        // deny cgi
        if (url.isIndividual()) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
            return "cgi url not allowed";
        }
        // deny post properties
        if (url.isPOST() && !(profile.crawlingQ())) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL.");
            return "post url not allowed";
        }
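        // crawlingQ() presumably flags profiles that may crawl urls carrying query or
        // post parameters; without that flag such urls are rejected above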
        // deny urls that do not match with the profile domain list
        if (!(profile.grantedDomAppearance(url.getHost()))) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains.");
            return "url does not match domain filter";
        }
        // deny urls that exceed allowed number of occurrences
        if (!(profile.grantedDomCount(url.getHost()))) {
            if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");
            return "domain counter exceeded";
        }
        // check if the url is double registered
        final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
        URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash(), null, 0);
        if (oldEntry == null) {
            if (dbocc != null) {
                // do double-check
                if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
                if (dbocc.equals("errors")) {
                    ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                    return "double in: errors (" + errorEntry.anycause() + ")";
                } else {
                    return "double in: " + dbocc;
@@ -277,15 +323,15 @@ public final class CrawlStacker {
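            // assumption: recrawlIfOlder() yields a cutoff time in epoch milliseconds;
            // entries loaded before that cutoff are stale and may be crawled again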
            final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
            if (recrawl) {
                if (this.log.isFine())
                    this.log.logFine("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
                        ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
            } else {
                if (dbocc == null) {
                    return "double in: LURL-DB";
                } else {
                    if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
                    if (dbocc.equals("errors")) {
                        ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
                        return "double in: errors (" + errorEntry.anycause() + ")";
                    } else {
                        return "double in: " + dbocc;
@@ -293,57 +339,11 @@ public final class CrawlStacker {
                }
            }
        }
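        // all checks passed; null signals that the url may be stacked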
        return null;
    }
    /**
     * Test a url if it can be used for crawling/indexing
     * This mainly checks if the url is in the declared domain (local/global)