@@ -44,7 +44,6 @@ import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.ftp.FTPClient;
-import net.yacy.document.TextParser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
@@ -52,7 +51,6 @@ import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
 import net.yacy.repository.Blacklist;
 import net.yacy.repository.FilterEngine;
 import de.anomic.crawler.ResultURLs.EventOrigin;
 import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.crawler.retrieval.FTPLoader;
@@ -93,18 +91,18 @@ public final class CrawlStacker {
     }

-    private Map<String, DomProfile> doms;
+    private final Map<String, DomProfile> doms;

     // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
     public CrawlStacker(
-            CrawlQueues cq,
-            CrawlSwitchboard cs,
-            Segment indexSegment,
-            yacySeedDB peers,
-            boolean acceptLocalURLs,
-            boolean acceptGlobalURLs,
-            FilterEngine domainList) {
+            final CrawlQueues cq,
+            final CrawlSwitchboard cs,
+            final Segment indexSegment,
+            final yacySeedDB peers,
+            final boolean acceptLocalURLs,
+            final boolean acceptGlobalURLs,
+            final FilterEngine domainList) {
         this.nextQueue = cq;
         this.crawler = cs;
         this.indexSegment = indexSegment;
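Note: nearly every hunk in this patch applies the same two mechanical cleanups seen above: fields and parameters gain the final modifier, and field reads gain an explicit this. qualifier. A minimal sketch of the idiom, with illustrative names that are not taken from the patch:

    import java.util.Map;

    // before: mutable parameter, unqualified field access
    // after:  final parameter, this.-qualified field access
    final class DomCounter {
        private final Map<String, Integer> counts; // final: assigned exactly once, in the constructor

        DomCounter(final Map<String, Integer> counts) { // final parameter: cannot be reassigned in the body
            this.counts = counts;
        }

        int count(final String host) {
            final Integer c = this.counts.get(host); // explicit this. distinguishes fields from locals
            return c == null ? 0 : c.intValue();
        }
    }

Neither change alters runtime behavior; both only tighten the compile-time contract.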
@@ -122,17 +120,17 @@ public final class CrawlStacker {
     }

     private void domInc(final String domain, final String referrer, final int depth) {
-        final DomProfile dp = doms.get(domain);
+        final DomProfile dp = this.doms.get(domain);
         if (dp == null) {
             // new domain
-            doms.put(domain, new DomProfile(referrer, depth));
+            this.doms.put(domain, new DomProfile(referrer, depth));
         } else {
             // increase counter
             dp.inc();
         }
     }

     public String domName(final boolean attr, final int index) {
-        final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();
+        final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
         String domname = "";
         Map.Entry<String, DomProfile> ey;
         DomProfile dp;
@@ -195,7 +193,7 @@ public final class CrawlStacker {
         return false;
     }

-    public Request job(Request entry) {
+    public Request job(final Request entry) {
         // this is the method that is called by the busy thread from outside
         if (entry == null) return null;
@@ -204,7 +202,7 @@ public final class CrawlStacker {
             // if the url was rejected we store it into the error URL db
             if (rejectReason != null) {
-                nextQueue.errorURL.push(entry, ASCII.getBytes(peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+                this.nextQueue.errorURL.push(entry, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
             }
         } catch (final Exception e) {
             CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@@ -216,25 +214,25 @@ public final class CrawlStacker {
     public void enqueueEntry(final Request entry) {

         // DEBUG
-        if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
+        if (this.log.isFinest()) this.log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());

         if (prefetchHost(entry.url().getHost())) {
             try {
                 this.fastQueue.enQueue(entry);
                 //this.dnsHit++;
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
         } else {
             try {
                 this.slowQueue.enQueue(entry);
                 this.dnsMiss++;
-            } catch (InterruptedException e) {
+            } catch (final InterruptedException e) {
                 Log.logException(e);
             }
         }
     }

-    public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
+    public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
         new Thread() {
             public void run() {
                 enqueueEntries(initiator, profileHandle, hyperlinks, true);
@@ -242,15 +240,15 @@ public final class CrawlStacker {
         }.start();
     }

-    private void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
-        for (Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
+    private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
+        for (final Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
             if (e.getKey() == null) continue;
             // delete old entry, if exists to force a re-load of the url (thats wanted here)
             final DigestURI url = new DigestURI(e.getKey());
             final byte[] urlhash = url.hash();
             if (replace) {
-                indexSegment.urlMetadata().remove(urlhash);
+                this.indexSegment.urlMetadata().remove(urlhash);
                 this.nextQueue.urlRemove(urlhash);
                 String u = url.toNormalform(true, true);
                 if (u.endsWith("/")) {
@@ -259,11 +257,11 @@ public final class CrawlStacker {
                     u = u + "/index.html";
                 }
                 try {
-                    byte[] uh = new DigestURI(u, null).hash();
-                    indexSegment.urlMetadata().remove(uh);
+                    final byte[] uh = new DigestURI(u, null).hash();
+                    this.indexSegment.urlMetadata().remove(uh);
                     this.nextQueue.noticeURL.removeByURLHash(uh);
                     this.nextQueue.errorURL.remove(uh);
-                } catch (MalformedURLException e1) {}
+                } catch (final MalformedURLException e1) {}
             }

             if (url.getProtocol().equals("ftp")) {
@@ -301,12 +299,12 @@ public final class CrawlStacker {
                     DigestURI url = null;
                     try {
                         url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
-                    } catch (MalformedURLException e) {
+                    } catch (final MalformedURLException e) {
                         continue;
                     }
                     final byte[] urlhash = url.hash();
                     if (replace) {
-                        indexSegment.urlMetadata().remove(urlhash);
+                        CrawlStacker.this.indexSegment.urlMetadata().remove(urlhash);
                         cq.noticeURL.removeByURLHash(urlhash);
                         cq.errorURL.remove(urlhash);
                     }
@@ -325,8 +323,8 @@ public final class CrawlStacker {
                             entry.size
                     ));
                 }
-                } catch (IOException e1) {
-                } catch (InterruptedException e) {
+                } catch (final IOException e1) {
+                } catch (final InterruptedException e) {
                 }
             }
         }.start();
@@ -338,9 +336,9 @@ public final class CrawlStacker {
      * @return null if successfull, a reason string if not successful
      */
     public String stackSimpleCrawl(final DigestURI url) {
-        CrawlProfile pe = this.crawler.defaultSurrogateProfile;
+        final CrawlProfile pe = this.crawler.defaultSurrogateProfile;
         return stackCrawl(new Request(
-                peers.mySeed().hash.getBytes(),
+                this.peers.mySeed().hash.getBytes(),
                 url,
                 null,
                 "CRAWLING-ROOT",
@@ -361,11 +359,11 @@ public final class CrawlStacker {
     public String stackCrawl(final Request entry) {
         //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

-        final CrawlProfile profile = crawler.getActive(UTF8.getBytes(entry.profileHandle()));
+        final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
         String error;
         if (profile == null) {
             error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
-            log.logWarning(error);
+            this.log.logWarning(error);
             return error;
         }
@@ -373,16 +371,16 @@ public final class CrawlStacker {
         if (error != null) return error;

         // store information
-        final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), UTF8.getBytes(peers.mySeed().hash));
-        final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
-        final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle());
+        final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), UTF8.getBytes(this.peers.mySeed().hash));
+        final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(this.crawler.defaultProxyProfile.handle());
+        final boolean remote = profile.handle().equals(this.crawler.defaultRemoteProfile.handle());
         final boolean global =
             (profile.remoteIndexing()) /* granted */ &&
             (entry.depth() == profile.depth()) /* leaf node */ &&
             //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
             (
-                (peers.mySeed().isSenior()) ||
-                (peers.mySeed().isPrincipal())
+                (this.peers.mySeed().isSenior()) ||
+                (this.peers.mySeed().isPrincipal())
             ) /* qualified */;

         if (!local && !global && !remote && !proxy) {
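For orientation: these four flags drive the stack routing in the hunk at -420 below, where conflicting combinations are only logged as warnings and the entry is pushed according to the first flag that matches.

    // routing summary, read from the hunk at -420 below (sketch, not patch code):
    // global -> NoticedURL.StackType.LIMIT   (leaf-depth URLs eligible for remote indexing)
    // local  -> NoticedURL.StackType.CORE
    // proxy  -> NoticedURL.StackType.CORE
    // remote -> NoticedURL.StackType.REMOTE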
@@ -393,7 +391,7 @@ public final class CrawlStacker {
         long maxFileSize = Long.MAX_VALUE;
         if (entry.size() > 0) {
-            String protocol = entry.url().getProtocol();
+            final String protocol = entry.url().getProtocol();
             if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
             if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
             if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
@@ -401,15 +399,15 @@ public final class CrawlStacker {
         // check availability of parser and maxfilesize
         String warning = null;
-        if (entry.size() > maxFileSize ||
+        if (entry.size() > maxFileSize /* ||
            (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
-            ) {
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
+            */ ) {
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
             if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
             return null;
         }

-        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
+        final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : this.nextQueue.getURL(entry.referrerhash());

         // add domain to profile domain list
         if (profile.domMaxPages() != Integer.MAX_VALUE) {
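This hunk carries the patch's only behavioral change: the second half of the condition is commented out, so entries are now diverted to the NOLOAD stack solely when their reported size exceeds maxFileSize; the extension/parser test no longer contributes (judging from its use here, a non-null return from TextParser.supports appears to signal that no parser is available). This is also why the first hunk can drop the now-unused TextParser import. A sketch of the effective predicate before and after, reusing the patch's own types:

    // before this hunk: divert to NOLOAD when the entry is too large OR
    // its file extension has no supported parser (sketch, not patch code)
    static boolean noloadBefore(final Request entry, final long maxFileSize) {
        return entry.size() > maxFileSize
                || (entry.url().getFileExtension().length() > 0
                    && TextParser.supports(entry.url(), null) != null);
    }

    // after this hunk: the size check alone decides
    static boolean noloadAfter(final Request entry, final long maxFileSize) {
        return entry.size() > maxFileSize;
    }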
@@ -420,23 +418,23 @@ public final class CrawlStacker {
             // it may be possible that global == true and local == true, so do not check an error case against it
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
         } else if (local) {
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
         } else if (proxy) {
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
         } else if (remote) {
-            warning = nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
         }

         if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
         return null;
     }

-    public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) {
+    public String checkAcceptance(final DigestURI url, final CrawlProfile profile, final int depth) {

         // check if the protocol is supported
         final String urlProtocol = url.getProtocol();
@@ -483,14 +481,14 @@ public final class CrawlStacker {
         }

         // check if the url is double registered
-        final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
-        URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash());
+        final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
+        final URIMetadataRow oldEntry = this.indexSegment.urlMetadata().load(url.hash());
         if (oldEntry == null) {
             if (dbocc != null) {
                 // do double-check
                 if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
                 if (dbocc.equals("errors")) {
-                    ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
+                    final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
                     return "double in: errors (" + errorEntry.anycause() + ")";
                 } else {
                     return "double in: " + dbocc;
@@ -508,7 +506,7 @@ public final class CrawlStacker {
         } else {
             if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
             if (dbocc.equals("errors")) {
-                ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
+                final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
                 return "double in: errors (" + errorEntry.anycause() + ")";
             } else {
                 return "double in: " + dbocc;
@@ -520,7 +518,7 @@ public final class CrawlStacker {
         // deny urls that exceed allowed number of occurrences
         final int maxAllowedPagesPerDomain = profile.domMaxPages();
         if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) {
-            final DomProfile dp = doms.get(url.getHost());
+            final DomProfile dp = this.doms.get(url.getHost());
             if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "crawl stack domain counter exceeded";
@@ -559,7 +557,7 @@ public final class CrawlStacker {
         // check if this is a local address and we are allowed to index local pages:
         //boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
         //assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
-        InetAddress ia = Domains.dnsResolve(host);
+        final InetAddress ia = Domains.dnsResolve(host);
         return (local) ?
             ("the host '" + host + "' is local, but local addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress())) :
             ("the host '" + host + "' is global, but global addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress()));