@ -8,10 +8,6 @@
// This file was contributed by Martin Thelian
// ([MC] removed all multithreading and thread pools, this is not necessary here; complete renovation 2007)
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -31,7 +27,6 @@ package net.yacy.crawler;
import java.io.IOException ;
import java.net.InetAddress ;
import java.net.MalformedURLException ;
import java.net.UnknownHostException ;
import java.util.Date ;
import java.util.Locale ;
import java.util.Map ;
@ -76,7 +71,7 @@ public final class CrawlStacker {
private final Log log = new Log ( "STACKCRAWL" ) ;
private final RobotsTxt robots ;
private final WorkflowProcessor < Request > fastQueue, slow Queue;
private final WorkflowProcessor < Request > request Queue;
private final CrawlQueues nextQueue ;
private final CrawlSwitchboard crawler ;
private final Segment indexSegment ;
@ -103,56 +98,37 @@ public final class CrawlStacker {
this . acceptLocalURLs = acceptLocalURLs ;
this . acceptGlobalURLs = acceptGlobalURLs ;
this . domainList = domainList ;
this . fastQueue = new WorkflowProcessor < Request > ( "CrawlStackerFast" , "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)" , new String [ ] { "Balancer" } , this , "job" , 10000 , null , WorkflowProcessor . availableCPU ) ;
this . slowQueue = new WorkflowProcessor < Request > ( "CrawlStackerSlow" , "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache." , new String [ ] { "Balancer" } , this , "job" , 1000 , null , 2 ) ;
this . requestQueue = new WorkflowProcessor < Request > ( "CrawlStacker" , "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)" , new String [ ] { "Balancer" } , this , "job" , 10000 , null , WorkflowProcessor . availableCPU ) ;
this . log . logInfo ( "STACKCRAWL thread initialized." ) ;
}
public int size ( ) {
return this . fastQueue. queueSize ( ) + this . slow Queue. queueSize ( ) ;
return this . request Queue. queueSize ( ) ;
}
public boolean isEmpty ( ) {
if ( ! this . fastQueue . queueIsEmpty ( ) ) return false ;
if ( ! this . slowQueue . queueIsEmpty ( ) ) return false ;
if ( ! this . requestQueue . queueIsEmpty ( ) ) return false ;
return true ;
}
public void clear ( ) {
this . fastQueue . clear ( ) ;
this . slowQueue . clear ( ) ;
this . requestQueue . clear ( ) ;
}
public void announceClose ( ) {
this . log . logInfo ( "Flushing remaining " + size ( ) + " crawl stacker job entries." ) ;
this . fastQueue . shutdown ( ) ;
this . slowQueue . shutdown ( ) ;
this . requestQueue . shutdown ( ) ;
}
public synchronized void close ( ) {
this . log . logInfo ( "Shutdown. waiting for remaining " + size ( ) + " crawl stacker job entries. please wait." ) ;
this . fastQueue . shutdown ( ) ;
this . slowQueue . shutdown ( ) ;
this . requestQueue . shutdown ( ) ;
this . log . logInfo ( "Shutdown. Closing stackCrawl queue." ) ;
clear ( ) ;
}
private static boolean prefetchHost ( final String host ) {
// returns true when the host was known in the dns cache.
// If not, the host is stacked on the fetch stack and false is returned
try {
if ( Domains . dnsResolveFromCache ( host ) ! = null ) return true ; // found entry
} catch ( final UnknownHostException e ) {
// we know that this is unknown
return false ;
}
// we just don't know anything about that host
return false ;
}
public Request job ( final Request entry ) {
// this is the method that is called by the busy thread from outside
if ( entry = = null ) return null ;
@ -175,20 +151,9 @@ public final class CrawlStacker {
// DEBUG
if ( this . log . isFinest ( ) ) this . log . logFinest ( "ENQUEUE " + entry . url ( ) + ", referer=" + entry . referrerhash ( ) + ", initiator=" + ( ( entry . initiator ( ) = = null ) ? "" : ASCII . String ( entry . initiator ( ) ) ) + ", name=" + entry . name ( ) + ", appdate=" + entry . appdate ( ) + ", depth=" + entry . depth ( ) ) ;
if ( prefetchHost ( entry . url ( ) . getHost ( ) ) ) {
try {
this . fastQueue . enQueue ( entry ) ;
//this.dnsHit++;
} catch ( final InterruptedException e ) {
Log . logException ( e ) ;
}
} else {
try {
this . slowQueue . enQueue ( entry ) ;
} catch ( final InterruptedException e ) {
Log . logException ( e ) ;
}
try {
this . requestQueue . enQueue ( entry ) ;
} catch ( InterruptedException e ) {
}
}
public void enqueueEntriesAsynchronous ( final byte [ ] initiator , final String profileHandle , final Map < DigestURI , Properties > hyperlinks ) {