@ -228,9 +228,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String f = getConfig ( "proxyBlackListsActive" , null ) ;
if ( f ! = null ) {
urlBlacklist . loadLists ( "black" , f , "/" ) ;
log . log System ( "loaded black-list from file " + f + ", " + urlBlacklist . size ( ) + " entries" ) ;
log . log Config ( "loaded black-list from file " + f + ", " + urlBlacklist . size ( ) + " entries" ) ;
}
log . log System ( "Proxy Handler Initialized" ) ;
log . log Config ( "Proxy Handler Initialized" ) ;
// load stopwords
if ( stopwords = = null ) {
@ -245,21 +245,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int ramHTTP = Integer . parseInt ( getConfig ( "ramCacheHTTP" , "1024" ) ) / 1024 ;
int ramMessage = Integer . parseInt ( getConfig ( "ramCacheMessage" , "1024" ) ) / 1024 ;
int ramWiki = Integer . parseInt ( getConfig ( "ramCacheWiki" , "1024" ) ) / 1024 ;
log . log System ( "LURL Cache memory = " + ppRamString ( ramLURL ) ) ;
log . log System ( "NURL Cache memory = " + ppRamString ( ramNURL ) ) ;
log . log System ( "EURL Cache memory = " + ppRamString ( ramEURL ) ) ;
log . log System ( "RWI Cache memory = " + ppRamString ( ramRWI ) ) ;
log . log System ( "HTTP Cache memory = " + ppRamString ( ramHTTP ) ) ;
log . log System ( "Message Cache memory = " + ppRamString ( ramMessage ) ) ;
log . log System ( "Wiki Cache memory = " + ppRamString ( ramWiki ) ) ;
log . log Config ( "LURL Cache memory = " + ppRamString ( ramLURL ) ) ;
log . log Config ( "NURL Cache memory = " + ppRamString ( ramNURL ) ) ;
log . log Config ( "EURL Cache memory = " + ppRamString ( ramEURL ) ) ;
log . log Config ( "RWI Cache memory = " + ppRamString ( ramRWI ) ) ;
log . log Config ( "HTTP Cache memory = " + ppRamString ( ramHTTP ) ) ;
log . log Config ( "Message Cache memory = " + ppRamString ( ramMessage ) ) ;
log . log Config ( "Wiki Cache memory = " + ppRamString ( ramWiki ) ) ;
// make crawl profiles database and default profiles
log . log System ( "Initializing Crawl Profiles" ) ;
log . log Config ( "Initializing Crawl Profiles" ) ;
profiles = new plasmaCrawlProfile ( new File ( plasmaPath , "crawlProfiles0.db" ) ) ;
initProfiles ( ) ;
// start indexing management
log . log System ( "Starting Indexing Management" ) ;
log . log Config ( "Starting Indexing Management" ) ;
urlPool = new plasmaURLPool ( plasmaPath , ramLURL , ramNURL , ramEURL ) ;
wordIndex = new plasmaWordIndex ( plasmaPath , ramRWI , log ) ;
@ -268,7 +268,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
searchManager = new plasmaSearch ( urlPool . loadedURL , wordIndex ) ;
// start a cache manager
log . log System ( "Starting HT Cache Manager" ) ;
log . log Config ( "Starting HT Cache Manager" ) ;
// create the Cache directory - Borg-0300
String cp = getConfig ( "proxyCache" , "DATA/HTCACHE" ) ;
@ -278,7 +278,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ( ! ( htCachePath . exists ( ) ) ) htCachePath . mkdirs ( ) ;
if ( ! ( htCachePath . isDirectory ( ) ) ) {
// if the cache does not exist or is a file and not a directory, panic
serverLog . log System ( "PLASMA" , "the cache path " + htCachePath . toString ( ) + " is not a directory or does not exists and cannot be created" ) ;
serverLog . log Config ( "PLASMA" , "the cache path " + htCachePath . toString ( ) + " is not a directory or does not exists and cannot be created" ) ;
System . exit ( 0 ) ;
} else {
serverLog . logInfo ( "PLASMA" , "proxyCache=" + cp ) ;
@ -288,24 +288,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this . cacheManager = new plasmaHTCache ( htCachePath , maxCacheSize , ramHTTP ) ;
// make parser
log . log System ( "Starting Parser" ) ;
log . log Config ( "Starting Parser" ) ;
this . parser = new plasmaParser ( ) ;
// initialize switchboard queue
sbQueue = new plasmaSwitchboardQueue ( this . cacheManager , urlPool . loadedURL , new File ( plasmaPath , "switchboardQueue1.stack" ) , 10 , profiles ) ;
// define an extension-blacklist
log . log System ( "Parser: Initializing Extension Mappings for Media/Parser" ) ;
log . log Config ( "Parser: Initializing Extension Mappings for Media/Parser" ) ;
plasmaParser . initMediaExt ( plasmaParser . extString2extList ( getConfig ( "mediaExt" , "" ) ) ) ;
plasmaParser . initSupportedRealtimeFileExt ( plasmaParser . extString2extList ( getConfig ( "parseableExt" , "" ) ) ) ;
// define a realtime parsable mimetype list
log . log System ( "Parser: Initializing Mime Types" ) ;
log . log Config ( "Parser: Initializing Mime Types" ) ;
plasmaParser . initRealtimeParsableMimeTypes ( getConfig ( "parseableRealtimeMimeTypes" , "application/xhtml+xml,text/html,text/plain" ) ) ;
plasmaParser . initParseableMimeTypes ( getConfig ( "parseableMimeTypes" , null ) ) ;
// start a loader
log . log System ( "Starting Crawl Loader" ) ;
log . log Config ( "Starting Crawl Loader" ) ;
int remoteport ;
try { remoteport = Integer . parseInt ( getConfig ( "remoteProxyPort" , "3128" ) ) ; }
catch ( NumberFormatException e ) { remoteport = 3128 ; }
@ -318,19 +318,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
this . log ) ;
// init boards
log . log System ( "Starting Message Board" ) ;
log . log Config ( "Starting Message Board" ) ;
messageDB = new messageBoard ( new File ( getRootPath ( ) , "DATA/SETTINGS/message.db" ) , ramMessage ) ;
log . log System ( "Starting Wiki Board" ) ;
log . log Config ( "Starting Wiki Board" ) ;
wikiDB = new wikiBoard ( new File ( getRootPath ( ) , "DATA/SETTINGS/wiki.db" ) ,
new File ( getRootPath ( ) , "DATA/SETTINGS/wiki-bkp.db" ) , ramWiki ) ;
// init cookie-Monitor
log . log System ( "Starting Cookie Monitor" ) ;
log . log Config ( "Starting Cookie Monitor" ) ;
outgoingCookies = new HashMap ( ) ;
incomingCookies = new HashMap ( ) ;
// clean up profiles
log . log System ( "Cleaning Profiles" ) ;
log . log Config ( "Cleaning Profiles" ) ;
cleanProfiles ( ) ;
// init facility DB
@ -350,13 +350,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* /
// generate snippets cache
log . log System ( "Initializing Snippet Cache" ) ;
log . log Config ( "Initializing Snippet Cache" ) ;
snippetCache = new plasmaSnippetCache ( cacheManager , parser ,
remoteProxyHost , remoteProxyPort , remoteProxyUse ,
log ) ;
// start yacy core
log . log System ( "Starting YaCy Protocol Core" ) ;
log . log Config ( "Starting YaCy Protocol Core" ) ;
//try{Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} // for profiler
this . yc = new yacyCore ( this ) ;
//log.logSystem("Started YaCy Protocol Core");
@ -364,7 +364,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
serverInstantThread . oneTimeJob ( yc , "loadSeeds" , yc . log , 3000 ) ;
// deploy threads
log . log System ( "Starting Threads" ) ;
log . log Config ( "Starting Threads" ) ;
System . gc ( ) ; // help for profiler
int indexing_cluster = Integer . parseInt ( getConfig ( "80_indexing_cluster" , "1" ) ) ;
if ( indexing_cluster < 1 ) indexing_cluster = 1 ;
@ -424,7 +424,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260);
sb = this ;
log . log System ( "Finished Switchboard Initialization" ) ;
log . log Config ( "Finished Switchboard Initialization" ) ;
}
public static plasmaSwitchboard getSwitchboard ( ) {
@ -554,12 +554,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
public void close ( ) {
log . log System ( "SWITCHBOARD SHUTDOWN STEP 1: sending termination signal to managed threads:" ) ;
log . log Config ( "SWITCHBOARD SHUTDOWN STEP 1: sending termination signal to managed threads:" ) ;
terminateAllThreads ( true ) ;
log . log System ( "SWITCHBOARD SHUTDOWN STEP 2: sending termination signal to threaded indexing (stand by..)" ) ;
log . log Config ( "SWITCHBOARD SHUTDOWN STEP 2: sending termination signal to threaded indexing (stand by..)" ) ;
int waitingBoundSeconds = Integer . parseInt ( getConfig ( "maxWaitingWordFlush" , "120" ) ) ;
wordIndex . close ( waitingBoundSeconds ) ;
log . log System ( "SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager" ) ;
log . log Config ( "SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager" ) ;
try {
indexDistribution . close ( ) ;
cacheLoader . close ( ) ;
@ -572,7 +572,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cacheManager . close ( ) ;
sbQueue . close ( ) ;
} catch ( IOException e ) { }
log . log System ( "SWITCHBOARD SHUTDOWN TERMINATED" ) ;
log . log Config ( "SWITCHBOARD SHUTDOWN TERMINATED" ) ;
}
public int queueSize ( ) {
@ -592,14 +592,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
try {
sbQueue . push ( ( plasmaSwitchboardQueue . Entry ) job ) ;
} catch ( IOException e ) {
log . log Error ( "IOError in plasmaSwitchboard.enQueue: " + e . getMessage ( ) , e ) ;
log . log Failure ( "IOError in plasmaSwitchboard.enQueue: " + e . getMessage ( ) , e ) ;
}
}
public boolean deQueue ( ) {
// work off fresh entries from the proxy or from the crawler
if ( onlineCaution ( ) ) {
log . log Debug ( "deQueue: online caution, omitting resource stack processing" ) ;
log . log Fine ( "deQueue: online caution, omitting resource stack processing" ) ;
return false ;
}
plasmaSwitchboardQueue . Entry nextentry ;
@ -610,7 +610,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// do one processing step
log . log Debug ( "DEQUEUE: sbQueueSize=" + sbQueue . size ( ) +
log . log Fine ( "DEQUEUE: sbQueueSize=" + sbQueue . size ( ) +
", coreStackSize=" + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_CORE ) +
", limitStackSize=" + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_LIMIT ) +
", overhangStackSize=" + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_OVERHANG ) +
@ -619,7 +619,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
nextentry = sbQueue . pop ( ) ;
if ( nextentry = = null ) return false ;
} catch ( IOException e ) {
log . log Error ( "IOError in plasmaSwitchboard.deQueue: " + e . getMessage ( ) , e ) ;
log . log Failure ( "IOError in plasmaSwitchboard.deQueue: " + e . getMessage ( ) , e ) ;
return false ;
}
}
@ -705,17 +705,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false ;
}
if ( sbQueue . size ( ) > = indexingSlots ) {
log . log Debug ( "CoreCrawl: too many processes in indexing queue, dismissed (" +
log . log Fine ( "CoreCrawl: too many processes in indexing queue, dismissed (" +
"sbQueueSize=" + sbQueue . size ( ) + ")" ) ;
return false ;
}
if ( cacheLoader . size ( ) > = crawlSlots ) {
log . log Debug ( "CoreCrawl: too many processes in loader queue, dismissed (" +
log . log Fine ( "CoreCrawl: too many processes in loader queue, dismissed (" +
"cacheLoader=" + cacheLoader . size ( ) + ")" ) ;
return false ;
}
if ( onlineCaution ( ) ) {
log . log Debug ( "CoreCrawl: online caution, omitting processing" ) ;
log . log Fine ( "CoreCrawl: online caution, omitting processing" ) ;
return false ;
}
// if the server is busy, we do crawling more slowly
@ -735,21 +735,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCrawlNURL . Entry urlEntry = urlPool . noticeURL . pop ( plasmaCrawlNURL . STACK_TYPE_CORE ) ;
String stats = "LOCALCRAWL[" + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_CORE ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_LIMIT ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_OVERHANG ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_REMOTE ) + "]" ;
if ( ( urlEntry . url ( ) = = null ) | | ( urlEntry . url ( ) . toString ( ) . length ( ) < 10 ) ) {
log . log Error ( stats + ": urlEntry.url() == null. URL-Hash: " + ( ( urlEntry . hash ( ) = = null ) ? "Unknown" : urlEntry . hash ( ) ) ) ;
log . log Failure ( stats + ": urlEntry.url() == null. URL-Hash: " + ( ( urlEntry . hash ( ) = = null ) ? "Unknown" : urlEntry . hash ( ) ) ) ;
return true ;
}
String profileHandle = urlEntry . profileHandle ( ) ;
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
if ( profileHandle = = null ) {
log . log Error ( stats + ": NULL PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
log . log Failure ( stats + ": NULL PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
return true ;
}
plasmaCrawlProfile . entry profile = profiles . getEntry ( profileHandle ) ;
if ( profile = = null ) {
log . log Error ( stats + ": LOST PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
log . log Failure ( stats + ": LOST PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
return true ;
}
log . log Debug ( "LOCALCRAWL: url=" + urlEntry . url ( ) + ", initiator=" + urlEntry . initiator ( ) +
log . log Fine ( "LOCALCRAWL: url=" + urlEntry . url ( ) + ", initiator=" + urlEntry . initiator ( ) +
", crawlOrder=" + ( ( profile . remoteIndexing ( ) ) ? "true" : "false" ) + ", depth=" + urlEntry . depth ( ) + ", crawlDepth=" + profile . generalDepth ( ) + ", filter=" + profile . generalFilter ( ) +
", permission=" + ( ( yacyCore . seedDB = = null ) ? "undefined" : ( ( ( yacyCore . seedDB . mySeed . isSenior ( ) ) | | ( yacyCore . seedDB . mySeed . isPrincipal ( ) ) ) ? "true" : "false" ) ) ) ;
@ -782,12 +782,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ( sbQueue . size ( ) > = indexingSlots ) {
log . log Debug ( "LimitCrawl: too many processes in indexing queue, dismissed to protect emergency case (" +
log . log Fine ( "LimitCrawl: too many processes in indexing queue, dismissed to protect emergency case (" +
"sbQueueSize=" + sbQueue . size ( ) + ")" ) ;
return false ;
}
if ( cacheLoader . size ( ) > = crawlSlots ) {
log . log Debug ( "LimitCrawl: too many processes in loader queue, dismissed to protect emergency case (" +
log . log Fine ( "LimitCrawl: too many processes in loader queue, dismissed to protect emergency case (" +
"cacheLoader=" + cacheLoader . size ( ) + ")" ) ;
return false ;
}
@ -810,17 +810,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCrawlNURL . Entry urlEntry = urlPool . noticeURL . pop ( plasmaCrawlNURL . STACK_TYPE_LIMIT ) ;
String stats = "REMOTECRAWLTRIGGER[" + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_CORE ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_LIMIT ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_OVERHANG ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_REMOTE ) + "]" ;
if ( urlEntry . url ( ) = = null ) {
log . log Error ( stats + ": urlEntry.url() == null" ) ;
log . log Failure ( stats + ": urlEntry.url() == null" ) ;
return true ;
}
String profileHandle = urlEntry . profileHandle ( ) ;
//System.out.println("DEBUG plasmaSwitchboard.processCrawling: profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
plasmaCrawlProfile . entry profile = profiles . getEntry ( profileHandle ) ;
if ( profile = = null ) {
log . log Error ( stats + ": LOST PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
log . log Failure ( stats + ": LOST PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
return true ;
}
log . log Debug ( "plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry . url ( ) + ", initiator=" + urlEntry . initiator ( ) +
log . log Fine ( "plasmaSwitchboard.limitCrawlTriggerJob: url=" + urlEntry . url ( ) + ", initiator=" + urlEntry . initiator ( ) +
", crawlOrder=" + ( ( profile . remoteIndexing ( ) ) ? "true" : "false" ) + ", depth=" + urlEntry . depth ( ) + ", crawlDepth=" + profile . generalDepth ( ) + ", filter=" + profile . generalFilter ( ) +
", permission=" + ( ( yacyCore . seedDB = = null ) ? "undefined" : ( ( ( yacyCore . seedDB . mySeed . isSenior ( ) ) | | ( yacyCore . seedDB . mySeed . isPrincipal ( ) ) ) ? "true" : "false" ) ) ) ;
@ -854,7 +854,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false ;
}
if ( onlineCaution ( ) ) {
log . log Debug ( "GlobalCrawl: online caution, omitting processing" ) ;
log . log Fine ( "GlobalCrawl: online caution, omitting processing" ) ;
return false ;
}
@ -872,7 +872,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCrawlNURL . Entry urlEntry = urlPool . noticeURL . pop ( plasmaCrawlNURL . STACK_TYPE_REMOTE ) ;
String stats = "REMOTETRIGGEREDCRAWL[" + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_CORE ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_LIMIT ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_OVERHANG ) + ", " + urlPool . noticeURL . stackSize ( plasmaCrawlNURL . STACK_TYPE_REMOTE ) + "]" ;
if ( urlEntry . url ( ) = = null ) {
log . log Error ( stats + ": urlEntry.url() == null" ) ;
log . log Failure ( stats + ": urlEntry.url() == null" ) ;
return false ;
}
String profileHandle = urlEntry . profileHandle ( ) ;
@ -880,10 +880,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCrawlProfile . entry profile = profiles . getEntry ( profileHandle ) ;
if ( profile = = null ) {
log . log Error ( stats + ": LOST PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
log . log Failure ( stats + ": LOST PROFILE HANDLE '" + urlEntry . profileHandle ( ) + "' (must be internal error) for URL " + urlEntry . url ( ) ) ;
return false ;
}
log . log Debug ( "plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry . url ( ) + ", initiator=" + urlEntry . initiator ( ) +
log . log Fine ( "plasmaSwitchboard.remoteTriggeredCrawlJob: url=" + urlEntry . url ( ) + ", initiator=" + urlEntry . initiator ( ) +
", crawlOrder=" + ( ( profile . remoteIndexing ( ) ) ? "true" : "false" ) + ", depth=" + urlEntry . depth ( ) + ", crawlDepth=" + profile . generalDepth ( ) + ", filter=" + profile . generalFilter ( ) +
", permission=" + ( ( yacyCore . seedDB = = null ) ? "undefined" : ( ( ( yacyCore . seedDB . mySeed . isSenior ( ) ) | | ( yacyCore . seedDB . mySeed . isPrincipal ( ) ) ) ? "true" : "false" ) ) ) ;
@ -916,7 +916,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
processCase = 6 ;
}
log . log Debug ( "processResourceStack processCase=" + processCase +
log . log Fine ( "processResourceStack processCase=" + processCase +
", depth=" + entry . depth ( ) +
", maxDepth=" + ( ( entry . profile ( ) = = null ) ? "null" : Integer . toString ( entry . profile ( ) . generalDepth ( ) ) ) +
", filter=" + ( ( entry . profile ( ) = = null ) ? "null" : entry . profile ( ) . generalFilter ( ) ) +
@ -930,18 +930,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
( ( entry . responseHeader ( ) ! = null ) & &
( plasmaParser . supportedMimeTypesContains ( entry . responseHeader ( ) . mime ( ) ) ) ) ) {
if ( entry . cacheFile ( ) . exists ( ) ) {
log . log Debug ( "(Parser) '" + entry . normalizedURLString ( ) + "' is not parsed yet, parsing now from File" ) ;
log . log Fine ( "(Parser) '" + entry . normalizedURLString ( ) + "' is not parsed yet, parsing now from File" ) ;
document = parser . parseSource ( entry . url ( ) , ( entry . responseHeader ( ) = = null ) ? null : entry . responseHeader ( ) . mime ( ) , entry . cacheFile ( ) ) ;
} else {
log . log Debug ( "(Parser) '" + entry . normalizedURLString ( ) + "' cannot be parsed, no resource available" ) ;
log . log Fine ( "(Parser) '" + entry . normalizedURLString ( ) + "' cannot be parsed, no resource available" ) ;
return ;
}
if ( document = = null ) {
log . log Error ( "(Parser) '" + entry . normalizedURLString ( ) + "' parse failure" ) ;
log . log Failure ( "(Parser) '" + entry . normalizedURLString ( ) + "' parse failure" ) ;
return ;
}
} else {
log . log Debug ( "(Parser) '" + entry . normalizedURLString ( ) + "'. Unsupported mimeType '" + ( ( entry . responseHeader ( ) = = null ) ? null : entry . responseHeader ( ) . mime ( ) ) + "'." ) ;
log . log Fine ( "(Parser) '" + entry . normalizedURLString ( ) + "'. Unsupported mimeType '" + ( ( entry . responseHeader ( ) = = null ) ? null : entry . responseHeader ( ) . mime ( ) ) + "'." ) ;
return ;
}
@ -991,7 +991,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
if ( noIndexReason = = null ) {
// strip out words
log . log Debug ( "Condensing for '" + entry . normalizedURLString ( ) + "'" ) ;
log . log Fine ( "Condensing for '" + entry . normalizedURLString ( ) + "'" ) ;
plasmaCondenser condenser = new plasmaCondenser ( new ByteArrayInputStream ( document . getText ( ) ) ) ;
//log.logInfo("INDEXING HEADLINE:" + descr);
@ -1035,10 +1035,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyClient . crawlReceipt ( initiator , "crawl" , "fill" , "indexed" , newEntry , "" ) ;
}
} else {
log . log Debug ( "Not Indexed Resource '" + entry . normalizedURLString ( ) + "': process case=" + processCase ) ;
log . log Fine ( "Not Indexed Resource '" + entry . normalizedURLString ( ) + "': process case=" + processCase ) ;
}
} catch ( Exception ee ) {
log . log Error ( "Could not index URL " + entry . url ( ) + ": " + ee . getMessage ( ) , ee ) ;
log . log Failure ( "Could not index URL " + entry . url ( ) + ": " + ee . getMessage ( ) , ee ) ;
if ( ( processCase = = 6 ) & & ( initiator ! = null ) ) {
yacyClient . crawlReceipt ( initiator , "crawl" , "exception" , ee . getMessage ( ) , null , "" ) ;
}
@ -1058,7 +1058,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
document = null ;
} catch ( IOException e ) {
log . log Error ( "ERROR in plasmaSwitchboard.process(): " + e . toString ( ) ) ;
log . log Failure ( "ERROR in plasmaSwitchboard.process(): " + e . toString ( ) ) ;
} finally {
// explicit delete/free resources
if ( ( entry ! = null ) & & ( entry . profile ( ) ! = null ) & & ( ! ( entry . profile ( ) . storeHTCache ( ) ) ) ) cacheManager . deleteFile ( entry . url ( ) ) ;
@ -1075,7 +1075,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// strange errors
if ( nexturlString = = null ) {
reason = "denied_(url_null)" ;
log . log Error ( "Wrong URL in stackCrawl: url=null" ) ;
log . log Failure ( "Wrong URL in stackCrawl: url=null" ) ;
return reason ;
}
/ *
@ -1092,7 +1092,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
nexturl = new URL ( nexturlString ) ;
} catch ( MalformedURLException e ) {
reason = "denied_(url_'" + nexturlString + "'_wrong)" ;
log . log Error ( "Wrong URL in stackCrawl: " + nexturlString ) ;
log . log Failure ( "Wrong URL in stackCrawl: " + nexturlString ) ;
return reason ;
}
@ -1177,13 +1177,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// are we qualified?
if ( ( yacyCore . seedDB . mySeed = = null ) | |
( yacyCore . seedDB . mySeed . isJunior ( ) ) ) {
log . log Debug ( "plasmaSwitchboard.processRemoteCrawlTrigger: no permission" ) ;
log . log Fine ( "plasmaSwitchboard.processRemoteCrawlTrigger: no permission" ) ;
return false ;
}
// check url
if ( urlEntry . url ( ) = = null ) {
log . log Debug ( "ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry . name ( ) ) ;
log . log Fine ( "ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry . name ( ) ) ;
return true ;
}
String urlhash = plasmaURL . urlHash ( urlEntry . url ( ) ) ;
@ -1192,7 +1192,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacySeed remoteSeed = yacyCore . dhtAgent . getCrawlSeed ( urlhash ) ;
if ( remoteSeed = = null ) {
log . log Debug ( "plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available" ) ;
log . log Fine ( "plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available" ) ;
return false ;
}
@ -1222,7 +1222,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ( remoteSeed ! = null ) yacyCore . peerActions . peerDeparture ( remoteSeed ) ;
return false ;
} else try {
log . log Debug ( "plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed . getName ( ) + ", url=" + urlEntry . url ( ) . toString ( ) + ", response=" + page . toString ( ) ) ; // DEBUG
log . log Fine ( "plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed . getName ( ) + ", url=" + urlEntry . url ( ) . toString ( ) + ", response=" + page . toString ( ) ) ; // DEBUG
int newdelay = Integer . parseInt ( ( String ) page . get ( "delay" ) ) ;
yacyCore . dhtAgent . setCrawlDelay ( remoteSeed . hash , newdelay ) ;
@ -1250,7 +1250,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
} catch ( Exception e ) {
// wrong values
log . log Error ( "REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed . getName ( ) + " FAILED. CLIENT RETURNED: " + page . toString ( ) , e ) ;
log . log Failure ( "REMOTECRAWLTRIGGER: REMOTE CRAWL TO PEER " + remoteSeed . getName ( ) + " FAILED. CLIENT RETURNED: " + page . toString ( ) , e ) ;
return false ;
}
}
@ -1277,19 +1277,19 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/**
 * Runs one presearch job: looks up the query hashes in the local index,
 * orders the hits, and asks the snippet cache to fetch snippets for them.
 *
 * Uses queryhashes, time, stopwords, order, searchcount, urlmask and
 * fetchcount, none of which are declared inside this method.
 *
 * An IOException raised by any step is caught and logged; it is not rethrown.
 *
 * NOTE(review): every log statement below appears twice (logDebug/logFine,
 * logError/logFailure). This looks like the removed/added line pair of a
 * patch hunk whose +/- markers were lost - confirm which variant is current
 * before treating this as compilable source.
 */
public void run ( ) {
try {
// search the database locally
log . log Debug ( "presearch: started job" ) ;
log . log Fine ( "presearch: started job" ) ;
plasmaWordIndexEntity idx = searchManager . searchHashes ( queryhashes , time ) ;
log . log Debug ( "presearch: found " + idx . size ( ) + " results" ) ;
log . log Fine ( "presearch: found " + idx . size ( ) + " results" ) ;
// order the hits; the same time value is passed again as the budget argument
plasmaSearch . result acc = searchManager . order ( idx , queryhashes , stopwords , order , time , searchcount ) ;
// early exit: with no ordered result there is nothing to fetch; note that this
// return also skips the "job terminated" log line after the try/catch
if ( acc = = null ) return ;
log . log Debug ( "presearch: ordered results, now " + acc . sizeOrdered ( ) + " URLs ready for fetch" ) ;
log . log Fine ( "presearch: ordered results, now " + acc . sizeOrdered ( ) + " URLs ready for fetch" ) ;
// take some elements and fetch the snippets
snippetCache . fetch ( acc , queryhashes , urlmask , fetchcount ) ;
} catch ( IOException e ) {
// failure is only logged; execution continues to the termination log below
log . log Error ( "presearch: failed" , e ) ;
log . log Failure ( "presearch: failed" , e ) ;
}
log . log Debug ( "presearch: job terminated" ) ;
log . log Fine ( "presearch: job terminated" ) ;
}
}
@ -1335,7 +1335,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if ( fetchpeers < 10 ) fetchpeers = 10 ;
if ( fetchcount > count * 10 ) fetchcount = count * 10 ;
globalresults = yacySearch . searchHashes ( queryhashes , urlPool . loadedURL , searchManager , fetchcount , fetchpeers , urlBlacklist , snippetCache , fetchtime ) ;
log . log Debug ( "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
log . log Fine ( "SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
}
prop . put ( "globalresults" , globalresults ) ; // the result are written to the local DB
@ -1343,14 +1343,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// now search locally (the global results should be now in the local db)
long remainingTime = time - ( System . currentTimeMillis ( ) - timestamp ) ;
plasmaWordIndexEntity idx = searchManager . searchHashes ( queryhashes , remainingTime * 8 / 10 ) ; // the search
log . log Debug ( "SEARCH TIME AFTER FINDING " + idx . size ( ) + " ELEMENTS: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
log . log Fine ( "SEARCH TIME AFTER FINDING " + idx . size ( ) + " ELEMENTS: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
remainingTime = time - ( System . currentTimeMillis ( ) - timestamp ) ;
if ( remainingTime < 500 ) remainingTime = 500 ;
if ( remainingTime > 3000 ) remainingTime = 3000 ;
plasmaSearch . result acc = searchManager . order ( idx , queryhashes , stopwords , order , remainingTime , 10 ) ;
if ( ! ( global ) ) snippetCache . fetch ( acc . cloneSmart ( ) , queryhashes , urlmask , 10 ) ;
log . log Debug ( "SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
log . log Fine ( "SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
// result is a List of urlEntry elements: prepare answer
if ( acc = = null ) {
@ -1427,7 +1427,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
}
log . log Debug ( "SEARCH TIME AFTER RESULT PREPARATION: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
log . log Fine ( "SEARCH TIME AFTER RESULT PREPARATION: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
// calc some more cross-reference
remainingTime = time - ( System . currentTimeMillis ( ) - timestamp ) ;
@ -1444,7 +1444,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* * /
//Object[] ws = ref.getScores(16, false, 2, Integer.MAX_VALUE);
Object [ ] ws = acc . getReferences ( 16 ) ;
log . log Debug ( "SEARCH TIME AFTER XREF PREPARATION: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
log . log Fine ( "SEARCH TIME AFTER XREF PREPARATION: " + ( ( System . currentTimeMillis ( ) - timestamp ) / 1000 ) + " seconds" ) ;
/ *
System . out . print ( "DEBUG WORD-SCORE: " ) ;