@ -63,7 +63,8 @@ public final class CrawlSwitchboard {
// Recrawl cycle constant for the surrogate profile; it is handed to
// CrawlProfile.getRecrawlDate(...) where the default surrogate profile is created.
// NOTE(review): 60 * 24 * 30 reads like "minutes in 30 days" — unit not shown here, confirm.
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L ;
private final Log log ;
// Profile stores: key is a profile handle as bytes, value is the profile's String -> String map.
// NOTE(review): the next three lines declare the same three maps twice (one combined
// declaration, then a split one where only the passive/invalid maps are final). This looks
// like the before- and after-version of a change both being present — keep only one.
private Map < byte [ ] , Map < String , String > > profilesActiveCrawls , profilesPassiveCrawls , profilesInvalidCrawls ;
private Map < byte [ ] , Map < String , String > > profilesActiveCrawls ;
private final Map < byte [ ] , Map < String , String > > profilesPassiveCrawls , profilesInvalidCrawls ;
// Default profiles, one per crawl purpose; they are created lazily when found to be null.
public CrawlProfile defaultProxyProfile ;
public CrawlProfile defaultRemoteProfile ;
public CrawlProfile defaultTextSnippetLocalProfile , defaultTextSnippetGlobalProfile ;
@ -97,18 +98,18 @@ public final class CrawlSwitchboard {
// Validate every profile currently stored in the active-crawls map: each stored map is
// wrapped in a CrawlProfile and its must-match / must-not-match filter strings are checked
// with RegexHelper.isValidRegex. A profile with an invalid filter is taken out of the active
// set (removeActive), stored via putInvalid, and a warning is logged; otherwise an info line
// is logged.
// NOTE(review): removeActive(handle) is called while iterating profilesActiveCrawls.keySet();
// whether that is safe depends on the map implementation and on what removeActive does — confirm.
// NOTE(review): this span holds two variants of each condition and log line (FILTER_MUSTMATCH /
// FILTER_MUSTNOTMATCH vs. FILTER_URL_MUSTMATCH / FILTER_URL_MUSTNOTMATCH, with and without
// "this."). It looks like old and new lines of a change interleaved — only one set should remain.
for ( final byte [ ] handle : this . profilesActiveCrawls . keySet ( ) ) {
final CrawlProfile p ;
p = new CrawlProfile ( this . profilesActiveCrawls . get ( handle ) ) ;
// must-match filter is not a valid regex: deactivate the profile and keep it as invalid
if ( ! RegexHelper . isValidRegex ( p . get ( CrawlProfile . FILTER_ MUSTMATCH) ) ) {
this . removeActive( handle ) ;
this . putInvalid( handle , p ) ;
if ( ! RegexHelper . isValidRegex ( p . get ( CrawlProfile . FILTER_ URL_ MUSTMATCH) ) ) {
removeActive( handle ) ;
putInvalid( handle , p ) ;
Log . logWarning ( "CrawlProfiles" , "removed Profile " + p . handle ( ) + ": " + p . name ( )
+ " from active crawls since " + CrawlProfile . FILTER_ MUSTMATCH
+ " is no valid regular expression: " + p . get ( CrawlProfile . FILTER_ MUSTMATCH) ) ;
// must-not-match filter is not a valid regex: same handling, but here putInvalid runs
// before removeActive (the branch above uses the opposite order)
} else if ( ! RegexHelper . isValidRegex ( p . get ( CrawlProfile . FILTER_ MUSTNOTMATCH) ) ) {
this . putInvalid( handle , p ) ;
this . removeActive( handle ) ;
+ " from active crawls since " + CrawlProfile . FILTER_ URL_ MUSTMATCH
+ " is no valid regular expression: " + p . get ( CrawlProfile . FILTER_ URL_ MUSTMATCH) ) ;
} else if ( ! RegexHelper . isValidRegex ( p . get ( CrawlProfile . FILTER_ URL_ MUSTNOTMATCH) ) ) {
putInvalid( handle , p ) ;
removeActive( handle ) ;
Log . logWarning ( "CrawlProfiles" , "removed Profile " + p . handle ( ) + ": " + p . name ( )
+ " from active crawls since " + CrawlProfile . FILTER_ MUSTNOTMATCH
+ " is no valid regular expression: " + p . get ( CrawlProfile . FILTER_ MUSTNOTMATCH) ) ;
+ " from active crawls since " + CrawlProfile . FILTER_ URL_ MUSTNOTMATCH
+ " is no valid regular expression: " + p . get ( CrawlProfile . FILTER_ URL_ MUSTNOTMATCH) ) ;
} else {
// both filters passed the regex check: the profile stays active
Log . logInfo ( "CrawlProfiles" , "loaded Profile " + p . handle ( ) + ": " + p . name ( ) ) ;
}
@ -227,7 +228,10 @@ public final class CrawlSwitchboard {
if ( this . defaultProxyProfile = = null ) {
// generate new default entry for proxy crawling
this . defaultProxyProfile = new CrawlProfile (
"proxy" , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER ,
"proxy" , null ,
CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER ,
CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER ,
"" ,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/ ,
CrawlProfile . getRecrawlDate ( CRAWL_PROFILE_PROXY_RECRAWL_CYCLE ) , - 1 , false ,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/ ,
@ -239,38 +243,38 @@ public final class CrawlSwitchboard {
}
// Create the default profile for remote crawling if none is set yet, and register it in the
// active-crawls map under the UTF-8 bytes of its handle.
if ( this . defaultRemoteProfile = = null ) {
// generate new default entry for remote crawling
// NOTE(review): the next two lines both start the same assignment; they look like the
// before/after versions of one change — only one should remain.
// NOTE(review): in the longer variant the filter arguments run MATCH_ALL, MATCH_ALL,
// MATCH_NEVER, "", MATCH_NEVER, whereas the other default profiles in this file pass
// MATCH_ALL, MATCH_NEVER, MATCH_ALL, MATCH_NEVER, "" — confirm against the CrawlProfile
// constructor signature.
this . defaultRemoteProfile = new CrawlProfile ( CRAWL_PROFILE_REMOTE , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_ NEVER, 0 ,
this . defaultRemoteProfile = new CrawlProfile ( CRAWL_PROFILE_REMOTE , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_ ALL, CrawlProfile . MATCH_NEVER , "" , CrawlProfile . MATCH_ NEVER, 0 ,
- 1 , - 1 , true , true , true , false , false , true , true , false , CacheStrategy . IFFRESH ) ;
this . profilesActiveCrawls . put ( UTF8 . getBytes ( this . defaultRemoteProfile . handle ( ) ) , this . defaultRemoteProfile ) ;
}
// Create the default profile for local text snippets if none is set yet, and register it in
// the active-crawls map under the UTF-8 bytes of its handle.
if ( this . defaultTextSnippetLocalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
// NOTE(review): the next two lines both start the same assignment (without and with the
// extra MATCH_ALL, MATCH_NEVER, "" arguments); they look like the before/after versions of
// one change — only one should remain.
this . defaultTextSnippetLocalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , 0 ,
this . defaultTextSnippetLocalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , "" , 0 ,
CrawlProfile . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE ) , - 1 , true , false , false , true , false , true , true , false , CacheStrategy . IFEXIST ) ;
this . profilesActiveCrawls . put ( UTF8 . getBytes ( this . defaultTextSnippetLocalProfile . handle ( ) ) , this . defaultTextSnippetLocalProfile ) ;
}
// Create the default profile for global text snippets if none is set yet, and register it in
// the active-crawls map under the UTF-8 bytes of its handle.
if ( this . defaultTextSnippetGlobalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
// NOTE(review): the next two lines both start the same assignment (without and with the
// extra MATCH_ALL, MATCH_NEVER, "" arguments); they look like the before/after versions of
// one change — only one should remain.
this . defaultTextSnippetGlobalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , 0 ,
this . defaultTextSnippetGlobalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , "" , 0 ,
CrawlProfile . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE ) , - 1 , true , true , true , true , false , true , true , false , CacheStrategy . IFEXIST ) ;
this . profilesActiveCrawls . put ( UTF8 . getBytes ( this . defaultTextSnippetGlobalProfile . handle ( ) ) , this . defaultTextSnippetGlobalProfile ) ;
}
// Runs outside the null check, so the cache strategy is set to IFEXIST on every pass,
// including for a profile that already existed before this code ran.
this . defaultTextSnippetGlobalProfile . setCacheStrategy ( CacheStrategy . IFEXIST ) ;
// Create the default profile for local media snippets if none is set yet, and register it in
// the active-crawls map under the UTF-8 bytes of its handle.
if ( this . defaultMediaSnippetLocalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
// NOTE(review): the next two lines both start the same assignment (without and with the
// extra MATCH_ALL, MATCH_NEVER, "" arguments); they look like the before/after versions of
// one change — only one should remain.
this . defaultMediaSnippetLocalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , 0 ,
this . defaultMediaSnippetLocalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , "" , 0 ,
CrawlProfile . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE ) , - 1 , true , false , false , true , false , true , true , false , CacheStrategy . IFEXIST ) ;
this . profilesActiveCrawls . put ( UTF8 . getBytes ( this . defaultMediaSnippetLocalProfile . handle ( ) ) , this . defaultMediaSnippetLocalProfile ) ;
}
// Create the default profile for global media snippets if none is set yet, and register it in
// the active-crawls map under the UTF-8 bytes of its handle.
if ( this . defaultMediaSnippetGlobalProfile = = null ) {
// generate new default entry for snippet fetch and optional crawling
// NOTE(review): the next two lines both start the same assignment (without and with the
// extra MATCH_ALL, MATCH_NEVER, "" arguments); they look like the before/after versions of
// one change — only one should remain.
this . defaultMediaSnippetGlobalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , 0 ,
this . defaultMediaSnippetGlobalProfile = new CrawlProfile ( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , "" , 0 ,
CrawlProfile . getRecrawlDate ( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE ) , - 1 , true , false , true , true , false , true , true , false , CacheStrategy . IFEXIST ) ;
this . profilesActiveCrawls . put ( UTF8 . getBytes ( this . defaultMediaSnippetGlobalProfile . handle ( ) ) , this . defaultMediaSnippetGlobalProfile ) ;
}
// Create the default profile for surrogate parsing if none is set yet, and register it in the
// active-crawls map under the UTF-8 bytes of its handle. Unlike the snippet profiles above,
// this one is constructed with CacheStrategy.NOCACHE.
if ( this . defaultSurrogateProfile = = null ) {
// generate new default entry for surrogate parsing
// NOTE(review): the next two lines both start the same assignment (without and with the
// extra MATCH_ALL, MATCH_NEVER, "" arguments); they look like the before/after versions of
// one change — only one should remain.
this . defaultSurrogateProfile = new CrawlProfile ( CRAWL_PROFILE_SURROGATE , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , 0 ,
this . defaultSurrogateProfile = new CrawlProfile ( CRAWL_PROFILE_SURROGATE , null , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , CrawlProfile . MATCH_ALL , CrawlProfile . MATCH_NEVER , "" , 0 ,
CrawlProfile . getRecrawlDate ( CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE ) , - 1 , true , true , false , false , false , true , true , false , CacheStrategy . NOCACHE ) ;
this . profilesActiveCrawls . put ( UTF8 . getBytes ( this . defaultSurrogateProfile . handle ( ) ) , this . defaultSurrogateProfile ) ;
}