@@ -4,7 +4,10 @@
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 31.08.2010
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
@@ -63,6 +66,26 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
    private Pattern mustmatch = null, mustnotmatch = null;
    /**
     * Constructor which creates a CrawlProfile from parameters.
     * @param name name of the crawl profile
     * @param startURL root URL of the crawl
     * @param mustmatch URLs which do not match this regex will be ignored
     * @param mustnotmatch URLs which match this regex will be ignored
     * @param depth height of the tree which will be created by the crawler
     * @param recrawlIfOlder documents which have been indexed in the past will
     *        be indexed again if they are older than the time (ms) in this parameter
     * @param domMaxPages maximum number of pages from one domain which will be indexed
     * @param crawlingQ true if URLs containing question marks shall be indexed
     * @param indexText true if text content of the URL shall be indexed
     * @param indexMedia true if media content of the URL shall be indexed
     * @param storeHTCache true if content shall be kept in the cache after indexing
     * @param remoteIndexing true if part of the crawl job shall be distributed
     * @param xsstopw true if static stop words shall be ignored
     * @param xdstopw true if dynamic stop words shall be ignored
     * @param xpstopw true if parent stop words shall be ignored
     * @param cacheStrategy determines if and how the cache is used when loading content
     */
    public CrawlProfile(
                 final String name,
                 final DigestURI startURL,
@@ -81,8 +104,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                 final boolean xpstopw,
                 final CacheStrategy cacheStrategy) {
        super(40);
        if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
        final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : ASCII.String(startURL.hash());
        if (name == null || name.isEmpty()) {
            throw new NullPointerException("name must not be null or empty");
        }
        final String handle = (startURL == null)
                ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength)
                : ASCII.String(startURL.hash());
        put(HANDLE, handle);
        put(NAME, name);
        put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
@@ -102,37 +129,75 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        put(CACHE_STRAGEGY, cacheStrategy.toString());
    }
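    // A minimal usage sketch (illustrative, not part of this change): building a
    // profile for a depth-2 crawl. The middle parameters are elided by this hunk,
    // so the exact order shown here is an assumption based on the Javadoc above.
    //
    //   CrawlProfile profile = new CrawlProfile(
    //           "example-crawl",          // name
    //           startUrl,                 // DigestURI of the crawl root
    //           ".*",                     // mustmatch: accept every URL
    //           "",                       // mustnotmatch: exclude nothing
    //           2,                        // depth
    //           24L * 60 * 60 * 1000,     // recrawlIfOlder: one day in ms
    //           -1,                       // domMaxPages: no per-domain limit
    //           false,                    // crawlingQ
    //           true, true,               // indexText, indexMedia
    //           false, false,             // storeHTCache, remoteIndexing
    //           false, false, false,      // xsstopw, xdstopw, xpstopw
    //           CacheStrategy.IFEXIST);   // cacheStrategy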
    /**
     * Constructor which creates a CrawlProfile from values in a Map.
     * @param ext contains values
     */
    public CrawlProfile(final Map<String, String> ext) {
        super(ext == null ? 1 : ext.size());
        if (ext != null) putAll(ext);
    }
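    // A short sketch of the Map-based constructor, e.g. when a profile is restored
    // from its serialized key/value form (the concrete key names are assumptions):
    //
    //   Map<String, String> ext = new HashMap<String, String>();
    //   ext.put("name", "restored-crawl");
    //   ext.put("generalDepth", "2");
    //   CrawlProfile restored = new CrawlProfile(ext);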
    public void put(final String key, final boolean value) {
    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    public final void put(final String key, final boolean value) {
        super.put(key, Boolean.toString(value));
    }
    public void put(final String key, final int value) {
    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    public final void put(final String key, final int value) {
        super.put(key, Integer.toString(value));
    }
    public void put(final String key, final long value) {
    /**
     * Adds a parameter to the CrawlProfile.
     * @param key name of the parameter
     * @param value value of the parameter
     */
    public final void put(final String key, final long value) {
        super.put(key, Long.toString(value));
    }
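    // Note that the typed put() overloads above only convert the value to a String
    // and delegate to the inherited ConcurrentHashMap.put(), so every parameter is
    // stored uniformly as a string. A brief illustration:
    //
    //   profile.put(CRAWLING_Q, false);   // stored as "false"
    //   profile.put(DEPTH, 2);            // stored as "2"
    //   profile.get(DEPTH);               // yields the String "2"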
    /**
     * Gets the handle of the CrawlProfile.
     * @return handle of the profile
     */
    public String handle() {
        final String r = get(HANDLE);
        //if (r == null) return null;
        return r;
    }
    /**
     * Gets the name of the CrawlProfile.
     * @return name of the profile
     */
    public String name() {
        final String r = get(NAME);
        if (r == null) return "";
        return r;
    }
    /**
     * Gets the root URL of the crawl job.
     * @return root URL
     */
    public String startURL() {
        final String r = get(START_URL);
        return r;
    }
    /**
     * Gets the regex which must be matched by URLs in order to be crawled.
     * @return regex which must be matched
     */
    public Pattern mustMatchPattern() {
        if (this.mustmatch == null) {
            String r = get(FILTER_MUSTMATCH);
@@ -141,6 +206,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        }
        return this.mustmatch;
    }
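    // The must-match pattern is compiled lazily on first access and cached in the
    // mustmatch field. A sketch of how a caller might filter a candidate URL with
    // it (url being a DigestURI; purely illustrative):
    //
    //   if (profile.mustMatchPattern().matcher(url.toNormalform(true, false)).matches()) {
    //       // URL passes the must-match filter and may be crawled
    //   }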
    /**
     * Gets the regex which must not be matched by URLs in order to be crawled.
     * @return regex which must not be matched
     */
    public Pattern mustNotMatchPattern() {
        if (this.mustnotmatch == null) {
            String r = get(FILTER_MUSTNOTMATCH);
@@ -149,6 +219,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        }
        return this.mustnotmatch;
    }
    /**
     * Gets the depth of the crawl job (i.e. the height of the tree which will
     * be created by the crawler).
     * @return depth of the crawl job
     */
    public int depth() {
        final String r = get(DEPTH);
        if (r == null) return 0;
@@ -159,6 +235,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return 0;
        }
    }
    public CacheStrategy cacheStrategy() {
        final String r = get(CACHE_STRAGEGY);
        if (r == null) return CacheStrategy.IFEXIST;
@@ -169,9 +246,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return CacheStrategy.IFEXIST;
        }
    }
    public void setCacheStrategy(final CacheStrategy newStrategy) {
        put(CACHE_STRAGEGY, newStrategy.toString());
    }
    /**
     * Gets the minimum age that an entry must have to be re-crawled.
     * @return time in ms
     */
    public long recrawlIfOlder() {
        // returns a long (millis) that is the minimum age that
        // an entry must have to be re-crawled
@@ -185,6 +268,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return 0L;
        }
    }
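    // Since recrawlIfOlder() is an age in milliseconds, a caller would typically
    // derive a cutoff time and re-index documents indexed before it; a sketch
    // (getIndexDate() is a hypothetical accessor on an indexed document):
    //
    //   long cutoff = System.currentTimeMillis() - profile.recrawlIfOlder();
    //   boolean recrawl = doc.getIndexDate().getTime() < cutoff;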
    public int domMaxPages() {
        // this is the maximum number of pages that are crawled for a single domain
        // if -1, this means no limit
@@ -199,26 +283,31 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
            return Integer.MAX_VALUE;
        }
    }
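    // The comment above says -1 means "no limit", and the visible fallback path
    // returns Integer.MAX_VALUE, so callers can compare a per-domain page counter
    // against domMaxPages() without a special no-limit case:
    //
    //   if (pagesLoadedForDomain < profile.domMaxPages()) { /* keep crawling this domain */ }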
    public boolean crawlingQ() {
        final String r = get(CRAWLING_Q);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean pushSolr() {
        final String r = get(PUSH_SOLR);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean indexText() {
        final String r = get(INDEX_TEXT);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean indexMedia() {
        final String r = get(INDEX_MEDIA);
        if (r == null) return true;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean storeHTCache() {
        final String r = get(STORE_HTCACHE);
        if (r == null) return false;
@@ -229,16 +318,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean excludeStaticStopwords() {
        final String r = get(XSSTOPW);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean excludeDynamicStopwords() {
        final String r = get(XDSTOPW);
        if (r == null) return false;
        return (r.equals(Boolean.TRUE.toString()));
    }
    public boolean excludeParentStopwords() {
        final String r = get(XPSTOPW);
        if (r == null) return false;