@@ -4,7 +4,7 @@
 // (C) by Michael Peter Christen; mc@yacy.net
 // first published on http://www.anomic.de
 // Frankfurt, Germany, 2004
-// last major change: 25.02.2004
+// last major change: 31.08.2010
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
@@ -22,263 +22,25 @@
 package de.anomic.crawler;

-import java.io.File;
-import java.io.IOException;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;

-import net.yacy.kelondro.blob.MapHeap;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
-import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
-import net.yacy.kelondro.order.CloneableIterator;
 import net.yacy.kelondro.order.Digest;
-import net.yacy.kelondro.order.NaturalOrder;
-import net.yacy.kelondro.util.FileUtils;
-import net.yacy.kelondro.util.kelondroException;

-public class CrawlProfile {
+public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
+
+    private static final long serialVersionUID = 5527325718810703504L;

     public static final String MATCH_ALL = ".*";
     public static final String MATCH_NEVER = "";

-    static ConcurrentHashMap<String, Map<String, DomProfile>> domsCache = new ConcurrentHashMap<String, Map<String, DomProfile>>();
-
-    MapHeap profileTable;
-    private final File profileTableFile;
-
-    public CrawlProfile(final File file) throws IOException {
-        //System.out.println("loading crawl profile from " + file);
-        this.profileTableFile = file;
-        profileTableFile.getParentFile().mkdirs();
-        profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
-        profileIterator pi = new profileIterator(true);
-        entry e;
-        while (pi.hasNext()) {
-            e = pi.next();
-            if (e == null) continue;
-            Log.logInfo("CrawlProfiles", "loaded Profile " + e.handle() + ": " + e.name());
-        }
-    }
-
-    public void clear() {
-        // deletes the profile database and creates a new one
-        if (profileTable != null) profileTable.close();
-        FileUtils.deletedelete(profileTableFile);
-        profileTableFile.getParentFile().mkdirs();
-        try {
-            profileTable = new MapHeap(profileTableFile, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
-        } catch (IOException e) {
-            Log.logException(e);
-        }
-    }
-
-    public void close() {
-        if (profileTable != null) profileTable.close();
-        this.profileTable = null;
-    }
-
-    public int size() {
-        return profileTable.size();
-    }
-
-    public Iterator<entry> profiles(final boolean up) {
-        // enumerates profile entries
-        try {
-            return new profileIterator(up);
-        } catch (final IOException e) {
-            Log.logException(e);
-            return new HashSet<entry>().iterator();
-        }
-    }
-
-    public class profileIterator implements Iterator<entry> {
-        // the iterator iterates over all keys, which are byte[] objects
-        CloneableIterator<byte[]> handleIterator;
-        String lastkey;
-
-        public profileIterator(final boolean up) throws IOException {
-            handleIterator = profileTable.keys(up, false);
-            lastkey = null;
-        }
-
-        public boolean hasNext() {
-            try {
-                return handleIterator.hasNext();
-            } catch (final kelondroException e) {
-                Log.logException(e);
-                clear();
-                return false;
-            }
-        }
-
-        public entry next() {
-            try {
-                lastkey = new String(handleIterator.next());
-                return getEntry(lastkey);
-            } catch (final kelondroException e) {
-                Log.logException(e);
-                clear();
-                return null;
-            }
-        }
-
-        public void remove() {
-            if (lastkey != null) try {
-                removeEntry(lastkey.getBytes());
-            } catch (final kelondroException e) {
-                Log.logException(e);
-                clear();
-            }
-        }
-    }
-
-    public void removeEntry(final byte[] handle) {
-        try {
-            profileTable.delete(handle);
-        } catch (final IOException e) {
-            Log.logException(e);
-        }
-    }
-
-    public entry newEntry(final Map<String, String> mem) {
-        final entry ne = new entry(mem);
-        try {
-            profileTable.insert(ne.handle().getBytes(), ne.map());
-        } catch (final Exception e) {
-            clear();
-            try {
-                profileTable.insert(ne.handle().getBytes(), ne.map());
-            } catch (final Exception ee) {
-                Log.logException(e);
-                System.exit(0);
-            }
-        }
-        return ne;
-    }
-
-    public entry newEntry(final String name,
-                          final DigestURI startURL,
-                          final String mustmatch, final String mustnotmatch,
-                          final int generalDepth,
-                          final long recrawlIfOlder /*date*/, final int domFilterDepth, final int domMaxPages,
-                          final boolean crawlingQ,
-                          final boolean indexText, final boolean indexMedia,
-                          final boolean storeHTCache, final boolean storeTXCache,
-                          final boolean remoteIndexing,
-                          final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
-                          final CacheStrategy cacheStrategy) {
-        final entry ne = new entry(
-                name, startURL,
-                mustmatch, mustnotmatch,
-                generalDepth,
-                recrawlIfOlder, domFilterDepth, domMaxPages,
-                crawlingQ,
-                indexText, indexMedia,
-                storeHTCache, storeTXCache,
-                remoteIndexing,
-                xsstopw, xdstopw, xpstopw,
-                cacheStrategy);
-        try {
-            profileTable.insert(ne.handle().getBytes(), ne.map());
-        } catch (final Exception e) {
-            clear();
-            try {
-                profileTable.insert(ne.handle().getBytes(), ne.map());
-            } catch (final Exception ee) {
-                Log.logException(e);
-                System.exit(0);
-            }
-        }
-        return ne;
-    }
-
-    public boolean hasEntry(final String handle) {
-        return profileTable.containsKey(handle.getBytes());
-    }
-
-    public entry getEntry(final String handle) {
-        if (profileTable == null) return null;
-        Map<String, String> m;
-        try {
-            m = profileTable.get(handle.getBytes());
-        } catch (final IOException e) {
-            Log.logException(e);
-            return null;
-        } catch (RowSpaceExceededException e) {
-            Log.logException(e);
-            return null;
-        }
-        if (m == null) return null;
-        return new entry(m);
-    }
-
-    public void changeEntry(final entry e, final String propName, final String newValue) throws IOException, RowSpaceExceededException {
-        e.mem.put(propName, newValue);
-        assert e.handle() != null;
-        profileTable.insert(e.handle().getBytes(), e.mem);
-    }
-
-    public long getRecrawlDate(final long oldTimeMinutes) {
-        return System.currentTimeMillis() - (60000L * oldTimeMinutes);
-    }
-
-    public static class DomProfile {
-        public String referrer;
-        public int depth, count;
-
-        public DomProfile(final String ref, final int d) {
-            this.referrer = ref;
-            this.depth = d;
-            this.count = 1;
-        }
-
-        public void inc() {
-            this.count++;
-        }
-    }
-
-    public static enum CacheStrategy {
-        NOCACHE(0),   // never use the cache, fetch all content fresh from the internet
-        IFFRESH(1),   // use the cache if the cache exists and is fresh according to the proxy-fresh rules
-        IFEXIST(2),   // use the cache if the cache exists; do not check freshness, otherwise use the online source
-        CACHEONLY(3); // never go online, use all content from the cache; if no cache exists, treat content as unavailable
-        public int code;
-
-        private CacheStrategy(int code) {
-            this.code = code;
-        }
-
-        public String toString() {
-            return Integer.toString(this.code);
-        }
-
-        public static CacheStrategy decode(int code) {
-            for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
-            return NOCACHE;
-        }
-
-        public static CacheStrategy parse(String name) {
-            if (name.equals("nocache")) return NOCACHE;
-            if (name.equals("iffresh")) return IFFRESH;
-            if (name.equals("ifexist")) return IFEXIST;
-            if (name.equals("cacheonly")) return CACHEONLY;
-            return null;
-        }
-
-        public String toName() {
-            return this.name().toLowerCase();
-        }
-
-        public boolean isAllowedToFetchOnline() {
-            return this.code < 3;
-        }
-
-        public boolean mustBeOffline() {
-            return this.code == 3;
-        }
-    }
-
-    public static class entry {
-
     // this is a simple record structure that holds all properties of a single crawl start
     public static final String HANDLE = "handle";
     public static final String NAME = "name";
     public static final String START_URL = "startURL";
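
Net effect of this hunk: CrawlProfile no longer manages a MapHeap of profiles itself; the class is now the ConcurrentHashMap<String, String> that previously hid behind entry.mem, so a profile can be handed directly to any map-based store. A minimal caller-side sketch, reusing only the MapHeap calls visible in the removed code above (insert(byte[], Map) and get(byte[])) and the CrawlProfile(Map) constructor introduced below; where the persistence logic actually moved is outside this diff:

    // store: a CrawlProfile is itself a Map<String, String>
    void storeProfile(final MapHeap heap, final CrawlProfile p) throws IOException {
        heap.insert(p.handle().getBytes(), p);
    }

    // load: the new CrawlProfile(Map<String, String>) constructor rebuilds the profile
    CrawlProfile loadProfile(final MapHeap heap, final String handle) throws IOException, RowSpaceExceededException {
        final Map<String, String> m = heap.get(handle.getBytes());
        return (m == null) ? null : new CrawlProfile(m);
    }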
@@ -299,12 +61,11 @@ public class CrawlProfile {
     public static final String XPSTOPW = "xpstopw";
     public static final String CACHE_STRAGEGY = "cacheStrategy";

-    private Map<String, String> mem;
     private Map<String, DomProfile> doms;
     private Pattern mustmatch = null, mustnotmatch = null;

-    public entry(final String name, final DigestURI startURL,
+    public CrawlProfile(final String name, final DigestURI startURL,
                  final String mustmatch,
                  final String mustnotmatch,
                  final int depth,
@@ -316,83 +77,81 @@ public class CrawlProfile {
                  final boolean remoteIndexing,
                  final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
                  final CacheStrategy cacheStrategy) {
+        super(40);
         if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
         final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : new String(startURL.hash());
-        mem = new ConcurrentHashMap<String, String>(40);
-        mem.put(HANDLE, handle);
-        mem.put(NAME, name);
-        mem.put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
-        mem.put(FILTER_MUSTMATCH, (mustmatch == null) ? MATCH_ALL : mustmatch);
-        mem.put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? MATCH_NEVER : mustnotmatch);
-        mem.put(DEPTH, Integer.toString(depth));
-        mem.put(RECRAWL_IF_OLDER, Long.toString(recrawlIfOlder));
-        mem.put(DOM_FILTER_DEPTH, Integer.toString(domFilterDepth));
-        mem.put(DOM_MAX_PAGES, Integer.toString(domMaxPages));
-        mem.put(CRAWLING_Q, Boolean.toString(crawlingQ)); // crawling of urls with '?'
-        mem.put(INDEX_TEXT, Boolean.toString(indexText));
-        mem.put(INDEX_MEDIA, Boolean.toString(indexMedia));
-        mem.put(STORE_HTCACHE, Boolean.toString(storeHTCache));
-        mem.put(STORE_TXCACHE, Boolean.toString(storeTXCache));
-        mem.put(REMOTE_INDEXING, Boolean.toString(remoteIndexing));
-        mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
-        mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-words
-        mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
-        mem.put(CACHE_STRAGEGY, cacheStrategy.toString());
+        put(HANDLE, handle);
+        put(NAME, name);
+        put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
+        put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
+        put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
+        put(DEPTH, depth);
+        put(RECRAWL_IF_OLDER, recrawlIfOlder);
+        put(DOM_FILTER_DEPTH, domFilterDepth);
+        put(DOM_MAX_PAGES, domMaxPages);
+        put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
+        put(INDEX_TEXT, indexText);
+        put(INDEX_MEDIA, indexMedia);
+        put(STORE_HTCACHE, storeHTCache);
+        put(STORE_TXCACHE, storeTXCache);
+        put(REMOTE_INDEXING, remoteIndexing);
+        put(XSSTOPW, xsstopw); // exclude static stop-words
+        put(XDSTOPW, xdstopw); // exclude dynamic stop-words
+        put(XPSTOPW, xpstopw); // exclude parent stop-words
+        put(CACHE_STRAGEGY, cacheStrategy.toString());
         doms = new ConcurrentHashMap<String, DomProfile>();
     }

-    @Override
-    public String toString() {
-        final StringBuilder str = new StringBuilder();
-        if (this.mem != null) {
-            str.append(this.mem.toString());
-        }
-        return str.toString();
-    }
-
-    public entry(final Map<String, String> mem) {
-        this.mem = mem;
-        this.doms = domsCache.get(this.mem.get(HANDLE));
-        if (this.doms == null) this.doms = new ConcurrentHashMap<String, DomProfile>();
-    }
-
-    public Map<String, String> map() {
-        return mem;
-    }
+    public CrawlProfile(Map<String, String> ext) {
+        super(ext == null ? 1 : ext.size());
+        if (ext != null) this.putAll(ext);
+        doms = new ConcurrentHashMap<String, DomProfile>();
+    }
+
+    public void put(String key, boolean value) {
+        super.put(key, Boolean.toString(value));
+    }
+
+    public void put(String key, int value) {
+        super.put(key, Integer.toString(value));
+    }
+
+    public void put(String key, long value) {
+        super.put(key, Long.toString(value));
+    }

     public String handle() {
-        final String r = mem.get(HANDLE);
+        final String r = get(HANDLE);
         //if (r == null) return null;
         return r;
     }

     public String name() {
-        final String r = mem.get(NAME);
+        final String r = get(NAME);
         if (r == null) return "";
         return r;
     }

     public String startURL() {
-        final String r = mem.get(START_URL);
+        final String r = get(START_URL);
         return r;
     }

     public Pattern mustMatchPattern() {
         if (this.mustmatch == null) {
-            String r = mem.get(FILTER_MUSTMATCH);
-            if (r == null) r = MATCH_ALL;
+            String r = get(FILTER_MUSTMATCH);
+            if (r == null) r = CrawlProfile.MATCH_ALL;
             this.mustmatch = Pattern.compile(r);
         }
         return this.mustmatch;
     }

     public Pattern mustNotMatchPattern() {
         if (this.mustnotmatch == null) {
-            String r = mem.get(FILTER_MUSTNOTMATCH);
-            if (r == null) r = MATCH_NEVER;
+            String r = get(FILTER_MUSTNOTMATCH);
+            if (r == null) r = CrawlProfile.MATCH_NEVER;
             this.mustnotmatch = Pattern.compile(r);
         }
         return this.mustnotmatch;
     }

     public int depth() {
-        final String r = mem.get(DEPTH);
+        final String r = get(DEPTH);
         if (r == null) return 0;
         try {
             return Integer.parseInt(r);
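
The typed put() overloads above mean every primitive is stored in its string form and parsed back by the accessors. A small round-trip sketch (the profile name and parameter values are made up, and it assumes the DEPTH key constant is public like the HANDLE/NAME constants shown earlier):

    final CrawlProfile p = new CrawlProfile(
            "example", null,                 // name; startURL may be null, then the handle is derived from the name
            CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
            3,                               // depth
            0L, -1, -1,                      // recrawlIfOlder, domFilterDepth, domMaxPages
            false, true, true, false, false, // crawlingQ, indexText, indexMedia, storeHTCache, storeTXCache
            false, false, false, false,      // remoteIndexing, xsstopw, xdstopw, xpstopw
            CacheStrategy.IFFRESH);
    assert "3".equals(p.get(CrawlProfile.DEPTH)); // stored as a string via put(String, int)
    assert p.depth() == 3;                        // parsed back on read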
@@ -402,7 +161,7 @@ public class CrawlProfile {
         }
     }

     public CacheStrategy cacheStrategy() {
-        final String r = mem.get(CACHE_STRAGEGY);
+        final String r = get(CACHE_STRAGEGY);
         if (r == null) return CacheStrategy.IFFRESH;
         try {
             return CacheStrategy.decode(Integer.parseInt(r));
@@ -412,12 +171,12 @@ public class CrawlProfile {
         }
     }

     public void setCacheStrategy(CacheStrategy newStrategy) {
-        mem.put(CACHE_STRAGEGY, newStrategy.toString());
+        put(CACHE_STRAGEGY, newStrategy.toString());
     }

     public long recrawlIfOlder() {
         // returns a long (millis) that is the minimum age that
         // an entry must have to be re-crawled
-        final String r = mem.get(RECRAWL_IF_OLDER);
+        final String r = get(RECRAWL_IF_OLDER);
         if (r == null) return 0L;
         try {
             final long l = Long.parseLong(r);
@@ -431,7 +190,7 @@ public class CrawlProfile {
         // if the depth is equal to or less than this depth,
         // then the current url feeds the crawl filter with its domain
         // if this is -1, all domains are fed
-        final String r = mem.get(DOM_FILTER_DEPTH);
+        final String r = get(DOM_FILTER_DEPTH);
         if (r == null) return Integer.MAX_VALUE;
         try {
             final int i = Integer.parseInt(r);
@@ -445,7 +204,7 @@ public class CrawlProfile {
     public int domMaxPages() {
         // this is the maximum number of pages that are crawled for a single domain
         // if -1, this means no limit
-        final String r = mem.get(DOM_MAX_PAGES);
+        final String r = get(DOM_MAX_PAGES);
         if (r == null) return Integer.MAX_VALUE;
         try {
             final int i = Integer.parseInt(r);
@@ -457,47 +216,47 @@ public class CrawlProfile {
         }
     }

     public boolean crawlingQ() {
-        final String r = mem.get(CRAWLING_Q);
+        final String r = get(CRAWLING_Q);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean indexText() {
-        final String r = mem.get(INDEX_TEXT);
+        final String r = get(INDEX_TEXT);
         if (r == null) return true;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean indexMedia() {
-        final String r = mem.get(INDEX_MEDIA);
+        final String r = get(INDEX_MEDIA);
         if (r == null) return true;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean storeHTCache() {
-        final String r = mem.get(STORE_HTCACHE);
+        final String r = get(STORE_HTCACHE);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean storeTXCache() {
-        final String r = mem.get(STORE_TXCACHE);
+        final String r = get(STORE_TXCACHE);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean remoteIndexing() {
-        final String r = mem.get(REMOTE_INDEXING);
+        final String r = get(REMOTE_INDEXING);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean excludeStaticStopwords() {
-        final String r = mem.get(XSSTOPW);
+        final String r = get(XSSTOPW);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean excludeDynamicStopwords() {
-        final String r = mem.get(XDSTOPW);
+        final String r = get(XDSTOPW);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }

     public boolean excludeParentStopwords() {
-        final String r = mem.get(XPSTOPW);
+        final String r = get(XPSTOPW);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
@@ -510,7 +269,6 @@ public class CrawlProfile {
             // increase counter
             dp.inc();
         }
-        domsCache.put(this.mem.get(HANDLE), doms);
     }

     public boolean grantedDomAppearance(final String domain) {
         final int max = domFilterDepth();
@@ -556,6 +314,59 @@ public class CrawlProfile {
         }
         return domname;
     }
-    }
+
+    public final static class DomProfile {
+
+        public String referrer;
+        public int depth, count;
+
+        public DomProfile(final String ref, final int d) {
+            this.referrer = ref;
+            this.depth = d;
+            this.count = 1;
+        }
+
+        public void inc() {
+            this.count++;
+        }
+    }
+
+    public static enum CacheStrategy {
+        NOCACHE(0),   // never use the cache, fetch all content fresh from the internet
+        IFFRESH(1),   // use the cache if the cache exists and is fresh according to the proxy-fresh rules
+        IFEXIST(2),   // use the cache if the cache exists; do not check freshness, otherwise use the online source
+        CACHEONLY(3); // never go online, use all content from the cache; if no cache exists, treat content as unavailable
+        public int code;
+
+        private CacheStrategy(int code) {
+            this.code = code;
+        }
+
+        public String toString() {
+            return Integer.toString(this.code);
+        }
+
+        public static CacheStrategy decode(int code) {
+            for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
+            return NOCACHE;
+        }
+
+        public static CacheStrategy parse(String name) {
+            if (name.equals("nocache")) return NOCACHE;
+            if (name.equals("iffresh")) return IFFRESH;
+            if (name.equals("ifexist")) return IFEXIST;
+            if (name.equals("cacheonly")) return CACHEONLY;
+            return null;
+        }
+
+        public String toName() {
+            return this.name().toLowerCase();
+        }
+
+        public boolean isAllowedToFetchOnline() {
+            return this.code < 3;
+        }
+
+        public boolean mustBeOffline() {
+            return this.code == 3;
+        }
+    }
+
+    public static long getRecrawlDate(final long oldTimeMinutes) {
+        return System.currentTimeMillis() - (60000L * oldTimeMinutes);
+    }
 }
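
For reference, the numeric round trip behind cacheStrategy()/setCacheStrategy(): a strategy is stored under the CACHE_STRAGEGY key (spelled as in the source) via toString(), i.e. as its integer code, and recovered with decode(); parse() covers the lowercase names produced by toName(). A short sketch:

    final CrawlProfile p = new CrawlProfile(ext);             // ext: some Map<String, String>, e.g. loaded from a profile store
    p.setCacheStrategy(CacheStrategy.IFEXIST);                // stores "2"
    final CacheStrategy s = p.cacheStrategy();                // decode(Integer.parseInt("2")) yields IFEXIST
    assert CacheStrategy.parse(s.toName()) == s;              // "ifexist" parses back as well
    assert CacheStrategy.decode(99) == CacheStrategy.NOCACHE; // unknown codes fall back to NOCACHE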