@ -59,9 +59,9 @@ import java.util.Date;
import java.util.LinkedList ;
import java.util.Map ;
import java.util.TreeMap ;
import java.util.Calendar ;
import java.util.GregorianCalendar ;
import java.util.TimeZone ;
//import java.util.Calendar;
//import java.util.GregorianCalendar;
//import java.util.TimeZone;
import de.anomic.htmlFilter.htmlFilterContentScraper ;
import de.anomic.http.httpHeader ;
@ -89,13 +89,14 @@ public final class plasmaHTCache {
public static serverLog log ;
public plasmaHTCache ( File htCachePath , long maxCacheSize , int bufferkb ) {
// this.switchboard = switchboard;
// this.switchboard = switchboard;
this . log = new serverLog ( "HTCACHE" ) ;
this . cachePath = htCachePath ;
this . maxCacheSize = maxCacheSize ;
// set cache path
// we don't need to check the path here, because that is already done in plasmaSwitchboard.java - Borg-0300
/ * // set cache path
if ( ! ( htCachePath . exists ( ) ) ) {
// make the cache path
htCachePath . mkdir ( ) ;
@ -104,7 +105,7 @@ public final class plasmaHTCache {
// if the cache path does not exist, or is a file and not a directory, panic
System . out . println ( "the cache path " + htCachePath . toString ( ) + " is not a directory or does not exists and cannot be created" ) ;
System . exit ( 0 ) ;
}
} * /
// open the response header database
File dbfile = new File ( cachePath , "responseHeader.db" ) ;
@ -200,13 +201,13 @@ public final class plasmaHTCache {
while ( ( currCacheSize > maxCacheSize ) & & ( cacheAge . size ( ) > 0 ) ) {
f = ( File ) cacheAge . remove ( cacheAge . firstKey ( ) ) ;
if ( ( f ! = null ) & & ( f . exists ( ) ) ) {
currCacheSize - = f . length ( ) ;
long size = f . length ( ) ;
//currCacheSize -= f.length();
if ( f . delete ( ) ) {
log . logInfo ( "DELETED OLD CACHE : " + f . toString ( ) ) ;
currCacheSize - = size ;
f = f . getParentFile ( ) ;
if ( ( f . exists ( ) ) & & ( f . isDirectory ( ) ) ) {
// check size of directory
if ( f . list ( ) . length = = 0 ) {
if ( f . isDirectory ( ) & & ( f . list ( ) . length = = 0 ) ) {
// the directory has no files in it; delete it also
if ( f . delete ( ) ) log . logInfo ( "DELETED EMPTY DIRECTORY : " + f . toString ( ) ) ;
}
@ -214,7 +215,6 @@ public final class plasmaHTCache {
}
}
}
}
public void close ( ) throws IOException {
responseHeaderDB . close ( ) ;
@ -256,8 +256,7 @@ public final class plasmaHTCache {
}
log . logSystem ( "CACHE SCANNED, CONTAINS " + c +
" FILES = " + currCacheSize / 1048576 + "MB, OLDEST IS " +
( ( ageHours < 24 ) ? ( ageHours + " HOURS" ) : ( ( ageHours / 24 ) + " DAYS" ) ) +
" OLD" ) ;
( ( ageHours < 24 ) ? ( ageHours + " HOURS" ) : ( ( ageHours / 24 ) + " DAYS" ) ) + " OLD" ) ;
cleanup ( ) ;
// start to prefetch ip's from dns
@ -343,15 +342,18 @@ public final class plasmaHTCache {
return plasmaParser . mediaExtContains ( urlString ) ;
}
// this method creates from a given host and path a cache path
/**
 * Creates a cache path from the host and path of a given URL.
 * From a given host (which may be a domain or an IPv4 number, but not IPv6;
 * all without a leading 'http://') and a path (which must start with a
 * leading '/', and may also end in a '/'), a path to a file in the file
 * system, rooted at cachePath, is constructed.
 * It will also be ensured that the complete path exists; if necessary,
 * that path will be generated.
 * @param url the URL for which the cache location is computed
 * @return the File below cachePath that corresponds to the given URL
 */
public File getCachePath ( URL url ) {
// from a given host (which may also be an IPv4 - number, but not IPv6 or
// a domain; all without leading 'http://') and a path (which must start
// with a leading '/', and may also end in an '/') a path to a file
// in the file system with root as given in cachePath is constructed
// it will also be ensured, that the complete path exists; if necessary
// that path will be generated
//System.out.println("DEBUG: getCachedPath=" + url.toString());
// System.out.println("DEBUG: getCachePath: IN=" + url.toString());
String remotePath = url . getPath ( ) ;
if ( ! ( remotePath . startsWith ( "/" ) ) ) remotePath = "/" + remotePath ;
if ( remotePath . endsWith ( "/" ) ) remotePath = remotePath + "ndx" ;
@ -361,31 +363,40 @@ public final class plasmaHTCache {
remotePath = remotePath . replace ( ':' , '_' ) ; // yes this is not reversible, but that is not needed
int port = url . getPort ( ) ;
if ( port < 0 ) port = 80 ;
// System.out.println("DEBUG: getCachePath: OUT=" + url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
return new File ( this . cachePath , url . getHost ( ) + ( ( port = = 80 ) ? "" : ( "+" + port ) ) + remotePath ) ;
}
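/**
 * This is the reverse function to getCachePath: it reconstructs the URL
 * from a given storage path.
 * @param cachePath the root directory of the cache
 * @param f the cached file whose URL is wanted
 * @return the reconstructed URL, or null if no valid URL can be built
 */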
public static URL getURL ( File cachePath , File f ) {
// this is the reverse function to getCachePath: it constructs the url as string
// from a given storage path
// System.out.println("DEBUG: getURL: IN: Path=[" + cachePath + "]");
// System.out.println("DEBUG: getURL: IN: File=[" + f + "]");
String s = f . toString ( ) . replace ( '\\' , '/' ) ;
String c = cachePath . toString ( ) . replace ( '\\' , '/' ) ;
//System.out.println("DEBUG: getURL for c=" + c + ", s=" + s);
int p = s . lastIndexOf ( c ) ;
if ( p > = 0 ) {
s = s . substring ( p + c . length ( ) ) ;
while ( s . startsWith ( "/" ) ) s = s . substring ( 1 ) ;
if ( ( p = s . indexOf ( "+" ) ) > = 0 ) {
s = s . substring ( 0 , p ) + ":" + s . substring ( p + 1 ) ;
} else {
/ * } else {
p = s . indexOf ( "/" ) ;
if ( p < 0 )
s = s + ":80/" ;
else
s = s . substring ( 0 , p ) + ":80" + s . substring ( p ) ;
s = s . substring ( 0 , p ) + ":80" + s . substring ( p ) ; * /
}
if ( s . endsWith ( "ndx" ) ) s = s . substring ( 0 , s . length ( ) - 3 ) ;
//System.out.println("DEBUG: getURL url=" + s);
// System.out.println("DEBUG: getURL: OUT=" + s);
try {
/ * URL url = null ;
url = new URL ( "http://" + s ) ;
System . out . println ( "DEBUG: getURL: URL=" + url . toString ( ) ) ;
return url ; //new URL("http://" + s); */
return new URL ( "http://" + s ) ;
} catch ( Exception e ) {
return null ;
@ -449,14 +460,14 @@ public final class plasmaHTCache {
public plasmaCrawlProfile . entry profile ;
private String initiator ;
public Entry ( Date initDate , int depth , URL url , String name ,
httpHeader requestHeader ,
String responseStatus , httpHeader responseHeader ,
String initiator ,
plasmaCrawlProfile . entry profile ) {
// normalize url
// normalize url - Borg-0300
serverLog . logDebug ( "PLASMA" , "Entry: URL=" + url . toString ( ) ) ;
this . nomalizedURLString = htmlFilterContentScraper . urlNormalform ( url ) ;
try {
this . url = new URL ( nomalizedURLString ) ;