@ -427,11 +427,10 @@ public class plasmaHTCache {
/**
 * Factory method for cache entries: forwards its arguments to the
 * {@code Entry} constructor and returns the new instance.
 *
 * NOTE(review): two return statements follow, one passing {@code scraper}
 * and one without it. This looks like diff residue; the second statement is
 * unreachable as written -- confirm which constructor call is intended.
 */
public Entry newEntry ( Date initDate , int depth , URL url ,
httpHeader requestHeader ,
String responseStatus , httpHeader responseHeader ,
htmlFilterContentScraper scraper ,
String initiator ,
plasmaCrawlProfile . entry profile ) {
//System.out.println("NEW ENTRY: " + url.toString()); // DEBUG
return new Entry ( initDate , depth , url , requestHeader , responseStatus , responseHeader , scraper, initiator, profile ) ;
return new Entry ( initDate , depth , url , requestHeader , responseStatus , responseHeader , initiator, profile ) ;
}
public class Entry {
@ -449,15 +448,17 @@ public class plasmaHTCache {
// URL of this entry as a string; handed to isPOST/isCGI/noIndexingURL by the cache and index decisions
public String urlString ;
public int status ; // cache load/hit/stale etc status; the constructor starts with CACHE_UNFILLED
public Date lastModified ;
// NOTE(review): 'scraper' is declared a second time further down -- looks like diff residue, keep only one
public htmlFilterContentScraper scraper ;
public char doctype ;
public String language ;
// crawl profile handed to the constructor; shallIndexCacheForCrawler consults localIndexing() and crawlingQ()
public plasmaCrawlProfile . entry profile ;
// the constructor stores null here when the given initiator is null or empty
private String initiator ;
// in-memory content buffer; exposed through getContentOutputStream() and getContentBytes()
public ByteArrayOutputStream content ;
// the constructor resets this to null under its "to be defined later" section
public htmlFilterContentScraper scraper ;
/**
 * Creates a cache entry from the request/response data of one load.
 *
 * Only part of the body is visible here (hunk markers cut it), so this
 * comment covers the visible assignments only: the headers and the response
 * status are stored as given, a fresh in-memory content buffer is created,
 * an empty initiator is normalized to null, and cacheArray/status/scraper
 * are reset to their "unfilled" defaults.
 *
 * NOTE(review): the body both assigns the {@code scraper} parameter and later
 * sets {@code this.scraper} to null; together with the parameter list this
 * looks like diff residue -- confirm whether the parameter still exists.
 */
public Entry ( Date initDate , int depth , URL url ,
httpHeader requestHeader ,
String responseStatus , httpHeader responseHeader ,
htmlFilterContentScraper scraper ,
String initiator ,
plasmaCrawlProfile . entry profile ) {
@ -478,7 +479,7 @@ public class plasmaHTCache {
this . requestHeader = requestHeader ;
this . responseStatus = responseStatus ;
this . responseHeader = responseHeader ;
this . scraper = scraper ;
// every entry starts with an empty buffer that callers fill via getContentOutputStream()
this . content = new ByteArrayOutputStream ( ) ;
this . profile = profile ;
// null and the empty string both mean "no initiator"
this . initiator = ( initiator = = null ) ? null : ( ( initiator . length ( ) = = 0 ) ? null : initiator ) ;
@ -503,8 +504,16 @@ public class plasmaHTCache {
// to be defined later:
this . cacheArray = null ;
this . status = CACHE_UNFILLED ;
this . scraper = null ;
}
/**
 * Exposes the in-memory content buffer of this entry as a plain output stream.
 *
 * @return the entry's content buffer, typed as {@code OutputStream}
 */
public OutputStream getContentOutputStream() {
    // widening assignment instead of an explicit cast; the same object is returned
    final OutputStream sink = this.content;
    return sink;
}
/**
 * Returns a copy of everything written to the content buffer so far.
 *
 * @return the buffered content as a byte array
 */
public byte[] getContentBytes() {
    try {
        this.content.flush();
    } catch (IOException ignored) {
        // best effort only: a failed flush must not prevent returning the buffered bytes
    }
    final byte[] snapshot = this.content.toByteArray();
    return snapshot;
}
/**
 * Accessor for the initiator stored in this entry.
 *
 * @return the initiator, or null if none was stored
 */
public String initiator() {
    return this.initiator;
}
@ -614,8 +623,129 @@ public class plasmaHTCache {
return null ;
}
public String shallIndexCache ( ) {
public boolean shallUseCache ( ) {
// decide upon header information if a specific file should be taken from the cache or not
//System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ( isPOST ( urlString ) ) return false ;
if ( isCGI ( urlString ) ) return false ;
// -authorization cases in request
if ( requestHeader . containsKey ( "AUTHORIZATION" ) ) return false ;
// -ranges in request
// we do not cache partial content
if ( ( requestHeader ! = null ) & & ( requestHeader . containsKey ( "RANGE" ) ) ) return false ;
//Date d1, d2;
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
if ( requestHeader . containsKey ( "IF-MODIFIED-SINCE" ) ) {
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if ( ! ( responseHeader . containsKey ( "Last-Modified" ) ) ) return false ;
// parse date
Date d1 , d2 ;
d2 = responseHeader . lastModified ( ) ; if ( d2 = = null ) d2 = new Date ( ) ;
d1 = requestHeader . ifModifiedSince ( ) ; if ( d1 = = null ) d1 = new Date ( ) ;
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if ( d2 . after ( d1 ) ) return false ;
}
boolean isNotPicture = ! isPicture ( responseHeader ) ;
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
if ( ( requestHeader . containsKey ( "COOKIE" ) ) & & ( isNotPicture ) ) return false ;
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if ( ( responseHeader . containsKey ( "SET-COOKIE" ) ) & & ( isNotPicture ) ) return false ; // too strong
if ( ( responseHeader . containsKey ( "SET-COOKIE2" ) ) & & ( isNotPicture ) ) return false ; // too strong
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
if ( ( responseHeader . containsKey ( "PRAGMA" ) ) & &
( ( ( String ) responseHeader . get ( "Pragma" ) ) . toUpperCase ( ) . equals ( "NO-CACHE" ) ) ) return false ;
// calculate often needed values for freshness attributes
Date date = responseHeader . date ( ) ;
Date expires = responseHeader . expires ( ) ;
Date lastModified = responseHeader . lastModified ( ) ;
String cacheControl = ( String ) responseHeader . get ( "Cache-Control" ) ;
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
// look for freshnes information
// if we don't have any freshnes indication, we treat the file as stale.
// no handle for freshness control:
if ( ( expires = = null ) & & ( cacheControl = = null ) & & ( lastModified = = null ) ) return false ;
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
if ( expires ! = null ) {
Date yesterday = new Date ( ( new Date ( ) ) . getTime ( ) - oneday ) ;
if ( expires . before ( yesterday ) ) return false ;
}
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
// of the file and the last modified date as the age of the file. If we consider the file as
// middel-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
if ( lastModified ! = null ) {
if ( date = = null ) date = new Date ( ) ;
long age = date . getTime ( ) - lastModified . getTime ( ) ;
if ( age < 0 ) return false ;
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is new Date().getTime() - d2.getTime()
// therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10
if ( ( new Date ( ) ) . getTime ( ) - date . getTime ( ) > age / 10 ) return false ;
}
// -cache-control in cached response
// the cache-control has many value options.
if ( cacheControl ! = null ) {
cacheControl = cacheControl . trim ( ) . toUpperCase ( ) ;
if ( cacheControl . startsWith ( "PUBLIC" ) ) {
// ok, do nothing
} else if ( ( cacheControl . startsWith ( "PRIVATE" ) ) | |
( cacheControl . startsWith ( "NO-CACHE" ) ) | |
( cacheControl . startsWith ( "NO-STORE" ) ) ) {
// easy case
return false ;
} else if ( cacheControl . startsWith ( "MAX-AGE=" ) ) {
// we need also the load date
if ( date = = null ) return false ;
try {
long ttl = 1000 * Long . parseLong ( cacheControl . substring ( 8 ) ) ; // milliseconds to live
if ( ( new Date ( ) ) . getTime ( ) - date . getTime ( ) > ttl ) {
return false ;
}
} catch ( Exception e ) {
return false ;
}
}
}
return true ;
}
// NOTE(review): only fragments of this method are visible (hunk markers cut the body);
// the notes added here cover the visible lines only.
public String shallIndexCacheForProxy ( ) {
// decide upon header information if a specific file should be indexed
// this method returns null if the answer is 'YES'!
// if the answer is 'NO' (do not index), it returns a string with the reason
@ -670,10 +800,8 @@ public class plasmaHTCache {
// thus we do not care about it here for indexing
// -pragma in cached response
// NOTE(review): the pragma check below sits between spaced-out block-comment delimiters;
// presumably it is meant to be commented out -- confirm against the real file
/ *
if ( ( responseHeader . containsKey ( "PRAGMA" ) ) & &
( ( ( String ) responseHeader . get ( "Pragma" ) ) . toUpperCase ( ) . equals ( "NO-CACHE" ) ) ) return "Denied_(pragma_no_cache)" ;
* /
// see for documentation also:
// http://www.web-caching.com/cacheability.html
@ -732,126 +860,69 @@ public class plasmaHTCache {
// null means 'YES, index it' (see the contract stated at the top of the method)
return null ;
}
// NOTE(review): the lines below interleave two method bodies -- shallUseCache (boolean,
// the "return false" / "return true" statements) and shallIndexCacheForCrawler (String
// reasons, null for 'YES'). This looks like residue of a diff that replaced one with the
// other; confirm which lines are live before compiling.
public boolean shallUseCache ( ) {
// decide upon header information if a specific file should be taken from the cache or not
//System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
public String shallIndexCacheForCrawler ( ) {
// decide upon header information if a specific file should be indexed
// this method returns null if the answer is 'YES'!
// if the answer is 'NO' (do not index), it returns a string with the reason
// to reject the crawling demand in clear text
// check profile
if ( ! ( profile . localIndexing ( ) ) ) return "Indexing_Not_Allowed" ;
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ( isPOST ( urlString ) ) return false ;
if ( isCGI ( urlString ) ) return false ;
// in the crawler variant, POST/CGI URLs are rejected only when the profile's crawlingQ() is false
if ( ( isPOST ( urlString ) ) & & ( ! ( profile . crawlingQ ( ) ) ) ) return "Dynamic_(POST)" ;
if ( ( isCGI ( urlString ) ) & & ( ! ( profile . crawlingQ ( ) ) ) ) return "Dynamic_(CGI)" ;
// -authorization cases in request
if ( requestHeader . containsKey ( "AUTHORIZATION" ) ) return false ;
// we checked that in shallStoreCache
// -ranges in request
// we do not cache partial content
if ( ( requestHeader ! = null ) & & ( requestHeader . containsKey ( "RANGE" ) ) ) return false ;
// we checked that in shallStoreCache
//Date d1, d2;
// a picture cannot be indexed
if ( isPicture ( responseHeader ) ) return "Media_Content_(Picture)" ;
if ( ! ( isText ( responseHeader ) ) ) return "Media_Content_(not_text)" ;
if ( noIndexingURL ( urlString ) ) return "Media_Content_(forbidden)" ;
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
if ( requestHeader . containsKey ( "IF-MODIFIED-SINCE" ) ) {
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if ( ! ( responseHeader . containsKey ( "Last-Modified" ) ) ) return false ;
// parse date
Date d1 , d2 ;
d2 = responseHeader . lastModified ( ) ; if ( d2 = = null ) d2 = new Date ( ) ;
d1 = requestHeader . ifModifiedSince ( ) ; if ( d1 = = null ) d1 = new Date ( ) ;
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if ( d2 . after ( d1 ) ) return false ;
}
// if the page is fresh at the very moment we can index it
// -> this does not apply for the crawler
boolean isNotPicture = ! isPicture ( responseHeader ) ;
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
if ( ( requestHeader . containsKey ( "COOKIE" ) ) & & ( isNotPicture ) ) return false ;
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if ( ( responseHeader . containsKey ( "SET-COOKIE" ) ) & & ( isNotPicture ) ) return false ; // too strong
if ( ( responseHeader . containsKey ( "SET-COOKIE2" ) ) & & ( isNotPicture ) ) return false ; // too strong
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
if ( ( responseHeader . containsKey ( "PRAGMA" ) ) & &
( ( ( String ) responseHeader . get ( "Pragma" ) ) . toUpperCase ( ) . equals ( "NO-CACHE" ) ) ) return false ;
// calculate often needed values for freshness attributes
Date date = responseHeader . date ( ) ;
Date expires = responseHeader . expires ( ) ;
Date lastModified = responseHeader . lastModified ( ) ;
String cacheControl = ( String ) responseHeader . get ( "Cache-Control" ) ;
// see for documentation also:
// http://www.web-caching.com/cacheability.html
// http://vancouver-webpages.com/CacheNow/
// unfortunately, we cannot index pages which have been requested with a cookie
// because the returned content may be special for the client
// -> this does not apply for a crawler
// -set-cookie in response
// the set-cookie from the server does not indicate that the content is special
// thus we do not care about it here for indexing
// -> this does not apply for a crawler
// -pragma in cached response
// -> in the crawler we ignore this
// look for freshness information
// if we don't have any freshness indication, we treat the file as stale.
// no handle for freshness control:
if ( ( expires = = null ) & & ( cacheControl = = null ) & & ( lastModified = = null ) ) return false ;
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
if ( expires ! = null ) {
Date yesterday = new Date ( ( new Date ( ) ) . getTime ( ) - oneday ) ;
if ( expires . before ( yesterday ) ) return false ;
}
// sometimes, the expires date is set to the past to prevent that a page is cached
// we use that information to see if we should index it
// -> this does not apply for a crawler
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
// of the file and the last modified date as the age of the file. If we consider the file as
// middle-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
if ( lastModified ! = null ) {
if ( date = = null ) date = new Date ( ) ;
long age = date . getTime ( ) - lastModified . getTime ( ) ;
if ( age < 0 ) return false ;
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is new Date().getTime() - d2.getTime()
// therefore the cache is stale, if Date().getTime() - d2.getTime() > age/10
if ( ( new Date ( ) ) . getTime ( ) - date . getTime ( ) > age / 10 ) return false ;
}
// this information is too weak to use it to prevent indexing
// even if we can apply a TTL heuristic for cache usage
// -cache-control in cached response
// the cache-control has many value options.
if ( cacheControl ! = null ) {
cacheControl = cacheControl . trim ( ) . toUpperCase ( ) ;
if ( cacheControl . startsWith ( "PUBLIC" ) ) {
// ok, do nothing
} else if ( ( cacheControl . startsWith ( "PRIVATE" ) ) | |
( cacheControl . startsWith ( "NO-CACHE" ) ) | |
( cacheControl . startsWith ( "NO-STORE" ) ) ) {
// easy case
return false ;
} else if ( cacheControl . startsWith ( "MAX-AGE=" ) ) {
// we need also the load date
if ( date = = null ) return false ;
try {
long ttl = 1000 * Long . parseLong ( cacheControl . substring ( 8 ) ) ; // milliseconds to live
if ( ( new Date ( ) ) . getTime ( ) - date . getTime ( ) > ttl ) {
return false ;
}
} catch ( Exception e ) {
return false ;
}
}
}
// -> in the crawler we ignore this
return true ;
return null ;
}
}
}