bugfix: port handling in HTCache

program flow, cleared up


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1021 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 20 years ago
parent 2c5999ae00
commit a803a509ae

@ -260,11 +260,9 @@ public final class plasmaHTCache {
}
private void cleanupDoIt(long newCacheSize) {
if (this.cacheAge.size() == 0) return;
File obj;
Iterator iter = this.cacheAge.keySet().iterator();
while (iter.hasNext() && (this.currCacheSize >= newCacheSize)) {
while (iter.hasNext() && this.currCacheSize >= newCacheSize) {
Object key = iter.next();
obj = (File) this.cacheAge.get(key);
if (obj != null) {
@ -287,8 +285,8 @@ public final class plasmaHTCache {
private void cleanup() {
// clean up cache to have 4% (enough) space for next entries
if ((this.currCacheSize >= this.maxCacheSize) && (this.cacheAge.size() > 0)) {
if (this.maxCacheSize > 0) cleanupDoIt(this.maxCacheSize - ((this.maxCacheSize / 100) * 4));
if (this.cacheAge.size() > 0 && this.currCacheSize >= this.maxCacheSize) {
if (this.maxCacheSize > 0) { cleanupDoIt(this.maxCacheSize - (this.maxCacheSize / 100) * 4); }
}
}
@ -426,21 +424,24 @@ public final class plasmaHTCache {
* in the file system with root as given in cachePath is constructed
* it will also be ensured, that the complete path exists; if necessary
* that path will be generated
* @return URL
* @return new File
*/
public File getCachePath(URL url) {
// this.log.logFinest("plasmaHTCache: getCachePath: IN=" + url.toString());
String remotePath = url.getPath();
if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath;
if (remotePath.endsWith("/")) remotePath = remotePath + "ndx";
if (remotePath.indexOf('#') > 0) remotePath.substring(0, remotePath.indexOf('#'));
remotePath = remotePath.replace('?', '_');
remotePath = remotePath.replace('&', '_'); // yes this is not reversible, but that is not needed
remotePath = remotePath.replace(':', '_'); // yes this is not reversible, but that is not needed
int port = url.getPort();
if (port < 0) port = 80;
// this.log.logFinest("plasmaHTCache: getCachePath: OUT=" + url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
if (!remotePath.startsWith("/")) remotePath = "/" + remotePath;
if (remotePath.indexOf('#') > 0) {
remotePath.substring(0, remotePath.indexOf('#'));
} else if (remotePath.endsWith("/")) {
remotePath = remotePath + "ndx";
}
remotePath = remotePath.replaceAll("[?&:]", "_"); // yes this is not reversible, but that is not needed
final int port = url.getPort();
if (port < 0 || port == 80) {
return new File(this.cachePath, url.getHost() + remotePath);
} else {
return new File(this.cachePath, url.getHost() + "!" + port + remotePath);
}
}
/**
@ -451,28 +452,25 @@ public final class plasmaHTCache {
// this.log.logFinest("plasmaHTCache: getURL: IN: Path=[" + cachePath + "]");
// this.log.logFinest("plasmaHTCache: getURL: IN: File=[" + f + "]");
String s = f.toString().replace('\\', '/');
String c = cachePath.toString().replace('\\', '/');
int p = s.lastIndexOf(c);
if (p >= 0) {
s = s.substring(p + c.length());
final String c = cachePath.toString().replace('\\', '/');
int pos = s.lastIndexOf(c);
if (pos >= 0) {
s = s.substring(pos + c.length());
while (s.startsWith("/")) s = s.substring(1);
if ((p = s.indexOf("+")) >= 0) {
s = s.substring(0, p) + ":" + s.substring(p + 1);
/* } else {
p = s.indexOf("/");
if (p < 0)
s = s + ":80/";
else
s = s.substring(0, p) + ":80" + s.substring(p);*/
// dieser Block kann später gelöscht werden
pos = s.indexOf("+");
if (pos >= 0) {
s = s.substring(0, pos) + ":" + s.substring(pos + 1);
}
pos = s.indexOf("!");
if (pos >= 0) {
s = s.substring(0, pos) + ":" + s.substring(pos + 1);
}
if (s.endsWith("ndx")) { s = s.substring(0, s.length() - 3); }
// this.log.logFinest("plasmaHTCache: getURL: OUT=" + s);
try {
/* URL url = null;
url = new URL("http://" + s);
this.log.logFinest("plasmaHTCache: getURL: URL=" + url.toString());
return url;//new URL("http://" + s); */
return new URL("http://" + s);
} catch (Exception e) {
return null;
@ -493,11 +491,11 @@ public final class plasmaHTCache {
}
public static boolean isPOST(String urlString) {
return ((urlString.indexOf("?") >= 0) ||
(urlString.indexOf("&") >= 0));
return (urlString.indexOf("?") >= 0 ||
urlString.indexOf("&") >= 0);
}
public static boolean isCGI(String urlString) { // Borg-0300
public static boolean isCGI(String urlString) {
String ls = urlString.toLowerCase();
return ((ls.indexOf(".cgi") >= 0) ||
(ls.indexOf(".exe") >= 0) ||
@ -619,39 +617,43 @@ public final class plasmaHTCache {
// but also as strict as necessary to enable caching of most items
public String shallStoreCacheForProxy() {
// returns NULL if the answer is TRUE
// in case of FALSE, the reason as String is returned
// returns NULL if the answer is TRUE
// in case of FALSE, the reason as String is returned
// check profile
if (!(this.profile.storeHTCache())) return "storage_not_wanted";
// check profile
if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
// decide upon header information if a specific file should be stored to the cache or not
// if the storage was requested by prefetching, the request map is null
// check status code
if (!((this.responseStatus.startsWith("200")) || (this.responseStatus.startsWith("203")))) return "bad_status_" + this.responseStatus.substring(0,3);
if (!(this.responseStatus.startsWith("200") ||
this.responseStatus.startsWith("203"))) { return "bad_status_" + this.responseStatus.substring(0,3); }
// check storage location
// sometimes a file name is equal to a path name in the same directory;
// or sometimes a file name is equal a directory name created earlier;
// we cannot match that here in the cache file path and therefore omit writing into the cache
if ((this.cacheFile.getParentFile().isFile()) || (this.cacheFile.isDirectory())) return "path_ambiguous";
if (this.cacheFile.toString().indexOf("..") >= 0) return "path_dangerous";
if (this.cacheFile.getParentFile().isFile() || this.cacheFile.isDirectory()) { return "path_ambiguous"; }
if (this.cacheFile.toString().indexOf("..") >= 0) { return "path_dangerous"; }
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ((isPOST(this.nomalizedURLString)) && (!(this.profile.crawlingQ()))) return "dynamic_post";
if (isCGI(this.nomalizedURLString)) return "dynamic_cgi";
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
// content in the cache
if ((this.requestHeader != null) && (this.requestHeader.containsKey(httpHeader.AUTHORIZATION))) return "personalized";
if (isPOST(this.nomalizedURLString) && !this.profile.crawlingQ()) { return "dynamic_post"; }
if (isCGI(this.nomalizedURLString)) { return "dynamic_cgi"; }
if (this.requestHeader != null) {
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
// content in the cache
if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return "personalized"; }
// -ranges in request and response
// we do not cache partial content
if (this.requestHeader.containsKey(httpHeader.RANGE)) { return "partial"; }
}
// -ranges in request and response
// we do not cache partial content
if ((this.requestHeader != null) && (this.requestHeader.containsKey(httpHeader.RANGE))) return "partial";
if ((this.responseHeader != null) && (this.responseHeader.containsKey(httpHeader.CONTENT_RANGE))) return "partial";
if (this.responseHeader != null && this.responseHeader.containsKey(httpHeader.CONTENT_RANGE)) { return "partial"; }
// -if-modified-since in request
// we do not care about if-modified-since, because this case only occurs if the
@ -669,8 +671,8 @@ public final class plasmaHTCache {
// -pragma in response
// if we have a pragma non-cache, we don't cache. usually if this is wanted from
// the server, it makes sense
if ((this.responseHeader.containsKey(httpHeader.PRAGMA)) &&
(((String) this.responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return "controlled_no_cache";
String cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return "controlled_no_cache"; }
// -expires in response
// we do not care about expires, because at the time this is called the data is
@ -678,23 +680,23 @@ public final class plasmaHTCache {
// -cache-control in response
// the cache-control has many value options.
String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
Date date = this.responseHeader.date();
if (date == null) return "stale_no_date_given_in_response";
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "stale_expired";
}
} catch (Exception e) {
return "stale_error_" + e.getMessage() + ")";
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
Date date = this.responseHeader.date();
if (date == null) return "stale_no_date_given_in_response";
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
//System.out.println("***not indexed because cache-control");
return "stale_expired";
}
} catch (Exception e) {
return "stale_error_" + e.getMessage() + ")";
}
}
}
return null;
}
@ -704,77 +706,67 @@ public final class plasmaHTCache {
* @return
*/
public boolean shallUseCacheForProxy() {
// if the client requests a un-cached copy of the resource ...
if (
(this.requestHeader.containsKey(httpHeader.PRAGMA)) &&
(((String) this.requestHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))
) return false;
// System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
if (
(this.requestHeader.containsKey(httpHeader.CACHE_CONTROL)) &&
(
(((String) this.requestHeader.get(httpHeader.CACHE_CONTROL)).toUpperCase().startsWith("NO-CACHE")) ||
(((String) this.requestHeader.get(httpHeader.CACHE_CONTROL)).toUpperCase().startsWith("MAX-AGE=0"))
)
) return false;
String cacheControl;
if (this.requestHeader != null) {
// -authorization cases in request
if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) { return false; }
// -ranges in request
// we do not cache partial content
if (this.requestHeader.containsKey(httpHeader.RANGE)) { return false; }
//System.out.println("SHALL READ CACHE: requestHeader = " + requestHeader.toString() + ", responseHeader = " + responseHeader.toString());
// if the client requests a un-cached copy of the resource ...
cacheControl = (String) this.requestHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
cacheControl = (String) this.requestHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("NO-CACHE") || cacheControl.startsWith("MAX-AGE=0")) { return false; }
}
}
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if (isPOST(this.nomalizedURLString)) return false;
if (isCGI(this.nomalizedURLString)) return false;
// -authorization cases in request
if (this.requestHeader.containsKey(httpHeader.AUTHORIZATION)) return false;
// -ranges in request
// we do not cache partial content
if ((this.requestHeader != null) && (this.requestHeader.containsKey(httpHeader.RANGE))) return false;
//Date d1, d2;
if (isPOST(this.nomalizedURLString)) { return false; }
if (isCGI(this.nomalizedURLString)) { return false; }
// -if-modified-since in request
// The entity has to be transferred only if it has
// been modified since the date given by the If-Modified-Since header.
if (this.requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) {
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!(this.responseHeader.containsKey(httpHeader.LAST_MODIFIED))) return false;
// parse date
Date d1, d2;
d2 = this.responseHeader.lastModified(); if (d2 == null) d2 = new Date(serverDate.correctedUTCTime());
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) d1 = new Date(serverDate.correctedUTCTime());
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) return false;
// checking this makes only sense if the cached response contains
// a Last-Modified field. If the field does not exist, we go the safe way
if (!this.responseHeader.containsKey(httpHeader.LAST_MODIFIED)) { return false; }
// parse date
Date d1, d2;
d2 = this.responseHeader.lastModified(); if (d2 == null) { d2 = new Date(serverDate.correctedUTCTime()); }
d1 = this.requestHeader.ifModifiedSince(); if (d1 == null) { d1 = new Date(serverDate.correctedUTCTime()); }
// finally, we shall treat the cache as stale if the modification time is after the if-.. time
if (d2.after(d1)) { return false; }
}
boolean isNotPicture = !isPicture(this.responseHeader);
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
if ((this.requestHeader.containsKey(httpHeader.COOKIE)) && (isNotPicture)) return false;
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if ((this.responseHeader.containsKey(httpHeader.SET_COOKIE)) && (isNotPicture)) return false; // too strong
if ((this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) && (isNotPicture)) return false; // too strong
if (!isPicture(this.responseHeader)) {
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
// -set-cookie in cached response
// this is a similar case as for COOKIE.
if (this.requestHeader.containsKey(httpHeader.COOKIE) ||
this.responseHeader.containsKey(httpHeader.SET_COOKIE) ||
this.responseHeader.containsKey(httpHeader.SET_COOKIE2)) {
return false; // too strong
}
}
// -pragma in cached response
// logically, we would not need to care about no-cache pragmas in cached response headers,
// because they cannot exist since they are not written to the cache.
// So this IF should always fail..
if ((this.responseHeader.containsKey(httpHeader.PRAGMA)) &&
(((String) this.responseHeader.get(httpHeader.PRAGMA)).toUpperCase().equals("NO-CACHE"))) return false;
// calculate often needed values for freshness attributes
Date date = this.responseHeader.date();
Date expires = this.responseHeader.expires();
Date lastModified = this.responseHeader.lastModified();
String cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
cacheControl = (String) this.responseHeader.get(httpHeader.PRAGMA);
if (cacheControl != null && cacheControl.trim().toUpperCase().equals("NO-CACHE")) { return false; }
// see for documentation also:
// http://www.web-caching.com/cacheability.html
@ -783,14 +775,17 @@ public final class plasmaHTCache {
// look for freshness information
// if we don't have any freshness indication, we treat the file as stale.
// no handle for freshness control:
if ((expires == null) && (cacheControl == null) && (lastModified == null)) return false;
// -expires in cached response
// the expires value gives us a very easy hint when the cache is stale
Date expires = this.responseHeader.expires();
if (expires != null) {
//System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
if (expires.before(new Date(serverDate.correctedUTCTime()))) return false;
// System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
if (expires.before(new Date(serverDate.correctedUTCTime()))) { return false; }
}
Date lastModified = this.responseHeader.lastModified();
cacheControl = (String) this.responseHeader.get(httpHeader.CACHE_CONTROL);
if (cacheControl == null && lastModified == null && expires == null) { return false; }
// -lastModified in cached response
// we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
@ -798,46 +793,45 @@ public final class plasmaHTCache {
// middle-aged then, the maximum TTL would be cache-creation plus age.
// This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
// file may only be treated as fresh for one more month, not more.
Date date = this.responseHeader.date();
if (lastModified != null) {
if (date == null) date = new Date(serverDate.correctedUTCTime());
long age = date.getTime() - lastModified.getTime();
if (age < 0) return false;
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
// therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
if (serverDate.correctedUTCTime() - date.getTime() > age / 10) return false;
if (date == null) { date = new Date(serverDate.correctedUTCTime()); }
long age = date.getTime() - lastModified.getTime();
if (age < 0) { return false; }
// TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
// the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
// therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
if (serverDate.correctedUTCTime() - date.getTime() > age / 10) { return false; }
}
// -cache-control in cached response
// -cache-control in cached response
// the cache-control has many value options.
if (cacheControl != null) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("PUBLIC")) {
// ok, do nothing
} else if ((cacheControl.startsWith("PRIVATE")) ||
(cacheControl.startsWith("NO-CACHE")) ||
(cacheControl.startsWith("NO-STORE"))) {
// easy case
return false;
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
if (date == null) return false;
try {
long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
return false;
}
} catch (Exception e) {
cacheControl = cacheControl.trim().toUpperCase();
if (cacheControl.startsWith("PRIVATE") ||
cacheControl.startsWith("NO-CACHE") ||
cacheControl.startsWith("NO-STORE")) {
// easy case
return false;
// } else if (cacheControl.startsWith("PUBLIC")) {
// // ok, do nothing
} else if (cacheControl.startsWith("MAX-AGE=")) {
// we need also the load date
if (date == null) { return false; }
try {
final long ttl = 1000 * Long.parseLong(cacheControl.substring(8)); // milliseconds to live
if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
return false;
}
} catch (Exception e) {
return false;
}
}
}
return true;
}
}
} // class Entry
/*
public static void main(String[] args) {

Loading…
Cancel
Save