@@ -331,384 +331,263 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHandler
return;
} catch (Exception ee) {}
}

// handle outgoing cookies
handleOutgoingCookies(requestHeader, host, ip);
// set another userAgent, if not yellowlisted
if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) {
// change the User-Agent
requestHeader.put("User-Agent", userAgent);
}

// set a scraper and a htmlFilter
OutputStream hfos = null;
htmlFilterContentScraper scraper = null;
// resolve yacy and yacyh domains
String yAddress = yacyCore.seedDB.resolveYacyAddress(host);

// re-calc the url path
String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/'
// attach possible yacy-sublevel-domain
if ((yAddress != null) &&
((pos = yAddress.indexOf("/")) >= 0) &&
(!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
) remotePath = yAddress.substring(pos) + remotePath;
// decide whether to use a cache entry or connect to the network
File cacheFile = cacheManager.getCachePath(url);
String urlHash = plasmaCrawlLURL.urlHash(url);
httpHeader cachedResponseHeader = null;
boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
// why are files unzipped upon arrival? why not zip all files in cache?
// This follows from the following premises:
// (a) no file shall be unzipped more than once to prevent unnecessary computing time
// (b) old cache entries shall be comparable with refill entries to detect/distinguish cases 3+4
// (c) the indexing mechanism needs files unzipped; a scheduler could do that later
// cases (b) and (c) contradict if we use a scheduler, because files in a stale cache would be unzipped
// and the newly arrived files would be zipped and would have to be unzipped upon load. But then the
// scheduler is superfluous. Therefore the only remaining case is
// (d) cached files shall be either all zipped or all unzipped
// case (d) contradicts (a), because files need to be unzipped for indexing. Therefore
// the only remaining case is to unzip files right upon load. That's what we do here.
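// Aside: a minimal, hypothetical sketch of "unzip right upon load" using plain
// java.util.zip (assumes the response body arrived gzip-encoded; the helper name
// and buffer size are illustrative only, not YaCy API):
//
//     import java.io.ByteArrayOutputStream;
//     import java.io.IOException;
//     import java.io.InputStream;
//     import java.util.zip.GZIPInputStream;
//
//     static byte[] unzipOnArrival(InputStream body) throws IOException {
//         GZIPInputStream gis = new GZIPInputStream(body);       // wrap the gzip-encoded stream
//         ByteArrayOutputStream out = new ByteArrayOutputStream();
//         byte[] buf = new byte[2048];
//         int n;
//         while ((n = gis.read(buf)) > 0) out.write(buf, 0, n);  // inflate while copying
//         gis.close();
//         return out.toByteArray();                              // plain bytes go into the cache
//     }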
// finally use existing cache if appropriate
// here we must decide whether or not to save the data
// to a cache
// we distinguish four CACHE STATE cases:
// 1. cache fill
// 2. cache fresh - no refill
// 3. cache stale - refill - necessary
// 4. cache stale - refill - superfluous
// in two of these cases we trigger a scheduler to handle newly arrived files:
// case 1 and case 3
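// In the code below these cases map onto plasmaHTCache status constants:
// case 1 -> CACHE_FILL, case 3 -> CACHE_STALE_RELOAD_GOOD (the refill was necessary),
// case 4 -> CACHE_STALE_RELOAD_BAD (the refill was superfluous); requests that
// bypass the cache entirely are marked CACHE_PASSING or CACHE_STALE_NO_RELOAD.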
plasmaHTCache.Entry hpc;
if (cacheExists) {
// we respond on the request by using the cache
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile);
if (hpc.shallUseCache()) {
// the cache is fresh
try {
// replace date field in old header by actual date, this is according to RFC
cachedResponseHeader.put("Date", httpc.dateString(httpc.nowDate()));
// maybe the content length is missing
if (!(cachedResponseHeader.containsKey("CONTENT-LENGTH")))
cachedResponseHeader.put("CONTENT-LENGTH", (String) ("" + cacheFile.length()));
// check if we can send a 304 instead of the complete content
if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
// conditional request: freshness of cache for that condition was already
// checked within shallUseCache(). Now send only a 304 response
log.logInfo("CACHE HIT/304 " + cacheFile.toString());
// send cached header with replaced date and added length
respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
} else {
// unconditional request: send content of cache
log.logInfo("CACHE HIT/203 " + cacheFile.toString());
// send cached header with replaced date and added length
respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
// make a transformer
if ((!(transformer.isIdentityTransformer())) &&
((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
} else {
hfos = respond;
}
// send also the complete body now from the cache
// simply read the file and transfer to out socket
InputStream is = new FileInputStream(cacheFile);
byte[] buffer = new byte[2048];
int l;
while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);}
is.close();
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
}
// that's it!
} catch (SocketException e) {
// this happens if the client stops loading the file
// we do nothing here
respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString());
}
} else {
// the cache is (supposed to be) stale

// delete the cache
long sizeBeforeDelete = cacheFile.length();
cacheFile.delete();

// take a new file from the server
httpc remote = null;
httpc.response res = null;
try {
// open the connection
if (yAddress == null) {
remote = newhttpc(host, port, timeout);
File cacheFile = cacheManager.getCachePath(url);
String urlHash = plasmaCrawlLURL.urlHash(url);
httpHeader cachedResponseHeader = null;
boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));

// why are files unzipped upon arrival? why not zip all files in cache?
// This follows from the following premises:
// (a) no file shall be unzipped more than once to prevent unnecessary computing time
// (b) old cache entries shall be comparable with refill entries to detect/distinguish cases 3+4
// (c) the indexing mechanism needs files unzipped; a scheduler could do that later
// cases (b) and (c) contradict if we use a scheduler, because files in a stale cache would be unzipped
// and the newly arrived files would be zipped and would have to be unzipped upon load. But then the
// scheduler is superfluous. Therefore the only remaining case is
// (d) cached files shall be either all zipped or all unzipped
// case (d) contradicts (a), because files need to be unzipped for indexing. Therefore
// the only remaining case is to unzip files right upon load. That's what we do here.
// finally use existing cache if appropriate
// here we must decide whether or not to save the data
// to a cache
// we distinguish four CACHE STATE cases:
// 1. cache fill
// 2. cache fresh - no refill
// 3. cache stale - refill - necessary
// 4. cache stale - refill - superfluous
// in two of these cases we trigger a scheduler to handle newly arrived files:
// case 1 and case 3
plasmaHTCache.Entry hpc;
if ((cacheExists) &&
((hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
cachedResponseHeader, null,
switchboard.defaultProxyProfile)).shallUseCache())) {
// we respond on the request by using the cache, the cache is fresh
try {
// replace date field in old header by actual date, this is according to RFC
cachedResponseHeader.put("Date", httpc.dateString(httpc.nowDate()));
// maybe the content length is missing
if (!(cachedResponseHeader.containsKey("CONTENT-LENGTH")))
cachedResponseHeader.put("CONTENT-LENGTH", (String) ("" + cacheFile.length()));
// check if we can send a 304 instead of the complete content
if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
// conditional request: freshness of cache for that condition was already
// checked within shallUseCache(). Now send only a 304 response
log.logInfo("CACHE HIT/304 " + cacheFile.toString());
// send cached header with replaced date and added length
respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
} else {
// unconditional request: send content of cache
log.logInfo("CACHE HIT/203 " + cacheFile.toString());
// send cached header with replaced date and added length
respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
// make a transformer
if ((!(transformer.isIdentityTransformer())) &&
((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
} else {
remote = newhttpc(yAddress, timeout);
hfos = respond;
}
//System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG

// send request
res = remote.GET(remotePath, requestHeader);
long contentLength = res.responseHeader.contentLength();

// reserve cache entry
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);

// send also the complete body now from the cache
// simply read the file and transfer to out socket
InputStream is = new FileInputStream(cacheFile);
byte[] buffer = new byte[2048];
int l;
while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);}
is.close();
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
}
// that's it!
} catch (SocketException e) {
// this happens if the client stops loading the file
// we do nothing here
respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString());
}
respond.flush();
return;
}
// the cache either does not exist or is (supposed to be) stale
long sizeBeforeDelete = -1;
if (cacheExists) {
// delete the cache
sizeBeforeDelete = cacheFile.length();
cacheFile.delete();
}

// take a new file from the server
httpc remote = null;
httpc.response res = null;
try {
// open the connection
if (yAddress == null) {
remote = newhttpc(host, port, timeout);
} else {
remote = newhttpc(yAddress, timeout);
}
//System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG

// send request
res = remote.GET(remotePath, requestHeader);
long contentLength = res.responseHeader.contentLength();

// reserve cache entry
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
// handle file types
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
if (transformer.isIdentityTransformer()) {
// no transformation, only passthrough
hfos = respond;
} else {
// make a scraper and transformer
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
if (transformer.isIdentityTransformer()) {
hfos = hpc.getContentOutputStream();
scraper = new htmlFilterContentScraper(url);
hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
if (((htmlFilterOutputStream) hfos).binarySuspect()) {
scraper = null; // forget it, may be rubbish
log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
}
hpc.scraper = scraper;
}
} else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = respond;
hpc.scraper = scraper;
}
// handle incoming cookies
handleIncomingCookies(res.responseHeader, host, ip);

// request has been placed and result has been returned. work off response
try {
respondHeader(respond, res.status, res.responseHeader);
String storeError;
if ((storeError = hpc.shallStoreCache()) == null) {
// we write a new cache entry
if ((contentLength > 0) && // known
(contentLength < 1048576)) { // 1 MB
// ok, we don't actually write into a file, only to RAM, and schedule writing the file.
byte[] cacheArray;
cacheArray = res.writeContent(hfos);
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();

if (sizeBeforeDelete == -1) {
// totally fresh file
hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(hpc, cacheArray);
} else if (sizeBeforeDelete == cacheArray.length) {
// before we came here we deleted a cache entry
cacheArray = null;
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
} else {
scraper = new htmlFilterContentScraper(url);
hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
if (((htmlFilterOutputStream) hfos).binarySuspect()) {
scraper = null; // forget it, may be rubbish
log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
}
hpc.scraper = scraper;
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
}
} else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = respond;
hpc.scraper = scraper;
}
// handle incoming cookies
handleIncomingCookies(res.responseHeader, host, ip);

// request has been placed and result has been returned. work off response
try {
respondHeader(respond, res.status, res.responseHeader);
String storeError;
if ((storeError = hpc.shallStoreCache()) == null) {
// we write a new cache entry
if ((contentLength > 0) && // known
(contentLength < 1048576)) // 1 MB
{
byte[] cacheArray;
if (transformer.isIdentityTransformer()) {
res.writeContentX(hfos, respond);
cacheArray = hpc.getContentBytes();
} else {
cacheArray = res.writeContent(hfos);
}
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// before we came here we deleted a cache entry
if (sizeBeforeDelete == cacheArray.length) {
cacheArray = null;
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
} else {
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
}
} else {
// the file is too big to cache it in RAM, write to file
cacheFile.getParentFile().mkdirs();
if (transformer.isIdentityTransformer()) {
res.writeContent(respond, cacheFile);
if (contentLength < 10485760) { // 10 MB
serverFileUtils.copy(cacheFile, hfos);
} // else hfos is empty and that means: no work afterwards with it
} else {
res.writeContent(hfos, cacheFile);
}
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// before we came here we deleted a cache entry
if (sizeBeforeDelete == cacheFile.length()) {
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
} else {
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc); // necessary update, write response header to cache
}
}
} else {
// no caching
log.logDebug(cacheFile.toString() + " not cached: " + storeError);
res.writeContent(hfos, null);
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
cacheManager.stackProcess(hpc);
}
} catch (SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most probably corrupted
if (cacheFile.exists()) cacheFile.delete();
}
remote.close();
} catch (Exception e) {
// this may happen if the targeted host does not exist or anything else went
// wrong with the remote server.
// in any case, sending a 404 is appropriate
try {
if ((e.toString().indexOf("unknown host")) > 0) {
respondHeader(respond,"404 unknown host", new httpHeader(null));
} else {
respondHeader(respond,"404 Not Found", new httpHeader(null));
respond.write(("Exception occurred:\r\n").getBytes());
respond.write((e.toString() + "\r\n").getBytes());
respond.write(("[TRACE: ").getBytes());
e.printStackTrace(new PrintStream(respond));
respond.write(("]\r\n").getBytes());
}
} catch (Exception ee) {}
}
}
} else {
// we take a new file from the net and respond with that
try {
// open the connection
//httpc remote = newhttpc(host, port, timeout);
httpc remote;
if (yAddress == null) {
remote = newhttpc(host, port, timeout);
} else {
remote = newhttpc(yAddress, timeout);
}
//System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG

// send request
httpc.response res = remote.GET(remotePath, requestHeader);
long contentLength = res.responseHeader.contentLength();

// reserve cache entry
hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);

// make a scraper and transformer
if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
(httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
if (transformer.isIdentityTransformer()) {
hfos = hpc.getContentOutputStream();
} else {
scraper = new htmlFilterContentScraper(url);
hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
if (((htmlFilterOutputStream) hfos).binarySuspect()) {
scraper = null; // forget it, may be rubbish
log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
// the file is too big to cache it in RAM, write to file right here
cacheFile.getParentFile().mkdirs();
res.writeContent(hfos, cacheFile);
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
if (sizeBeforeDelete == -1) {
// totally fresh file
hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheManager.stackProcess(hpc);
} else if (sizeBeforeDelete == cacheFile.length()) {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
cacheManager.stackProcess(hpc); // unnecessary update
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheManager.stackProcess(hpc); // necessary update, write response header to cache
}
hpc.scraper = scraper;
}
} else {
log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
scraper = null;
hfos = respond;
hpc.scraper = scraper;
}
// handle incoming cookies
handleIncomingCookies(res.responseHeader, host, ip);

// request has been placed and result has been returned. work off response
try {
//System.out.println("HEADER: SERVER TO PROXY = [" + res.status + "] " + ((httpHeader) res.responseHeader).toString()); // DEBUG
respondHeader(respond, res.status, res.responseHeader);
String storeError;
if ((storeError = hpc.shallStoreCache()) == null) {
// we write a new cache entry
if ((contentLength > 0) && (contentLength < 1048576)) {
// write to buffer
byte[] cacheArray;
if (transformer.isIdentityTransformer()) {
res.writeContentX(hfos, respond);
cacheArray = hpc.getContentBytes();
} else {
cacheArray = res.writeContent(hfos);
}
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// enQueue new entry with response header and file as byte[]
hpc.status = plasmaHTCache.CACHE_FILL;
cacheManager.stackProcess(hpc, cacheArray);
} else try {
// write to file system directly
cacheFile.getParentFile().mkdirs();
if (transformer.isIdentityTransformer()) {
res.writeContent(respond, cacheFile);
if (contentLength < 10485760) { // 10 MB
serverFileUtils.copy(cacheFile, hfos);
} // else hfos is empty and that means: no work afterwards with it
} else {
res.writeContent(hfos, cacheFile);
}
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// enQueue new entry with response header
hpc.status = plasmaHTCache.CACHE_FILL;
cacheManager.stackProcess(hpc);
} catch (FileNotFoundException e) {
// this may happen if there are no write rights whatsoever
// (do nothing)
/*
Exception occurred:
java.io.FileNotFoundException:
/opt/yacy_pre_v0.314_20041219/DATA/HTCACHE/www.spiegel.de/fotostrecke/0,5538,PB64-SUQ9NDYwNyZucj0z,00.html
(Permission denied)
*/
}
} else {
// no caching
//System.out.println("DEBUG: " + res.status + " " + cacheFile.toString()); // debug
log.logDebug(cacheFile.toString() + " not cached: " + storeError);
res.writeContent(hfos, null);
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
// no old file and no load. just data passing
// no caching
log.logDebug(cacheFile.toString() + " not cached: " + storeError);
res.writeContent(hfos, null);
if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
if (sizeBeforeDelete == -1) {
// no old file and no load. just data passing
hpc.status = plasmaHTCache.CACHE_PASSING;
cacheManager.stackProcess(hpc);
}
} catch (SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most probably corrupted
if (cacheFile.exists()) cacheFile.delete();
respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
}
remote.close();
} catch (Exception e) {
// this may happen if the targeted host does not exist or anything else went
// wrong with the remote server.
// in any case, sending a 404 is appropriate
try {
if ((e.toString().indexOf("unknown host")) > 0) {
respondHeader(respond,"404 unknown host", new httpHeader(null));
} else {
respondHeader(respond,"404 resource not available (generic exception: " + e.toString() + ")", new httpHeader(null));
//respond.write(("Exception occurred:\r\n").getBytes());
//respond.write((e.toString() + "\r\n").getBytes());
//respond.write(("[TRACE: ").getBytes());
//e.printStackTrace(new PrintStream(respond));
//respond.write(("]\r\n").getBytes());
/* http://www.geocrawler.com/archives/3/201/1999/8/50/2505805/
> java.net.ConnectException: Connection refused
*/
e.printStackTrace();
}
} catch (Exception ee) {}
}
}
respond.flush();
cacheManager.stackProcess(hpc);
} else {
// before we came here we deleted a cache entry
hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
cacheManager.stackProcess(hpc);
}
}
} catch (SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most probably corrupted
if (cacheFile.exists()) cacheFile.delete();
respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
}
remote.close();
} catch (Exception e) {
// this may happen if the targeted host does not exist or anything else went
// wrong with the remote server.
// in any case, sending a 404 is appropriate
try {
if ((e.toString().indexOf("unknown host")) > 0) {
respondHeader(respond,"404 unknown host", new httpHeader(null));
} else {
respondHeader(respond,"404 Not Found", new httpHeader(null));
respond.write(("Exception occurred:\r\n").getBytes());
respond.write((e.toString() + "\r\n").getBytes());
respond.write(("[TRACE: ").getBytes());
e.printStackTrace(new PrintStream(respond));
respond.write(("]\r\n").getBytes());
}
} catch (Exception ee) {}
}
respond.flush();
}