From c0807abd33e411496a7a13234e19392d9b4b18c6 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 13 Apr 2005 23:00:20 +0000
Subject: [PATCH] new crawl/proxy/cache design + fixes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@18 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../htmlFilterContentTransformer.java         |   4 +-
 source/de/anomic/http/httpc.java              |   7 +-
 source/de/anomic/http/httpdProxyHandler.java  | 595 +++++++-----------
 .../de/anomic/plasma/plasmaCrawlLoader.java   |   6 +-
 source/de/anomic/plasma/plasmaHTCache.java    |   9 -
 .../de/anomic/plasma/plasmaSwitchboard.java   |   5 +-
 source/de/anomic/server/serverByteBuffer.java |  31 +-
 yacy.blue                                     |   2 +-
 8 files changed, 273 insertions(+), 386 deletions(-)

diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
index 4e2baa5ee..8fabc2c32 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
@@ -65,7 +65,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
     }
 
     public void init(String initarg) {
-        System.out.println("Transformer init: " + initarg);
+        //System.out.println("Transformer init: " + initarg);
         if (bluelist == null) {
             // here, the initarg is used to load a list of bluelisted words
             bluelist = new Vector();
@@ -79,7 +79,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
                 r.close();
             } catch (Exception e) {
             }
-            if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
+            //if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
         }
     }
 
diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java
index 820e29c04..a81f5db14 100644
--- a/source/de/anomic/http/httpc.java
+++ b/source/de/anomic/http/httpc.java
@@ -315,10 +315,9 @@ public class httpc {
     }
 
     public byte[] writeContent(OutputStream procOS) throws IOException {
-        ByteArrayOutputStream bos = new ByteArrayOutputStream();
-        writeContentX(procOS, bos);
-        bos.flush();
-        return bos.toByteArray();
+        serverByteBuffer sbb = new serverByteBuffer();
+        writeContentX(procOS, sbb);
+        return sbb.getBytes();
     }
 
     public void writeContent(OutputStream procOS, File file) throws IOException {
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index 6efb0ec6d..60e9fcb47 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -331,384 +331,263 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
                 return;
             } catch (Exception ee) {}
         }
-
+
         // handle outgoing cookies
         handleOutgoingCookies(requestHeader, host, ip);
-
-        // set another userAgent, if not yellowlisted
-        if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) {
-            // change the User-Agent
-            requestHeader.put("User-Agent", userAgent);
-        }
-
-        // set a scraper and a htmlFilter
-        OutputStream hfos = null;
-        htmlFilterContentScraper scraper = null;
-
+
+        // set another userAgent, if not yellowlisted
+        if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) {
+            // change the User-Agent
+            requestHeader.put("User-Agent", userAgent);
+        }
+
+        // set a scraper and a htmlFilter
+        OutputStream hfos = null;
+        htmlFilterContentScraper scraper = null;
+
         // resolve yacy and yacyh domains
         String yAddress = yacyCore.seedDB.resolveYacyAddress(host);
-
-        // re-calc the url path
-        String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/'
-
+
+        // re-calc the url path
+        String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/'
+
         // attach possible yacy-sublevel-domain
         if ((yAddress != null) &&
-            ((pos = yAddress.indexOf("/")) >= 0) &&
-            (!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
-           ) remotePath = yAddress.substring(pos) + remotePath;
-
+            ((pos = yAddress.indexOf("/")) >= 0) &&
+            (!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
+           ) remotePath = yAddress.substring(pos) + remotePath;
+
         // decide wether to use a cache entry or connect to the network
-        File cacheFile = cacheManager.getCachePath(url);
-        String urlHash = plasmaCrawlLURL.urlHash(url);
-        httpHeader cachedResponseHeader = null;
-        boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
-                               ((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
-
-        // why are files unzipped upon arrival? why not zip all files in cache?
-        // This follows from the following premises
-        // (a) no file shall be unzip-ed more than once to prevent unnessesary computing time
-        // (b) old cache entries shall be comparable with refill-entries to detect/distiguish case 3+4
-        // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later
-        // case b and c contradicts, if we use a scheduler, because files in a stale cache would be unzipped
-        // and the newly arrival would be zipped and would have to be unzipped upon load. But then the
-        // scheduler is superfluous. Therefore the only reminding case is
-        // (d) cached files shall be either all zipped or unzipped
-        // case d contradicts with a, because files need to be unzipped for indexing. Therefore
-        // the only remaining case is to unzip files right upon load. Thats what we do here.
-
-        // finally use existing cache if appropriate
-        // here we must decide weather or not to save the data
-        // to a cache
-        // we distinguish four CACHE STATE cases:
-        // 1. cache fill
-        // 2. cache fresh - no refill
-        // 3. cache stale - refill - necessary
-        // 4. cache stale - refill - superfluous
-        // in two of these cases we trigger a scheduler to handle newly arrived files:
-        // case 1 and case 3
-        plasmaHTCache.Entry hpc;
-        if (cacheExists) {
-            // we respond on the request by using the cache
-
-            hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile);
-
-            if (hpc.shallUseCache()) {
-                // the cache is fresh
-
-                try {
-                    // replace date field in old header by actual date, this is according to RFC
-                    cachedResponseHeader.put("Date", httpc.dateString(httpc.nowDate()));
-
-                    // maybe the content length is missing
-                    if (!(cachedResponseHeader.containsKey("CONTENT-LENGTH")))
-                        cachedResponseHeader.put("CONTENT-LENGTH", (String) ("" + cacheFile.length()));
-
-                    // check if we can send a 304 instead the complete content
-                    if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
-                        // conditional request: freshness of cache for that condition was already
-                        // checked within shallUseCache(). Now send only a 304 response
-                        log.logInfo("CACHE HIT/304 " + cacheFile.toString());
-
-                        // send cached header with replaced date and added length
-                        respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
-
-                    } else {
-                        // unconditional request: send content of cache
-                        log.logInfo("CACHE HIT/203 " + cacheFile.toString());
-
-                        // send cached header with replaced date and added length
-                        respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
-
-                        // make a transformer
-                        if ((!(transformer.isIdentityTransformer())) &&
-                            ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
-                            ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
-                            hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
-                        } else {
-                            hfos = respond;
-                        }
-
-                        // send also the complete body now from the cache
-                        // simply read the file and transfer to out socket
-                        InputStream is = new FileInputStream(cacheFile);
-                        byte[] buffer = new byte[2048];
-                        int l;
-                        while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);}
-                        is.close();
-                        if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                    }
-                    // that's it!
-                } catch (SocketException e) {
-                    // this happens if the client stops loading the file
-                    // we do nothing here
-                    respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString());
-                }
-            } else {
-                // the cache is (supposed to be) stale
-
-                // delete the cache
-                long sizeBeforeDelete = cacheFile.length();
-                cacheFile.delete();
-
-                // take a new file from the server
-                httpc remote = null;
-                httpc.response res = null;
-
-                try {
-                    // open the connection
-                    if (yAddress == null) {
-                        remote = newhttpc(host, port, timeout);
+        File cacheFile = cacheManager.getCachePath(url);
+        String urlHash = plasmaCrawlLURL.urlHash(url);
+        httpHeader cachedResponseHeader = null;
+        boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
+                               ((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
+
+        // why are files unzipped upon arrival? why not zip all files in cache?
+        // This follows from the following premises
+        // (a) no file shall be unzip-ed more than once to prevent unnecessary computing time
+        // (b) old cache entries shall be comparable with refill-entries to detect/distinguish case 3+4
+        // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later
+        // cases b and c contradict if we use a scheduler, because files in a stale cache would be unzipped
+        // and the new arrival would be zipped and would have to be unzipped upon load. But then the
+        // scheduler is superfluous. Therefore the only remaining case is
+        // (d) cached files shall be either all zipped or unzipped
+        // case d contradicts a, because files need to be unzipped for indexing. Therefore
+        // the only remaining case is to unzip files right upon load. That's what we do here.
+
+        // finally use existing cache if appropriate
+        // here we must decide whether or not to save the data
+        // to a cache
+        // we distinguish four CACHE STATE cases:
+        // 1. cache fill
+        // 2. cache fresh - no refill
+        // 3. cache stale - refill - necessary
+        // 4. cache stale - refill - superfluous
+        // in two of these cases we trigger a scheduler to handle newly arrived files:
+        // case 1 and case 3
+        plasmaHTCache.Entry hpc;
+        if ((cacheExists) &&
+            ((hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
+                                          cachedResponseHeader, null,
+                                          switchboard.defaultProxyProfile)).shallUseCache())) {
+            // we respond on the request by using the cache, the cache is fresh
+
+            try {
+                // replace date field in old header by actual date, this is according to RFC
+                cachedResponseHeader.put("Date", httpc.dateString(httpc.nowDate()));
+
+                // maybe the content length is missing
+                if (!(cachedResponseHeader.containsKey("CONTENT-LENGTH")))
+                    cachedResponseHeader.put("CONTENT-LENGTH", (String) ("" + cacheFile.length()));
+
+                // check if we can send a 304 instead of the complete content
+                if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
+                    // conditional request: freshness of cache for that condition was already
+                    // checked within shallUseCache(). Now send only a 304 response
+                    log.logInfo("CACHE HIT/304 " + cacheFile.toString());
+
+                    // send cached header with replaced date and added length
+                    respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
+
+                } else {
+                    // unconditional request: send content of cache
+                    log.logInfo("CACHE HIT/203 " + cacheFile.toString());
+
+                    // send cached header with replaced date and added length
+                    respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
+
+                    // make a transformer
+                    if ((!(transformer.isIdentityTransformer())) &&
+                        ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
+                        ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
+                        hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
                     } else {
-                        remote = newhttpc(yAddress, timeout);
+                        hfos = respond;
                     }
                     //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
-
-                    // send request
-                    res = remote.GET(remotePath, requestHeader);
-                    long contentLength = res.responseHeader.contentLength();
-
-                    // reserver cache entry
-                    hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
+
+                    // send also the complete body now from the cache
+                    // simply read the file and transfer to out socket
+                    InputStream is = new FileInputStream(cacheFile);
+                    byte[] buffer = new byte[2048];
+                    int l;
+                    while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);}
+                    is.close();
+                    if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+                }
+                // that's it!
+            } catch (SocketException e) {
+                // this happens if the client stops loading the file
+                // we do nothing here
+                respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString());
+            }
+            respond.flush();
+            return;
+        }
+
+        // the cache either does not exist or is (supposed to be) stale
+        long sizeBeforeDelete = -1;
+        if (cacheExists) {
+            // delete the cache
+            sizeBeforeDelete = cacheFile.length();
+            cacheFile.delete();
+        }
+
+        // take a new file from the server
+        httpc remote = null;
+        httpc.response res = null;
+
+        try {
+            // open the connection
+            if (yAddress == null) {
+                remote = newhttpc(host, port, timeout);
+            } else {
+                remote = newhttpc(yAddress, timeout);
+            }
+            //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
+
+            // send request
+            res = remote.GET(remotePath, requestHeader);
+            long contentLength = res.responseHeader.contentLength();
+
+            // reserve cache entry
+            hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
+
+            // handle file types
+            if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
+                (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
+                if (transformer.isIdentityTransformer()) {
+                    // no transformation, only passthrough
+                    hfos = respond;
+                } else {
                     // make a scraper and transformer
-                    if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
-                        (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
-                        if (transformer.isIdentityTransformer()) {
-                            hfos = hpc.getContentOutputStream();
+                    scraper = new htmlFilterContentScraper(url);
+                    hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
+                    if (((htmlFilterOutputStream) hfos).binarySuspect()) {
+                        scraper = null; // forget it, may be rubbish
+                        log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+                    }
+                    hpc.scraper = scraper;
+                }
+            } else {
+                log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
+                scraper = null;
+                hfos = respond;
+                hpc.scraper = scraper;
+            }
+
+            // handle incoming cookies
+            handleIncomingCookies(res.responseHeader, host, ip);
+
+            // request has been placed and result has been returned. work off response
+            try {
+                respondHeader(respond, res.status, res.responseHeader);
+                String storeError;
+                if ((storeError = hpc.shallStoreCache()) == null) {
+                    // we write a new cache entry
+                    if ((contentLength > 0) && // known
+                        (contentLength < 1048576)) {// 1 MB
+                        // ok, we don't write actually into a file, only to RAM, and schedule writing the file.
+                        byte[] cacheArray;
+                        cacheArray = res.writeContent(hfos);
+                        if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+
+                        if (sizeBeforeDelete == -1) {
+                            // totally fresh file
+                            hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
+                            cacheManager.stackProcess(hpc, cacheArray);
+                        } else if (sizeBeforeDelete == cacheArray.length) {
+                            // before we came here we deleted a cache entry
+                            cacheArray = null;
+                            hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
+                            cacheManager.stackProcess(hpc); // unnecessary update
                         } else {
-                            scraper = new htmlFilterContentScraper(url);
-                            hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
-                            if (((htmlFilterOutputStream) hfos).binarySuspect()) {
-                                scraper = null; // forget it, may be rubbish
-                                log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
-                            }
-                            hpc.scraper = scraper;
                         }
-                    } else {
-                        log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
-                        scraper = null;
-                        hfos = respond;
-                        hpc.scraper = scraper;
-                    }
-
-                    // handle incoming cookies
-                    handleIncomingCookies(res.responseHeader, host, ip);
-
-                    // request has been placed and result has been returned. work off response
-                    try {
-                        respondHeader(respond, res.status, res.responseHeader);
-                        String storeError;
-                        if ((storeError = hpc.shallStoreCache()) == null) {
-                            // we write a new cache entry
-                            if ((contentLength > 0) && // known
-                                (contentLength < 1048576)) // 1 MB
-                            {
-                                byte[] cacheArray;
-                                if (transformer.isIdentityTransformer()) {
-                                    res.writeContentX(hfos, respond);
-                                    cacheArray = hpc.getContentBytes();
-                                } else {
-                                    cacheArray = res.writeContent(hfos);
-                                }
-                                if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                                // before we came here we deleted a cache entry
-                                if (sizeBeforeDelete == cacheArray.length) {
-                                    cacheArray = null;
-                                    hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
-                                    cacheManager.stackProcess(hpc); // unnecessary update
-                                } else {
-                                    hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
-                                    cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
-                                }
-                            } else {
-                                // the file is too big to cache it in the ram, write to file
-                                cacheFile.getParentFile().mkdirs();
-                                if (transformer.isIdentityTransformer()) {
-                                    res.writeContent(respond, cacheFile);
-                                    if (contentLength < 10485760) { // 10 mb
-                                        serverFileUtils.copy(cacheFile, hfos);
-                                    } // else hfos is empty and that means: no work afterwards with it
-                                } else {
-                                    res.writeContent(hfos, cacheFile);
-                                }
-                                if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                                // before we came here we deleted a cache entry
-                                if (sizeBeforeDelete == cacheFile.length()) {
-                                    hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
-                                    cacheManager.stackProcess(hpc); // unnecessary update
-                                } else {
-                                    hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
-                                    cacheManager.stackProcess(hpc); // necessary update, write response header to cache
+                    } else {
+                        // the file is too big to cache it in RAM, write to file right here
+                        cacheFile.getParentFile().mkdirs();
+                        res.writeContent(hfos, cacheFile);
+                        if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+                        if (sizeBeforeDelete == -1) {
+                            // totally fresh file
+                            hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
+                            cacheManager.stackProcess(hpc);
+                        } else if (sizeBeforeDelete == cacheFile.length()) {
+                            // before we came here we deleted a cache entry
+                            hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
+                            cacheManager.stackProcess(hpc); // unnecessary update
+                        } else {
+                            // before we came here we deleted a cache entry
+                            hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
+                            cacheManager.stackProcess(hpc); // necessary update, write response header to cache
                         }
                     }
                 } else {
-                            // no caching
-                            log.logDebug(cacheFile.toString() + " not cached: " + storeError);
-                            res.writeContent(hfos, null);
-                            if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                            // before we came here we deleted a cache entry
-                            hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
-                            cacheManager.stackProcess(hpc);
-                        }
-                    } catch (SocketException e) {
-                        // this may happen if the client suddenly closes its connection
-                        // maybe the user has stopped loading
-                        // in that case, we are not responsible and just forget it
-                        // but we clean the cache also, since it may be only partial
-                        // and most possible corrupted
-                        if (cacheFile.exists()) cacheFile.delete();
-                    }
-                    remote.close();
-                } catch (Exception e) {
-                    // this may happen if the targeted host does not exist or anything with the
-                    // remote server was wrong.
-                    // in any case, sending a 404 is appropriate
-                    try {
-                        if ((e.toString().indexOf("unknown host")) > 0) {
-                            respondHeader(respond,"404 unknown host", new httpHeader(null));
-                        } else {
-                            respondHeader(respond,"404 Not Found", new httpHeader(null));
-                            respond.write(("Exception occurred:\r\n").getBytes());
-                            respond.write((e.toString() + "\r\n").getBytes());
-                            respond.write(("[TRACE: ").getBytes());
-                            e.printStackTrace(new PrintStream(respond));
-                            respond.write(("]\r\n").getBytes());
-                        }
-                    } catch (Exception ee) {}
-                }
-            }
-        } else {
-            // we take a new file from the net and respond with that
-            try {
-                // open the connection
-                //httpc remote = newhttpc(host, port, timeout);
-                httpc remote;
-                if (yAddress == null) {
-                    remote = newhttpc(host, port, timeout);
-                } else {
-                    remote = newhttpc(yAddress, timeout);
-                }
-                //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
-
-                // send request
-                httpc.response res = remote.GET(remotePath, requestHeader);
-                long contentLength = res.responseHeader.contentLength();
-
-                // reserve cache entry
-                hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
-
-                // make a scraper and transformer
-                if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
-                    (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
-                    if (transformer.isIdentityTransformer()) {
-                        hfos = hpc.getContentOutputStream();
                     } else {
-                        scraper = new htmlFilterContentScraper(url);
-                        hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
-                        if (((htmlFilterOutputStream) hfos).binarySuspect()) {
-                            scraper = null; // forget it, may be rubbish
-                            log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
-                        }
-                        hpc.scraper = scraper;
-                    }
-                } else {
-                    log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
-                    scraper = null;
-                    hfos = respond;
-                    hpc.scraper = scraper;
-                }
-
-                // handle incoming cookies
-                handleIncomingCookies(res.responseHeader, host, ip);
-
-                // request has been placed and result has been returned. work off response
-                try {
-                    //System.out.println("HEADER: SERVER TO PROXY = [" + res.status + "] " + ((httpHeader) res.responseHeader).toString()); // DEBUG
-                    respondHeader(respond, res.status, res.responseHeader);
-                    String storeError;
-                    if ((storeError = hpc.shallStoreCache()) == null) {
-                        // we write a new cache entry
-                        if ((contentLength > 0) && (contentLength < 1048576)) {
-                            // write to buffer
-                            byte[] cacheArray;
-                            if (transformer.isIdentityTransformer()) {
-                                res.writeContentX(hfos, respond);
-                                cacheArray = hpc.getContentBytes();
-                            } else {
-                                cacheArray = res.writeContent(hfos);
-                            }
-                            if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                            // enQueue new entry with response header and file as byte[]
-                            hpc.status = plasmaHTCache.CACHE_FILL;
-                            cacheManager.stackProcess(hpc, cacheArray);
-                        } else try {
-                            // write to file system directly
-                            cacheFile.getParentFile().mkdirs();
-                            if (transformer.isIdentityTransformer()) {
-                                res.writeContent(respond, cacheFile);
-                                if (contentLength < 10485760) { // 10 mb
-                                    serverFileUtils.copy(cacheFile, hfos);
-                                } // else hfos is empty and that means: no work afterwards with it
-                            } else {
-                                res.writeContent(hfos, cacheFile);
-                            }
-                            if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                            // enQueue new entry with response header
-                            hpc.status = plasmaHTCache.CACHE_FILL;
-                            cacheManager.stackProcess(hpc);
-                        } catch (FileNotFoundException e) {
-                            // this may happen if there are no write rights whatsoever
-                            // (do nothing)
-                            /*
-                            Exception occurred:
-                            java.io.FileNotFoundException:
-                            /opt/yacy_pre_v0.314_20041219/DATA/HTCACHE/www.spiegel.de/fotostrecke/0,5538,PB64-SUQ9NDYwNyZucj0z,00.html
-                            (Permission denied)
-                            */
-                        }
-                    } else {
-                        // no caching
-                        //System.out.println("DEBUG: " + res.status + " " + cacheFile.toString()); // debug
-                        log.logDebug(cacheFile.toString() + " not cached: " + storeError);
-                        res.writeContent(hfos, null);
-                        if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                        // no old file and no load. just data passing
+                    // no caching
+                    log.logDebug(cacheFile.toString() + " not cached: " + storeError);
+                    res.writeContent(hfos, null);
+                    if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+                    if (sizeBeforeDelete == -1) {
+                        // no old file and no load. just data passing
                         hpc.status = plasmaHTCache.CACHE_PASSING;
-                        cacheManager.stackProcess(hpc);
-                    }
-                } catch (SocketException e) {
-                    // this may happen if the client suddenly closes its connection
-                    // maybe the user has stopped loading
-                    // in that case, we are not responsible and just forget it
-                    // but we clean the cache also, since it may be only partial
-                    // and most possible corrupted
-                    if (cacheFile.exists()) cacheFile.delete();
-                    respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
-                }
-                remote.close();
-            } catch (Exception e) {
-                // this may happen if the targeted host does not exist or anything with the
-                // remote server was wrong.
-                // in any case, sending a 404 is appropriate
-                try {
-                    if ((e.toString().indexOf("unknown host")) > 0) {
-                        respondHeader(respond,"404 unknown host", new httpHeader(null));
-                    } else {
-                        respondHeader(respond,"404 resource not available (generic exception: " + e.toString() + ")", new httpHeader(null));
-                        //respond.write(("Exception occurred:\r\n").getBytes());
-                        //respond.write((e.toString() + "\r\n").getBytes());
-                        //respond.write(("[TRACE: ").getBytes());
-                        //e.printStackTrace(new PrintStream(respond));
-                        //respond.write(("]\r\n").getBytes());
-                        /* http://www.geocrawler.com/archives/3/201/1999/8/50/2505805/
-                           > java.net.ConnectException: Connection refused
-                        */
-                        e.printStackTrace();
-                    }
-                } catch (Exception ee) {}
-            }
-        }
-        respond.flush();
+                        cacheManager.stackProcess(hpc);
+                    } else {
+                        // before we came here we deleted a cache entry
+                        hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
+                        cacheManager.stackProcess(hpc);
+                    }
+                }
+            } catch (SocketException e) {
+                // this may happen if the client suddenly closes its connection
+                // maybe the user has stopped loading
+                // in that case, we are not responsible and just forget it
+                // but we clean the cache also, since it may be only partial
+                // and most probably corrupted
+                if (cacheFile.exists()) cacheFile.delete();
+                respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
+            }
+            remote.close();
+        } catch (Exception e) {
+            // this may happen if the targeted host does not exist or
+            // something with the remote server went wrong.
+            // in any case, sending a 404 is appropriate
+            try {
+                if ((e.toString().indexOf("unknown host")) > 0) {
+                    respondHeader(respond,"404 unknown host", new httpHeader(null));
+                } else {
+                    respondHeader(respond,"404 Not Found", new httpHeader(null));
+                    respond.write(("Exception occurred:\r\n").getBytes());
+                    respond.write((e.toString() + "\r\n").getBytes());
+                    respond.write(("[TRACE: ").getBytes());
+                    e.printStackTrace(new PrintStream(respond));
+                    respond.write(("]\r\n").getBytes());
+                }
+            } catch (Exception ee) {}
+        }
+        respond.flush();
     }
diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java
index bc1821b22..5fbbc4ceb 100644
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@@ -212,12 +212,14 @@ public class plasmaCrawlLoader {
             } else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) {
                 // we write the new cache entry to file system directly
                 cacheFile.getParentFile().mkdirs();
-                res.writeContent(htCache.getContentOutputStream(), cacheFile); // writes in content scraper and cache file
+                FileOutputStream fos = new FileOutputStream(cacheFile);
+                htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
+                fos.close();
                 htCache.status = plasmaHTCache.CACHE_FILL;
             } else {
                 if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
                 // anyway, the content still lives in the content scraper
-                res.writeContent(htCache.getContentOutputStream(), null); // writes only into content scraper
+                htCache.cacheArray = res.writeContent(null); // writes only into cacheArray
                 htCache.status = plasmaHTCache.CACHE_PASSING;
             }
             // enQueue new entry with response header
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index f5d5b9ebe..e358acdaf 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -452,7 +452,6 @@ public class plasmaHTCache {
         public String language;
         public plasmaCrawlProfile.entry profile;
         private String initiator;
-        public ByteArrayOutputStream content;
         public htmlFilterContentScraper scraper;
 
@@ -479,7 +478,6 @@ public class plasmaHTCache {
             this.requestHeader = requestHeader;
             this.responseStatus = responseStatus;
             this.responseHeader = responseHeader;
-            this.content = new ByteArrayOutputStream();
             this.profile = profile;
             this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
 
@@ -507,13 +505,6 @@ public class plasmaHTCache {
             this.scraper = null;
         }
 
-        public OutputStream getContentOutputStream() {
-            return (OutputStream) content;
-        }
-        public byte[] getContentBytes() {
-            try { content.flush(); } catch (IOException e) {}
-            return content.toByteArray();
-        }
         public String initiator() {
             return initiator;
         }
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 68aa6c746..934ad3bec 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -469,8 +469,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
     private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
         // work off one stack entry with a fresh resource (scraped web page)
-        byte[] content;
-        if (((content = entry.getContentBytes()).length > 0) || (entry.scraper != null)) try {
+        if ((entry.cacheArray != null) || (entry.scraper != null)) try {
             // we must distinguish the following cases: resource-load was initiated by
             // 1) global crawling: the index is extern, not here (not possible here)
             // 2) result of search queries, some indexes are here (not possible here)
@@ -502,7 +501,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
                 document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
             } else {
                 log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now");
-                document = parser.parseSource(entry.url, entry.responseHeader.mime(), content);
+                document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
             }
             // put anchors on crawl stack
diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java
index 047026ff2..cc5b8b9cc 100644
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@@ -43,7 +43,7 @@ package de.anomic.server;
 import java.io.*;
 import java.util.*;
 
-public class serverByteBuffer {
+public class serverByteBuffer extends OutputStream {
 
     public static final byte singlequote = (byte) 39;
     public static final byte doublequote = (byte) 34;
@@ -119,20 +119,37 @@ public class serverByteBuffer {
         offset = 0;
     }
 
-    public serverByteBuffer append(byte b) {
-        if (offset + length + 1 > buffer.length) grow();
+    public void write(int b) {
+        write((byte) (b & 0xff));
+    }
+
+    public void write(byte b) {
+        if (offset + length + 1 > buffer.length) grow();
         buffer[offset + length++] = b;
+    }
+
+    public void write(byte[] bb) {
+        write(bb, 0, bb.length);
+    }
+
+    public void write(byte[] bb, int of, int le) {
+        while (offset + length + le > buffer.length) grow();
+        System.arraycopy(bb, of, buffer, offset + length, le);
+        length += le;
+    }
+
+    public serverByteBuffer append(byte b) {
+        write(b);
         return this;
     }
 
     public serverByteBuffer append(byte[] bb) {
-        return append(bb, 0, bb.length);
+        write(bb);
+        return this;
     }
 
     public serverByteBuffer append(byte[] bb, int of, int le) {
-        while (offset + length + le > buffer.length) grow();
-        System.arraycopy(bb, of, buffer, offset + length, le);
-        length += le;
+        write(bb, of, le);
         return this;
     }
 
diff --git a/yacy.blue b/yacy.blue
index d2833eb81..054c67159 100644
--- a/yacy.blue
+++ b/yacy.blue
@@ -1 +1 @@
-testblue
+testblue
\ No newline at end of file
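
A minimal usage sketch of the serverByteBuffer change above (illustrative only, not
part of the patch; the demo class and its emit() helper are hypothetical, while
serverByteBuffer, its new write() overrides and getBytes() are taken from the diff):

    // Sketch: serverByteBuffer now extends OutputStream, so it can be handed
    // to any code that writes to a generic OutputStream -- which is what the
    // new httpc.writeContent()/writeContentX(procOS, sbb) path relies on.
    import java.io.IOException;
    import java.io.OutputStream;

    import de.anomic.server.serverByteBuffer;

    public class serverByteBufferDemo {

        // hypothetical helper: writes through the OutputStream interface only
        private static void emit(OutputStream os) throws IOException {
            os.write("proxy/cache".getBytes()); // new write(byte[]) override
            os.write('\n');                     // new write(int) override
        }

        public static void main(String[] args) throws IOException {
            serverByteBuffer sbb = new serverByteBuffer();
            emit(sbb); // accepted wherever an OutputStream is expected
            byte[] content = sbb.getBytes(); // same retrieval as in httpc.writeContent()
            System.out.print(new String(content));
        }
    }

The httpc.java hunk swaps ByteArrayOutputStream for this buffer; presumably this
avoids the intermediate flush()/toByteArray() step, though the commit message does
not state the motivation.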