diff --git a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
index 4e2baa5ee..8fabc2c32 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentTransformer.java
@@ -65,7 +65,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
}
public void init(String initarg) {
- System.out.println("Transformer init: " + initarg);
+ //System.out.println("Transformer init: " + initarg);
if (bluelist == null) {
// here, the initarg is used to load a list of bluelisted words
bluelist = new Vector();
@@ -79,7 +79,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
r.close();
} catch (Exception e) {
}
- if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
+ //if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
}
}
diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java
index 820e29c04..a81f5db14 100644
--- a/source/de/anomic/http/httpc.java
+++ b/source/de/anomic/http/httpc.java
@@ -315,10 +315,9 @@ public class httpc {
}
public byte[] writeContent(OutputStream procOS) throws IOException {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- writeContentX(procOS, bos);
- bos.flush();
- return bos.toByteArray();
+ serverByteBuffer sbb = new serverByteBuffer();
+ writeContentX(procOS, sbb);
+ return sbb.getBytes();
}
public void writeContent(OutputStream procOS, File file) throws IOException {
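Annotation: the rewrite above funnels the response body through a serverByteBuffer instead of a ByteArrayOutputStream; this compiles only because serverByteBuffer becomes an OutputStream later in this patch, so writeContentX can tee into it like into any other stream. A minimal sketch of that tee pattern, under the assumption that writeContentX reads the body once and feeds both streams (copyStream is a hypothetical stand-in, not the real method):

    import java.io.*;

    class StreamTee {
        // Hypothetical stand-in for httpc.writeContentX: read the body once and
        // feed the pass-through consumer and the capture buffer in the same loop.
        static void copyStream(InputStream in, OutputStream procOS, OutputStream capture)
                throws IOException {
            byte[] buf = new byte[2048];
            int n;
            while ((n = in.read(buf)) > 0) {
                if (procOS != null) procOS.write(buf, 0, n); // optional consumer
                capture.write(buf, 0, n);                    // copy kept for the cache
            }
        }
    }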
diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java
index 6efb0ec6d..60e9fcb47 100644
--- a/source/de/anomic/http/httpdProxyHandler.java
+++ b/source/de/anomic/http/httpdProxyHandler.java
@@ -331,384 +331,263 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHand
return;
} catch (Exception ee) {}
}
-
+
// handle outgoing cookies
handleOutgoingCookies(requestHeader, host, ip);
- // set another userAgent, if not yellowlisted
- if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) {
- // change the User-Agent
- requestHeader.put("User-Agent", userAgent);
- }
-
- // set a scraper and a htmlFilter
- OutputStream hfos = null;
- htmlFilterContentScraper scraper = null;
-
+ // set another userAgent, if not yellowlisted
+ if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) {
+ // change the User-Agent
+ requestHeader.put("User-Agent", userAgent);
+ }
+
+ // set a scraper and a htmlFilter
+ OutputStream hfos = null;
+ htmlFilterContentScraper scraper = null;
+
// resolve yacy and yacyh domains
String yAddress = yacyCore.seedDB.resolveYacyAddress(host);
- // re-calc the url path
- String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/'
-
+ // re-calc the url path
+ String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/'
+
// attach possible yacy-sublevel-domain
if ((yAddress != null) &&
- ((pos = yAddress.indexOf("/")) >= 0) &&
- (!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
- ) remotePath = yAddress.substring(pos) + remotePath;
-
+ ((pos = yAddress.indexOf("/")) >= 0) &&
+ (!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
+ ) remotePath = yAddress.substring(pos) + remotePath;
+
        // decide whether to use a cache entry or connect to the network
- File cacheFile = cacheManager.getCachePath(url);
- String urlHash = plasmaCrawlLURL.urlHash(url);
- httpHeader cachedResponseHeader = null;
- boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
- ((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
-
- // why are files unzipped upon arrival? why not zip all files in cache?
- // This follows from the following premises
- // (a) no file shall be unzip-ed more than once to prevent unnessesary computing time
- // (b) old cache entries shall be comparable with refill-entries to detect/distiguish case 3+4
- // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later
- // case b and c contradicts, if we use a scheduler, because files in a stale cache would be unzipped
- // and the newly arrival would be zipped and would have to be unzipped upon load. But then the
- // scheduler is superfluous. Therefore the only reminding case is
- // (d) cached files shall be either all zipped or unzipped
- // case d contradicts with a, because files need to be unzipped for indexing. Therefore
- // the only remaining case is to unzip files right upon load. Thats what we do here.
-
- // finally use existing cache if appropriate
- // here we must decide weather or not to save the data
- // to a cache
- // we distinguish four CACHE STATE cases:
- // 1. cache fill
- // 2. cache fresh - no refill
- // 3. cache stale - refill - necessary
- // 4. cache stale - refill - superfluous
- // in two of these cases we trigger a scheduler to handle newly arrived files:
- // case 1 and case 3
- plasmaHTCache.Entry hpc;
- if (cacheExists) {
- // we respond on the request by using the cache
-
- hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile);
-
- if (hpc.shallUseCache()) {
- // the cache is fresh
-
- try {
- // replace date field in old header by actual date, this is according to RFC
- cachedResponseHeader.put("Date", httpc.dateString(httpc.nowDate()));
-
- // maybe the content length is missing
- if (!(cachedResponseHeader.containsKey("CONTENT-LENGTH")))
- cachedResponseHeader.put("CONTENT-LENGTH", (String) ("" + cacheFile.length()));
-
- // check if we can send a 304 instead the complete content
- if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
- // conditional request: freshness of cache for that condition was already
- // checked within shallUseCache(). Now send only a 304 response
- log.logInfo("CACHE HIT/304 " + cacheFile.toString());
-
- // send cached header with replaced date and added length
- respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
-
- } else {
- // unconditional request: send content of cache
- log.logInfo("CACHE HIT/203 " + cacheFile.toString());
-
- // send cached header with replaced date and added length
- respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
-
- // make a transformer
- if ((!(transformer.isIdentityTransformer())) &&
- ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
- ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
- hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
- } else {
- hfos = respond;
- }
-
- // send also the complete body now from the cache
- // simply read the file and transfer to out socket
- InputStream is = new FileInputStream(cacheFile);
- byte[] buffer = new byte[2048];
- int l;
- while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);}
- is.close();
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
- }
- // that's it!
- } catch (SocketException e) {
- // this happens if the client stops loading the file
- // we do nothing here
- respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString());
- }
- } else {
- // the cache is (supposed to be) stale
-
- // delete the cache
- long sizeBeforeDelete = cacheFile.length();
- cacheFile.delete();
-
- // take a new file from the server
- httpc remote = null;
- httpc.response res = null;
-
- try {
- // open the connection
- if (yAddress == null) {
- remote = newhttpc(host, port, timeout);
+ File cacheFile = cacheManager.getCachePath(url);
+ String urlHash = plasmaCrawlLURL.urlHash(url);
+ httpHeader cachedResponseHeader = null;
+ boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
+ ((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
+
+        // why are files unzipped upon arrival? why not zip all files in cache?
+        // This follows from the following premises:
+        // (a) no file shall be unzipped more than once, to avoid unnecessary computing time
+        // (b) old cache entries shall be comparable with refill entries to detect/distinguish cases 3+4
+        // (c) the indexing mechanism needs files unzipped; a scheduler could do that later
+        // cases b and c contradict if we use a scheduler, because files in a stale cache would be unzipped
+        // and the newly arrived file would be zipped and would have to be unzipped upon load. But then the
+        // scheduler is superfluous. Therefore the only remaining case is
+        // (d) cached files shall be either all zipped or all unzipped
+        // case d contradicts case a, because files need to be unzipped for indexing. Therefore
+        // the only remaining case is to unzip files right upon load. That's what we do here.
+
+ // finally use existing cache if appropriate
+        // here we must decide whether or not to save the data
+ // to a cache
+ // we distinguish four CACHE STATE cases:
+ // 1. cache fill
+ // 2. cache fresh - no refill
+ // 3. cache stale - refill - necessary
+ // 4. cache stale - refill - superfluous
+ // in two of these cases we trigger a scheduler to handle newly arrived files:
+ // case 1 and case 3
+ plasmaHTCache.Entry hpc;
+ if ((cacheExists) &&
+ ((hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
+ cachedResponseHeader, null,
+ switchboard.defaultProxyProfile)).shallUseCache())) {
+            // we respond to the request by using the cache; the cache is fresh
+
+ try {
+                // replace the date field in the old header with the current date, as required by the RFC
+ cachedResponseHeader.put("Date", httpc.dateString(httpc.nowDate()));
+
+ // maybe the content length is missing
+ if (!(cachedResponseHeader.containsKey("CONTENT-LENGTH")))
+ cachedResponseHeader.put("CONTENT-LENGTH", (String) ("" + cacheFile.length()));
+
+                // check if we can send a 304 instead of the complete content
+ if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
+ // conditional request: freshness of cache for that condition was already
+ // checked within shallUseCache(). Now send only a 304 response
+ log.logInfo("CACHE HIT/304 " + cacheFile.toString());
+
+ // send cached header with replaced date and added length
+ respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
+
+ } else {
+ // unconditional request: send content of cache
+ log.logInfo("CACHE HIT/203 " + cacheFile.toString());
+
+ // send cached header with replaced date and added length
+ respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
+
+ // make a transformer
+ if ((!(transformer.isIdentityTransformer())) &&
+ ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
+ ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
+ hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
} else {
- remote = newhttpc(yAddress, timeout);
+ hfos = respond;
}
- //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
-
- // send request
- res = remote.GET(remotePath, requestHeader);
- long contentLength = res.responseHeader.contentLength();
-
- // reserver cache entry
- hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
+                    // also send the complete body now from the cache:
+                    // simply read the file and transfer it to the client socket
+ InputStream is = new FileInputStream(cacheFile);
+ byte[] buffer = new byte[2048];
+ int l;
+ while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);}
+ is.close();
+ if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+ }
+ // that's it!
+ } catch (SocketException e) {
+ // this happens if the client stops loading the file
+ // we do nothing here
+ respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString());
+ }
+ respond.flush();
+ return;
+ }
+
+        // the cache either does not exist or is (supposed to be) stale
+ long sizeBeforeDelete = -1;
+ if (cacheExists) {
+ // delete the cache
+ sizeBeforeDelete = cacheFile.length();
+ cacheFile.delete();
+ }
+
+ // take a new file from the server
+ httpc remote = null;
+ httpc.response res = null;
+
+ try {
+ // open the connection
+ if (yAddress == null) {
+ remote = newhttpc(host, port, timeout);
+ } else {
+ remote = newhttpc(yAddress, timeout);
+ }
+ //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
+
+ // send request
+ res = remote.GET(remotePath, requestHeader);
+ long contentLength = res.responseHeader.contentLength();
+
+            // reserve cache entry
+ hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
+
+ // handle file types
+ if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
+ (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
+ if (transformer.isIdentityTransformer()) {
+ // no transformation, only passthrough
+ hfos = respond;
+ } else {
// make a scraper and transformer
- if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
- (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
- if (transformer.isIdentityTransformer()) {
- hfos = hpc.getContentOutputStream();
+ scraper = new htmlFilterContentScraper(url);
+ hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
+ if (((htmlFilterOutputStream) hfos).binarySuspect()) {
+ scraper = null; // forget it, may be rubbish
+ log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+ }
+ hpc.scraper = scraper;
+ }
+ } else {
+ log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
+ scraper = null;
+ hfos = respond;
+ hpc.scraper = scraper;
+ }
+
+ // handle incoming cookies
+ handleIncomingCookies(res.responseHeader, host, ip);
+
+ // request has been placed and result has been returned. work off response
+ try {
+ respondHeader(respond, res.status, res.responseHeader);
+ String storeError;
+ if ((storeError = hpc.shallStoreCache()) == null) {
+ // we write a new cache entry
+ if ((contentLength > 0) && // known
+ (contentLength < 1048576)) {// 1 MB
+                    // ok, we don't actually write into a file yet, only to RAM, and schedule writing the file later.
+ byte[] cacheArray;
+ cacheArray = res.writeContent(hfos);
+ if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+
+ if (sizeBeforeDelete == -1) {
+ // totally fresh file
+ hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
+ cacheManager.stackProcess(hpc, cacheArray);
+ } else if (sizeBeforeDelete == cacheArray.length) {
+ // before we came here we deleted a cache entry
+ cacheArray = null;
+ hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
+ cacheManager.stackProcess(hpc); // unnecessary update
} else {
- scraper = new htmlFilterContentScraper(url);
- hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
- if (((htmlFilterOutputStream) hfos).binarySuspect()) {
- scraper = null; // forget it, may be rubbish
- log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
- }
- hpc.scraper = scraper;
+ // before we came here we deleted a cache entry
+ hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
+ cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
}
- } else {
- log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
- scraper = null;
- hfos = respond;
- hpc.scraper = scraper;
- }
-
- // handle incoming cookies
- handleIncomingCookies(res.responseHeader, host, ip);
-
- // request has been placed and result has been returned. work off response
- try {
- respondHeader(respond, res.status, res.responseHeader);
- String storeError;
- if ((storeError = hpc.shallStoreCache()) == null) {
- // we write a new cache entry
- if ((contentLength > 0) && // known
- (contentLength < 1048576)) // 1 MB
- {
- byte[] cacheArray;
- if (transformer.isIdentityTransformer()) {
- res.writeContentX(hfos, respond);
- cacheArray = hpc.getContentBytes();
- } else {
- cacheArray = res.writeContent(hfos);
- }
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
- // before we came here we deleted a cache entry
- if (sizeBeforeDelete == cacheArray.length) {
- cacheArray = null;
- hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
- cacheManager.stackProcess(hpc); // unnecessary update
- } else {
- hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
- cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
- }
- } else {
- // the file is too big to cache it in the ram, write to file
- cacheFile.getParentFile().mkdirs();
- if (transformer.isIdentityTransformer()) {
- res.writeContent(respond, cacheFile);
- if (contentLength < 10485760) { // 10 mb
- serverFileUtils.copy(cacheFile, hfos);
- } // else hfos is empty and that means: no work afterwards with it
- } else {
- res.writeContent(hfos, cacheFile);
- }
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
- // before we came here we deleted a cache entry
- if (sizeBeforeDelete == cacheFile.length()) {
- hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
- cacheManager.stackProcess(hpc); // unnecessary update
- } else {
- hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
- cacheManager.stackProcess(hpc); // necessary update, write response header to cache
- }
- }
- } else {
- // no caching
- log.logDebug(cacheFile.toString() + " not cached: " + storeError);
- res.writeContent(hfos, null);
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
- // before we came here we deleted a cache entry
- hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
- cacheManager.stackProcess(hpc);
- }
- } catch (SocketException e) {
- // this may happen if the client suddenly closes its connection
- // maybe the user has stopped loading
- // in that case, we are not responsible and just forget it
- // but we clean the cache also, since it may be only partial
- // and most possible corrupted
- if (cacheFile.exists()) cacheFile.delete();
- }
- remote.close();
- } catch (Exception e) {
- // this may happen if the targeted host does not exist or anything with the
- // remote server was wrong.
- // in any case, sending a 404 is appropriate
- try {
- if ((e.toString().indexOf("unknown host")) > 0) {
- respondHeader(respond,"404 unknown host", new httpHeader(null));
- } else {
- respondHeader(respond,"404 Not Found", new httpHeader(null));
- respond.write(("Exception occurred:\r\n").getBytes());
- respond.write((e.toString() + "\r\n").getBytes());
- respond.write(("[TRACE: ").getBytes());
- e.printStackTrace(new PrintStream(respond));
- respond.write(("]\r\n").getBytes());
- }
- } catch (Exception ee) {}
- }
- }
- } else {
- // we take a new file from the net and respond with that
- try {
- // open the connection
- //httpc remote = newhttpc(host, port, timeout);
- httpc remote;
- if (yAddress == null) {
- remote = newhttpc(host, port, timeout);
- } else {
- remote = newhttpc(yAddress, timeout);
- }
- //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
-
- // send request
- httpc.response res = remote.GET(remotePath, requestHeader);
- long contentLength = res.responseHeader.contentLength();
-
- // reserve cache entry
- hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
-
- // make a scraper and transformer
- if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
- (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
- if (transformer.isIdentityTransformer()) {
- hfos = hpc.getContentOutputStream();
} else {
- scraper = new htmlFilterContentScraper(url);
- hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
- if (((htmlFilterOutputStream) hfos).binarySuspect()) {
- scraper = null; // forget it, may be rubbish
- log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
+                    // the file is too big to cache in RAM; write to a file right here
+ cacheFile.getParentFile().mkdirs();
+ res.writeContent(hfos, cacheFile);
+ if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+ if (sizeBeforeDelete == -1) {
+ // totally fresh file
+ hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
+ cacheManager.stackProcess(hpc);
+ } else if (sizeBeforeDelete == cacheFile.length()) {
+ // before we came here we deleted a cache entry
+ hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
+ cacheManager.stackProcess(hpc); // unnecessary update
+ } else {
+ // before we came here we deleted a cache entry
+ hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
+ cacheManager.stackProcess(hpc); // necessary update, write response header to cache
}
- hpc.scraper = scraper;
}
} else {
- log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
- scraper = null;
- hfos = respond;
- hpc.scraper = scraper;
- }
-
- // handle incoming cookies
- handleIncomingCookies(res.responseHeader, host, ip);
-
- // request has been placed and result has been returned. work off response
- try {
- //System.out.println("HEADER: SERVER TO PROXY = [" + res.status + "] " + ((httpHeader) res.responseHeader).toString()); // DEBUG
- respondHeader(respond, res.status, res.responseHeader);
- String storeError;
- if ((storeError = hpc.shallStoreCache()) == null) {
- // we write a new cache entry
- if ((contentLength > 0) && (contentLength < 1048576)) {
- // write to buffer
- byte[] cacheArray;
- if (transformer.isIdentityTransformer()) {
- res.writeContentX(hfos, respond);
- cacheArray = hpc.getContentBytes();
- } else {
- cacheArray = res.writeContent(hfos);
- }
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
- // enQueue new entry with response header and file as byte[]
- hpc.status = plasmaHTCache.CACHE_FILL;
- cacheManager.stackProcess(hpc, cacheArray);
- } else try {
- // write to file system directly
- cacheFile.getParentFile().mkdirs();
- if (transformer.isIdentityTransformer()) {
- res.writeContent(respond, cacheFile);
- if (contentLength < 10485760) { // 10 mb
- serverFileUtils.copy(cacheFile, hfos);
- } // else hfos is empty and that means: no work afterwards with it
- } else {
- res.writeContent(hfos, cacheFile);
- }
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
- // enQueue new entry with response header
- hpc.status = plasmaHTCache.CACHE_FILL;
- cacheManager.stackProcess(hpc);
- } catch (FileNotFoundException e) {
- // this may happen if there are no write rights whatsoever
- // (do nothing)
- /*
- Exception occurred:
- java.io.FileNotFoundException:
- /opt/yacy_pre_v0.314_20041219/DATA/HTCACHE/www.spiegel.de/fotostrecke/0,5538,PB64-SUQ9NDYwNyZucj0z,00.html
- (Permission denied)
- */
- }
- } else {
- // no caching
- //System.out.println("DEBUG: " + res.status + " " + cacheFile.toString()); // debug
- log.logDebug(cacheFile.toString() + " not cached: " + storeError);
- res.writeContent(hfos, null);
- if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
- // no old file and no load. just data passing
+ // no caching
+ log.logDebug(cacheFile.toString() + " not cached: " + storeError);
+ res.writeContent(hfos, null);
+ if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+ if (sizeBeforeDelete == -1) {
+ // no old file and no load. just data passing
hpc.status = plasmaHTCache.CACHE_PASSING;
- cacheManager.stackProcess(hpc);
- }
- } catch (SocketException e) {
- // this may happen if the client suddenly closes its connection
- // maybe the user has stopped loading
- // in that case, we are not responsible and just forget it
- // but we clean the cache also, since it may be only partial
- // and most possible corrupted
- if (cacheFile.exists()) cacheFile.delete();
- respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
- }
- remote.close();
- } catch (Exception e) {
- // this may happen if the targeted host does not exist or anything with the
- // remote server was wrong.
- // in any case, sending a 404 is appropriate
- try {
- if ((e.toString().indexOf("unknown host")) > 0) {
- respondHeader(respond,"404 unknown host", new httpHeader(null));
- } else {
- respondHeader(respond,"404 resource not available (generic exception: " + e.toString() + ")", new httpHeader(null));
- //respond.write(("Exception occurred:\r\n").getBytes());
- //respond.write((e.toString() + "\r\n").getBytes());
- //respond.write(("[TRACE: ").getBytes());
- //e.printStackTrace(new PrintStream(respond));
- //respond.write(("]\r\n").getBytes());
- /* http://www.geocrawler.com/archives/3/201/1999/8/50/2505805/
- > java.net.ConnectException: Connection refused
- */
- e.printStackTrace();
- }
- } catch (Exception ee) {}
- }
- }
- respond.flush();
+ cacheManager.stackProcess(hpc);
+ } else {
+ // before we came here we deleted a cache entry
+ hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
+ cacheManager.stackProcess(hpc);
+ }
+ }
+ } catch (SocketException e) {
+ // this may happen if the client suddenly closes its connection
+ // maybe the user has stopped loading
+ // in that case, we are not responsible and just forget it
+            // but we also clean the cache, since it may be only partial
+            // and most probably corrupted
+ if (cacheFile.exists()) cacheFile.delete();
+ respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
+ }
+ remote.close();
+ } catch (Exception e) {
+            // this may happen if the targeted host does not exist or if anything else
+            // went wrong with the remote server.
+ // in any case, sending a 404 is appropriate
+ try {
+ if ((e.toString().indexOf("unknown host")) > 0) {
+ respondHeader(respond,"404 unknown host", new httpHeader(null));
+ } else {
+ respondHeader(respond,"404 Not Found", new httpHeader(null));
+ respond.write(("Exception occurred:\r\n").getBytes());
+ respond.write((e.toString() + "\r\n").getBytes());
+ respond.write(("[TRACE: ").getBytes());
+ e.printStackTrace(new PrintStream(respond));
+ respond.write(("]\r\n").getBytes());
+ }
+ } catch (Exception ee) {}
+ }
+ respond.flush();
}
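Annotation: taken as a whole, this hunk replaces the old cacheExists if/else (which duplicated the entire network path) with one linear flow: a fresh cache hit is served and returns early, and a single fetch path handles both the no-cache and stale-cache cases, told apart only by sizeBeforeDelete, where -1 means there was no previous entry. A compilable control-flow sketch with hypothetical helper names (the real method carries many more parameters):

    import java.io.*;

    abstract class ProxyFlowSketch {
        boolean cacheExists;
        File cacheFile;
        OutputStream respond;

        abstract boolean shallUseCache() throws IOException;
        abstract void serveFromCache() throws IOException;
        abstract void fetchFromNet(long sizeBeforeDelete) throws IOException;

        void serve() throws IOException {
            if (cacheExists && shallUseCache()) {
                serveFromCache();            // CACHE HIT: respond 304 or 203
                respond.flush();
                return;                      // early return replaces the old else branch
            }
            long sizeBeforeDelete = -1;      // -1 marks "no previous cache entry"
            if (cacheExists) {
                sizeBeforeDelete = cacheFile.length();
                cacheFile.delete();          // stale entry: drop it, then refill
            }
            fetchFromNet(sizeBeforeDelete);  // shared path: CACHE_FILL when -1,
        }                                    // CACHE_STALE_RELOAD_* otherwise
    }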
diff --git a/source/de/anomic/plasma/plasmaCrawlLoader.java b/source/de/anomic/plasma/plasmaCrawlLoader.java
index bc1821b22..5fbbc4ceb 100644
--- a/source/de/anomic/plasma/plasmaCrawlLoader.java
+++ b/source/de/anomic/plasma/plasmaCrawlLoader.java
@@ -212,12 +212,14 @@ public class plasmaCrawlLoader {
} else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) {
// we write the new cache entry to file system directly
cacheFile.getParentFile().mkdirs();
- res.writeContent(htCache.getContentOutputStream(), cacheFile); // writes in content scraper and cache file
+ FileOutputStream fos = new FileOutputStream(cacheFile);
+                htCache.cacheArray = res.writeContent(fos); // writes into cacheArray and the cache file
+ fos.close();
htCache.status = plasmaHTCache.CACHE_FILL;
} else {
if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
// anyway, the content still lives in the content scraper
- res.writeContent(htCache.getContentOutputStream(), null); // writes only into content scraper
+ htCache.cacheArray = res.writeContent(null); // writes only into cacheArray
htCache.status = plasmaHTCache.CACHE_PASSING;
}
// enQueue new entry with response header
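Annotation: the crawler now produces the cache file and the in-memory copy in one pass, since writeContent(fos) streams the body into the file while buffering and returning the same bytes. Note that cacheArray is assumed here to be a public byte[] field of the cache entry; it is read again in the plasmaSwitchboard hunk below, but its declaration is not part of this patch. A slightly more defensive variant of the same pattern, with the close guarded so the file handle is released even if writeContent throws:

    // Fragment, same names as the hunk above (cacheArray assumed as described):
    FileOutputStream fos = new FileOutputStream(cacheFile);
    try {
        htCache.cacheArray = res.writeContent(fos); // bytes go to the file and to RAM
        htCache.status = plasmaHTCache.CACHE_FILL;
    } finally {
        fos.close();                                // release the handle on any exit path
    }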
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index f5d5b9ebe..e358acdaf 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -452,7 +452,6 @@ public class plasmaHTCache {
public String language;
public plasmaCrawlProfile.entry profile;
private String initiator;
- public ByteArrayOutputStream content;
public htmlFilterContentScraper scraper;
@@ -479,7 +478,6 @@ public class plasmaHTCache {
this.requestHeader = requestHeader;
this.responseStatus = responseStatus;
this.responseHeader = responseHeader;
- this.content = new ByteArrayOutputStream();
this.profile = profile;
this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
@@ -507,13 +505,6 @@ public class plasmaHTCache {
this.scraper = null;
}
- public OutputStream getContentOutputStream() {
- return (OutputStream) content;
- }
- public byte[] getContentBytes() {
- try { content.flush(); } catch (IOException e) {}
- return content.toByteArray();
- }
public String initiator() {
return initiator;
}
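Annotation: dropping the ByteArrayOutputStream means an Entry no longer allocates a stream in every constructor that most entries never use; callers fill a nullable byte[] instead, and processResourceStack (next hunk) can test for content with a plain null check instead of flushing and copying a stream. The content-bearing part of the nested entry class presumably ends up shaped like this (the cacheArray declaration is not shown anywhere in this patch, so this is an assumption):

    // Assumed shape of the slimmed-down cache entry (sketch, not the real class):
    public static class Entry {
        public byte[] cacheArray = null;          // raw body, filled by httpc.response.writeContent
        public htmlFilterContentScraper scraper;  // set when the proxy scraped in-flight
        // ... request/response headers, profile, initiator as before
    }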
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 68aa6c746..934ad3bec 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -469,8 +469,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
// work off one stack entry with a fresh resource (scraped web page)
- byte[] content;
- if (((content = entry.getContentBytes()).length > 0) || (entry.scraper != null)) try {
+ if ((entry.cacheArray != null) || (entry.scraper != null)) try {
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
@@ -502,7 +501,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
} else {
log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now");
- document = parser.parseSource(entry.url, entry.responseHeader.mime(), content);
+ document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
}
// put anchors on crawl stack
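Annotation: the switchboard now distinguishes two content sources: pages the proxy already scraped in-flight (entry.scraper != null) skip re-parsing, and everything else is parsed from the raw cacheArray. The old guard, entry.getContentBytes().length > 0, forced a flush and a full array copy just to test for content; the null checks are free. The dispatch reduced to its core (the declared type of document is not visible in the hunk, so plasmaParserDocument is a guess):

    // Core of the dispatch in processResourceStack (fragment; type name assumed):
    plasmaParserDocument document;
    if (entry.scraper != null) {
        // the proxy scraped this page while passing it through: reuse that work
        document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
    } else {
        // otherwise parse the raw bytes captured into cacheArray
        document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
    }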
diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java
index 047026ff2..cc5b8b9cc 100644
--- a/source/de/anomic/server/serverByteBuffer.java
+++ b/source/de/anomic/server/serverByteBuffer.java
@@ -43,7 +43,7 @@ package de.anomic.server;
import java.io.*;
import java.util.*;
-public class serverByteBuffer {
+public class serverByteBuffer extends OutputStream {
public static final byte singlequote = (byte) 39;
public static final byte doublequote = (byte) 34;
@@ -119,20 +119,37 @@ public class serverByteBuffer {
offset = 0;
}
- public serverByteBuffer append(byte b) {
- if (offset + length + 1 > buffer.length) grow();
+ public void write(int b) {
+ write((byte) (b & 0xff));
+ }
+
+ public void write(byte b) {
+ if (offset + length + 1 > buffer.length) grow();
buffer[offset + length++] = b;
+ }
+
+ public void write(byte[] bb) {
+ write(bb, 0, bb.length);
+ }
+
+ public void write(byte[] bb, int of, int le) {
+ while (offset + length + le > buffer.length) grow();
+ System.arraycopy(bb, of, buffer, offset + length, le);
+ length += le;
+ }
+
+ public serverByteBuffer append(byte b) {
+ write(b);
return this;
}
public serverByteBuffer append(byte[] bb) {
- return append(bb, 0, bb.length);
+ write(bb);
+ return this;
}
public serverByteBuffer append(byte[] bb, int of, int le) {
- while (offset + length + le > buffer.length) grow();
- System.arraycopy(bb, of, buffer, offset + length, le);
- length += le;
+ write(bb, of, le);
return this;
}
diff --git a/yacy.blue b/yacy.blue
index d2833eb81..054c67159 100644
--- a/yacy.blue
+++ b/yacy.blue
@@ -1 +1 @@
-testblue
+testblue
\ No newline at end of file