new crawl/proxy/cache design + fixes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@18 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent e7d055b98e
commit c0807abd33

@@ -65,7 +65,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
     }
 
     public void init(String initarg) {
-        System.out.println("Transformer init: " + initarg);
+        //System.out.println("Transformer init: " + initarg);
         if (bluelist == null) {
             // here, the initarg is used to load a list of bluelisted words
             bluelist = new Vector();
@@ -79,7 +79,7 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer
                 r.close();
             } catch (Exception e) {
             }
 
-            if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
+            //if (bluelist.size() == 0) System.out.println("BLUELIST is empty");
         }
     }

@@ -315,10 +315,9 @@ public class httpc {
     }
 
     public byte[] writeContent(OutputStream procOS) throws IOException {
-        ByteArrayOutputStream bos = new ByteArrayOutputStream();
-        writeContentX(procOS, bos);
-        bos.flush();
-        return bos.toByteArray();
+        serverByteBuffer sbb = new serverByteBuffer();
+        writeContentX(procOS, sbb);
+        return sbb.getBytes();
     }
 
     public void writeContent(OutputStream procOS, File file) throws IOException {
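
The buffer swap above changes writeContent()'s shape: the body is copied once into a serverByteBuffer (which the serverByteBuffer hunk further down turns into a regular OutputStream) and the bytes are handed back via getBytes() instead of ByteArrayOutputStream's flush-plus-toByteArray. A minimal, self-contained sketch of that pattern; writeContentX is modeled here with an explicit body stream, which is an assumption for illustration, not YaCy's exact signature:

    import java.io.*;

    class WriteContentSketch {
        // stand-in for httpc.writeContentX(procOS, os): copy the response body
        // both to an optional processing stream and to the buffering stream
        static void writeContentX(InputStream body, OutputStream procOS, OutputStream os)
                throws IOException {
            byte[] chunk = new byte[2048];
            int n;
            while ((n = body.read(chunk)) > 0) {
                if (procOS != null) procOS.write(chunk, 0, n);
                os.write(chunk, 0, n);
            }
        }

        // the new writeContent shape: one buffer object, bytes handed back directly
        static byte[] writeContent(InputStream body, OutputStream procOS) throws IOException {
            ByteArrayOutputStream sbb = new ByteArrayOutputStream(); // serverByteBuffer in the real code
            writeContentX(body, procOS, sbb);
            return sbb.toByteArray(); // sbb.getBytes() in the real code
        }

        public static void main(String[] args) throws IOException {
            byte[] content = writeContent(new ByteArrayInputStream("abc".getBytes()), null);
            System.out.println(content.length); // prints 3
        }
    }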

@@ -331,384 +331,263 @@ public class httpdProxyHandler extends httpdAbstractHandler implements httpdHandler
             return;
         } catch (Exception ee) {}
     }
 
     // handle outgoing cookies
     handleOutgoingCookies(requestHeader, host, ip);
 
     // set another userAgent, if not yellowlisted
     if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) {
         // change the User-Agent
         requestHeader.put("User-Agent", userAgent);
     }
 
     // set a scraper and a htmlFilter
     OutputStream hfos = null;
     htmlFilterContentScraper scraper = null;
 
     // resolve yacy and yacyh domains
     String yAddress = yacyCore.seedDB.resolveYacyAddress(host);
 
     // re-calc the url path
     String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/'
 
     // attach possible yacy-sublevel-domain
     if ((yAddress != null) &&
         ((pos = yAddress.indexOf("/")) >= 0) &&
         (!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level
        ) remotePath = yAddress.substring(pos) + remotePath;
 
     // decide wether to use a cache entry or connect to the network
     File cacheFile = cacheManager.getCachePath(url);
     String urlHash = plasmaCrawlLURL.urlHash(url);
     httpHeader cachedResponseHeader = null;
     boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) &&
                            ((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null));
 
     // why are files unzipped upon arrival? why not zip all files in cache?
     // This follows from the following premises
     // (a) no file shall be unzip-ed more than once to prevent unnessesary computing time
     // (b) old cache entries shall be comparable with refill-entries to detect/distiguish case 3+4
     // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later
     // case b and c contradicts, if we use a scheduler, because files in a stale cache would be unzipped
     // and the newly arrival would be zipped and would have to be unzipped upon load. But then the
     // scheduler is superfluous. Therefore the only reminding case is
     // (d) cached files shall be either all zipped or unzipped
     // case d contradicts with a, because files need to be unzipped for indexing. Therefore
     // the only remaining case is to unzip files right upon load. Thats what we do here.
 
     // finally use existing cache if appropriate
     // here we must decide weather or not to save the data
     // to a cache
     // we distinguish four CACHE STATE cases:
     // 1. cache fill
     // 2. cache fresh - no refill
     // 3. cache stale - refill - necessary
     // 4. cache stale - refill - superfluous
     // in two of these cases we trigger a scheduler to handle newly arrived files:
     // case 1 and case 3
     plasmaHTCache.Entry hpc;
-    if (cacheExists) {
-        // we respond on the request by using the cache
-        hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", cachedResponseHeader, null, switchboard.defaultProxyProfile);
-        if (hpc.shallUseCache()) {
-            // the cache is fresh
+    if ((cacheExists) &&
+        ((hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK",
+                                      cachedResponseHeader, null,
+                                      switchboard.defaultProxyProfile)).shallUseCache())) {
+        // we respond on the request by using the cache, the cache is fresh
         try {
             // replace date field in old header by actual date, this is according to RFC
             cachedResponseHeader.put("Date", httpc.dateString(httpc.nowDate()));
             // maybe the content length is missing
             if (!(cachedResponseHeader.containsKey("CONTENT-LENGTH")))
                 cachedResponseHeader.put("CONTENT-LENGTH", (String) ("" + cacheFile.length()));
             // check if we can send a 304 instead the complete content
             if (requestHeader.containsKey("IF-MODIFIED-SINCE")) {
                 // conditional request: freshness of cache for that condition was already
                 // checked within shallUseCache(). Now send only a 304 response
                 log.logInfo("CACHE HIT/304 " + cacheFile.toString());
                 // send cached header with replaced date and added length
                 respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
             } else {
                 // unconditional request: send content of cache
                 log.logInfo("CACHE HIT/203 " + cacheFile.toString());
                 // send cached header with replaced date and added length
                 respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative'
                 // make a transformer
                 if ((!(transformer.isIdentityTransformer())) &&
                     ((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
                     ((cachedResponseHeader == null) || (httpd.isTextMime(cachedResponseHeader.mime(), switchboard.mimeWhite)))) {
                     hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0));
                 } else {
                     hfos = respond;
                 }
                 // send also the complete body now from the cache
                 // simply read the file and transfer to out socket
                 InputStream is = new FileInputStream(cacheFile);
                 byte[] buffer = new byte[2048];
                 int l;
                 while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);}
                 is.close();
                 if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
             }
             // that's it!
         } catch (SocketException e) {
             // this happens if the client stops loading the file
             // we do nothing here
             respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString());
         }
-        } else {
-            // the cache is (supposed to be) stale
-            // delete the cache
-            long sizeBeforeDelete = cacheFile.length();
-            cacheFile.delete();
+        respond.flush();
+        return;
+    }
+
+    // the cache does either not exist or is (supposed to be) stale
+    long sizeBeforeDelete = -1;
+    if (cacheExists) {
+        // delete the cache
+        sizeBeforeDelete = cacheFile.length();
+        cacheFile.delete();
+    }
+
     // take a new file from the server
     httpc remote = null;
     httpc.response res = null;
     try {
         // open the connection
         if (yAddress == null) {
             remote = newhttpc(host, port, timeout);
         } else {
             remote = newhttpc(yAddress, timeout);
         }
         //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
         // send request
         res = remote.GET(remotePath, requestHeader);
         long contentLength = res.responseHeader.contentLength();
         // reserver cache entry
         hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
-        // make a scraper and transformer
+        // handle file types
         if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
             (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
             if (transformer.isIdentityTransformer()) {
-                hfos = hpc.getContentOutputStream();
+                // no transformation, only passthrough
+                hfos = respond;
             } else {
+                // make a scraper and transformer
                 scraper = new htmlFilterContentScraper(url);
                 hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
                 if (((htmlFilterOutputStream) hfos).binarySuspect()) {
                     scraper = null; // forget it, may be rubbish
                     log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
                 }
                 hpc.scraper = scraper;
             }
         } else {
             log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
             scraper = null;
             hfos = respond;
             hpc.scraper = scraper;
         }
         // handle incoming cookies
         handleIncomingCookies(res.responseHeader, host, ip);
         // request has been placed and result has been returned. work off response
         try {
             respondHeader(respond, res.status, res.responseHeader);
             String storeError;
             if ((storeError = hpc.shallStoreCache()) == null) {
                 // we write a new cache entry
-                if ((contentLength > 0) && // known
-                    (contentLength < 1048576)) // 1 MB
-                {
-                    byte[] cacheArray;
-                    if (transformer.isIdentityTransformer()) {
-                        res.writeContentX(hfos, respond);
-                        cacheArray = hpc.getContentBytes();
-                    } else {
-                        cacheArray = res.writeContent(hfos);
-                    }
-                    if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                    // before we came here we deleted a cache entry
-                    if (sizeBeforeDelete == cacheArray.length) {
-                        cacheArray = null;
-                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
-                        cacheManager.stackProcess(hpc); // unnecessary update
-                    } else {
-                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
-                        cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
-                    }
-                } else {
-                    // the file is too big to cache it in the ram, write to file
-                    cacheFile.getParentFile().mkdirs();
-                    if (transformer.isIdentityTransformer()) {
-                        res.writeContent(respond, cacheFile);
-                        if (contentLength < 10485760) { // 10 mb
-                            serverFileUtils.copy(cacheFile, hfos);
-                        } // else hfos is empty and that means: no work afterwards with it
-                    } else {
-                        res.writeContent(hfos, cacheFile);
-                    }
-                    if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                    // before we came here we deleted a cache entry
-                    if (sizeBeforeDelete == cacheFile.length()) {
-                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
-                        cacheManager.stackProcess(hpc); // unnecessary update
-                    } else {
-                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
-                        cacheManager.stackProcess(hpc); // necessary update, write response header to cache
-                    }
-                }
-            } else {
-                // no caching
-                log.logDebug(cacheFile.toString() + " not cached: " + storeError);
-                res.writeContent(hfos, null);
-                if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                // before we came here we deleted a cache entry
-                hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
-                cacheManager.stackProcess(hpc);
-            }
-        } catch (SocketException e) {
-            // this may happen if the client suddenly closes its connection
-            // maybe the user has stopped loading
-            // in that case, we are not responsible and just forget it
-            // but we clean the cache also, since it may be only partial
-            // and most possible corrupted
-            if (cacheFile.exists()) cacheFile.delete();
-        }
-        remote.close();
-    } catch (Exception e) {
-        // this may happen if the targeted host does not exist or anything with the
-        // remote server was wrong.
-        // in any case, sending a 404 is appropriate
-        try {
-            if ((e.toString().indexOf("unknown host")) > 0) {
-                respondHeader(respond,"404 unknown host", new httpHeader(null));
-            } else {
-                respondHeader(respond,"404 Not Found", new httpHeader(null));
-                respond.write(("Exception occurred:\r\n").getBytes());
-                respond.write((e.toString() + "\r\n").getBytes());
-                respond.write(("[TRACE: ").getBytes());
-                e.printStackTrace(new PrintStream(respond));
-                respond.write(("]\r\n").getBytes());
-            }
-        } catch (Exception ee) {}
-    }
-    }
-    } else {
-        // we take a new file from the net and respond with that
-        try {
-            // open the connection
-            //httpc remote = newhttpc(host, port, timeout);
-            httpc remote;
-            if (yAddress == null) {
-                remote = newhttpc(host, port, timeout);
-            } else {
-                remote = newhttpc(yAddress, timeout);
-            }
-            //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG
-            // send request
-            httpc.response res = remote.GET(remotePath, requestHeader);
-            long contentLength = res.responseHeader.contentLength();
-            // reserve cache entry
-            hpc = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile);
-            // make a scraper and transformer
-            if (((ext == null) || (!(switchboard.extensionBlack.contains(ext)))) &&
-                (httpd.isTextMime(res.responseHeader.mime(), switchboard.mimeWhite))) {
-                if (transformer.isIdentityTransformer()) {
-                    hfos = hpc.getContentOutputStream();
-                } else {
-                    scraper = new htmlFilterContentScraper(url);
-                    hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0));
-                    if (((htmlFilterOutputStream) hfos).binarySuspect()) {
-                        scraper = null; // forget it, may be rubbish
-                        log.logDebug("Content of " + url + " is probably binary. deleted scraper.");
-                    }
-                    hpc.scraper = scraper;
-                }
-            } else {
-                log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped");
-                scraper = null;
-                hfos = respond;
-                hpc.scraper = scraper;
-            }
-            // handle incoming cookies
-            handleIncomingCookies(res.responseHeader, host, ip);
-            // request has been placed and result has been returned. work off response
-            try {
-                //System.out.println("HEADER: SERVER TO PROXY = [" + res.status + "] " + ((httpHeader) res.responseHeader).toString()); // DEBUG
-                respondHeader(respond, res.status, res.responseHeader);
-                String storeError;
-                if ((storeError = hpc.shallStoreCache()) == null) {
-                    // we write a new cache entry
-                    if ((contentLength > 0) && (contentLength < 1048576)) {
-                        // write to buffer
-                        byte[] cacheArray;
-                        if (transformer.isIdentityTransformer()) {
-                            res.writeContentX(hfos, respond);
-                            cacheArray = hpc.getContentBytes();
-                        } else {
-                            cacheArray = res.writeContent(hfos);
-                        }
-                        if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                        // enQueue new entry with response header and file as byte[]
-                        hpc.status = plasmaHTCache.CACHE_FILL;
-                        cacheManager.stackProcess(hpc, cacheArray);
-                    } else try {
-                        // write to file system directly
-                        cacheFile.getParentFile().mkdirs();
-                        if (transformer.isIdentityTransformer()) {
-                            res.writeContent(respond, cacheFile);
-                            if (contentLength < 10485760) { // 10 mb
-                                serverFileUtils.copy(cacheFile, hfos);
-                            } // else hfos is empty and that means: no work afterwards with it
-                        } else {
-                            res.writeContent(hfos, cacheFile);
-                        }
-                        if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                        // enQueue new entry with response header
-                        hpc.status = plasmaHTCache.CACHE_FILL;
-                        cacheManager.stackProcess(hpc);
-                    } catch (FileNotFoundException e) {
-                        // this may happen if there are no write rights whatsoever
-                        // (do nothing)
-                        /*
-                        Exception occurred:
-                        java.io.FileNotFoundException:
-                        /opt/yacy_pre_v0.314_20041219/DATA/HTCACHE/www.spiegel.de/fotostrecke/0,5538,PB64-SUQ9NDYwNyZucj0z,00.html
-                        (Permission denied)
-                        */
-                    }
-                } else {
-                    // no caching
-                    //System.out.println("DEBUG: " + res.status + " " + cacheFile.toString()); // debug
-                    log.logDebug(cacheFile.toString() + " not cached: " + storeError);
-                    res.writeContent(hfos, null);
-                    if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
-                    // no old file and no load. just data passing
-                    hpc.status = plasmaHTCache.CACHE_PASSING;
-                    cacheManager.stackProcess(hpc);
-                }
-            } catch (SocketException e) {
-                // this may happen if the client suddenly closes its connection
-                // maybe the user has stopped loading
-                // in that case, we are not responsible and just forget it
-                // but we clean the cache also, since it may be only partial
-                // and most possible corrupted
-                if (cacheFile.exists()) cacheFile.delete();
-                respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
-            }
-            remote.close();
-        } catch (Exception e) {
-            // this may happen if the targeted host does not exist or anything with the
-            // remote server was wrong.
-            // in any case, sending a 404 is appropriate
-            try {
-                if ((e.toString().indexOf("unknown host")) > 0) {
-                    respondHeader(respond,"404 unknown host", new httpHeader(null));
-                } else {
-                    respondHeader(respond,"404 resource not available (generic exception: " + e.toString() + ")", new httpHeader(null));
-                    //respond.write(("Exception occurred:\r\n").getBytes());
-                    //respond.write((e.toString() + "\r\n").getBytes());
-                    //respond.write(("[TRACE: ").getBytes());
-                    //e.printStackTrace(new PrintStream(respond));
-                    //respond.write(("]\r\n").getBytes());
-                    /* http://www.geocrawler.com/archives/3/201/1999/8/50/2505805/
-                       > java.net.ConnectException: Connection refused
-                    */
-                    e.printStackTrace();
-                }
-            } catch (Exception ee) {}
-        }
-    }
+                if ((contentLength > 0) && // known
+                    (contentLength < 1048576)) {// 1 MB
+                    // ok, we don't write actually into a file, only to RAM, and schedule writing the file.
+                    byte[] cacheArray;
+                    cacheArray = res.writeContent(hfos);
+                    if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+                    if (sizeBeforeDelete == -1) {
+                        // totally fresh file
+                        hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
+                        cacheManager.stackProcess(hpc, cacheArray);
+                    } else if (sizeBeforeDelete == cacheArray.length) {
+                        // before we came here we deleted a cache entry
+                        cacheArray = null;
+                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
+                        cacheManager.stackProcess(hpc); // unnecessary update
+                    } else {
+                        // before we came here we deleted a cache entry
+                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
+                        cacheManager.stackProcess(hpc, cacheArray); // necessary update, write response header to cache
+                    }
+                } else {
+                    // the file is too big to cache it in the ram, write to file right here
+                    cacheFile.getParentFile().mkdirs();
+                    res.writeContent(hfos, cacheFile);
+                    if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+                    if (sizeBeforeDelete == -1) {
+                        // totally fresh file
+                        hpc.status = plasmaHTCache.CACHE_FILL; // it's an insert
+                        cacheManager.stackProcess(hpc);
+                    } else if (sizeBeforeDelete == cacheFile.length()) {
+                        // before we came here we deleted a cache entry
+                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
+                        cacheManager.stackProcess(hpc); // unnecessary update
+                    } else {
+                        // before we came here we deleted a cache entry
+                        hpc.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
+                        cacheManager.stackProcess(hpc); // necessary update, write response header to cache
+                    }
+                }
+            } else {
+                // no caching
+                log.logDebug(cacheFile.toString() + " not cached: " + storeError);
+                res.writeContent(hfos, null);
+                if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize();
+                if (sizeBeforeDelete == -1) {
+                    // no old file and no load. just data passing
+                    hpc.status = plasmaHTCache.CACHE_PASSING;
+                    cacheManager.stackProcess(hpc);
+                } else {
+                    // before we came here we deleted a cache entry
+                    hpc.status = plasmaHTCache.CACHE_STALE_NO_RELOAD;
+                    cacheManager.stackProcess(hpc);
+                }
+            }
+        } catch (SocketException e) {
+            // this may happen if the client suddenly closes its connection
+            // maybe the user has stopped loading
+            // in that case, we are not responsible and just forget it
+            // but we clean the cache also, since it may be only partial
+            // and most possible corrupted
+            if (cacheFile.exists()) cacheFile.delete();
+            respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null));
+        }
+        remote.close();
+    } catch (Exception e) {
+        // this may happen if the targeted host does not exist or anything with the
+        // remote server was wrong.
+        // in any case, sending a 404 is appropriate
+        try {
+            if ((e.toString().indexOf("unknown host")) > 0) {
+                respondHeader(respond,"404 unknown host", new httpHeader(null));
+            } else {
+                respondHeader(respond,"404 Not Found", new httpHeader(null));
+                respond.write(("Exception occurred:\r\n").getBytes());
+                respond.write((e.toString() + "\r\n").getBytes());
+                respond.write(("[TRACE: ").getBytes());
+                e.printStackTrace(new PrintStream(respond));
+                respond.write(("]\r\n").getBytes());
+            }
+        } catch (Exception ee) {}
+    }
 
     respond.flush();
 }
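
The CACHE STATE comment block above survives the rewrite, but the four cases are now resolved in one place through the single sizeBeforeDelete variable (-1 means no previous entry existed; case 2, cache fresh, returns early before the network fetch is ever made). A condensed, hedged sketch of that decision; the constant names mirror plasmaHTCache, but the method itself is illustrative, and the same-size comparison is only a heuristic for "content unchanged":

    class CacheStateSketch {
        enum State { CACHE_FILL, CACHE_STALE_RELOAD_GOOD, CACHE_STALE_RELOAD_BAD,
                     CACHE_STALE_NO_RELOAD, CACHE_PASSING }

        // stored: shallStoreCache() allowed writing; sizeBeforeDelete: length of the
        // deleted stale entry, or -1 if none existed; newSize: length of the reload
        static State decide(boolean stored, long sizeBeforeDelete, long newSize) {
            if (!stored) {
                // nothing was written: plain passing, or a stale entry that stays deleted
                return (sizeBeforeDelete == -1) ? State.CACHE_PASSING : State.CACHE_STALE_NO_RELOAD;
            }
            if (sizeBeforeDelete == -1) return State.CACHE_FILL;                  // case 1: cache fill
            if (sizeBeforeDelete == newSize) return State.CACHE_STALE_RELOAD_BAD; // case 4: refill superfluous
            return State.CACHE_STALE_RELOAD_GOOD;                                 // case 3: refill necessary
        }

        public static void main(String[] args) {
            System.out.println(decide(true, -1, 1234));   // CACHE_FILL
            System.out.println(decide(true, 1234, 1234)); // CACHE_STALE_RELOAD_BAD
            System.out.println(decide(true, 1234, 999));  // CACHE_STALE_RELOAD_GOOD
        }
    }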

@@ -212,12 +212,14 @@ public class plasmaCrawlLoader {
         } else if ((profile.storeHTCache()) && ((error = htCache.shallStoreCache()) == null)) {
             // we write the new cache entry to file system directly
             cacheFile.getParentFile().mkdirs();
-            res.writeContent(htCache.getContentOutputStream(), cacheFile); // writes in content scraper and cache file
+            FileOutputStream fos = new FileOutputStream(cacheFile);
+            htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
+            fos.close();
             htCache.status = plasmaHTCache.CACHE_FILL;
         } else {
             if (error != null) log.logDebug("CRAWLER NOT STORED RESOURCE " + url.toString() + ": " + error);
             // anyway, the content still lives in the content scraper
-            res.writeContent(htCache.getContentOutputStream(), null); // writes only into content scraper
+            htCache.cacheArray = res.writeContent(null); // writes only into cacheArray
             htCache.status = plasmaHTCache.CACHE_PASSING;
         }
         // enQueue new entry with response header

@@ -452,7 +452,6 @@ public class plasmaHTCache {
         public String language;
         public plasmaCrawlProfile.entry profile;
         private String initiator;
-        public ByteArrayOutputStream content;
         public htmlFilterContentScraper scraper;
@@ -479,7 +478,6 @@
         this.requestHeader = requestHeader;
         this.responseStatus = responseStatus;
         this.responseHeader = responseHeader;
-        this.content = new ByteArrayOutputStream();
         this.profile = profile;
         this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
@@ -507,13 +505,6 @@
             this.scraper = null;
         }
 
-        public OutputStream getContentOutputStream() {
-            return (OutputStream) content;
-        }
-
-        public byte[] getContentBytes() {
-            try { content.flush(); } catch (IOException e) {}
-            return content.toByteArray();
-        }
 
         public String initiator() {
             return initiator;
         }

@@ -469,8 +469,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch
     private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
         // work off one stack entry with a fresh resource (scraped web page)
-        byte[] content;
-        if (((content = entry.getContentBytes()).length > 0) || (entry.scraper != null)) try {
+        if ((entry.cacheArray != null) || (entry.scraper != null)) try {
             // we must distinguish the following cases: resource-load was initiated by
             // 1) global crawling: the index is extern, not here (not possible here)
             // 2) result of search queries, some indexes are here (not possible here)
@@ -502,7 +501,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch
             document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
         } else {
             log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now");
-            document = parser.parseSource(entry.url, entry.responseHeader.mime(), content);
+            document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
         }
 
         // put anchors on crawl stack
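
processResourceStack now keys off entry.cacheArray instead of draining a per-entry ByteArrayOutputStream. A hedged sketch of the resulting dispatch; the types and method bodies here are stand-ins (the real code passes entry.url and entry.responseHeader.mime() to the parser, and the null-guard condition is inferred from the hunk above):

    class ParseDispatchSketch {
        static String transformScraper(Object scraper) { return "document from scraper"; }
        static String parseSource(byte[] cacheArray) { return "document from " + cacheArray.length + " cached bytes"; }

        // mirrors the new guard: work off the entry only if either the raw bytes
        // or an already-filled scraper is available
        static String process(byte[] cacheArray, Object scraper) {
            if ((cacheArray == null) && (scraper == null)) return null; // nothing to index
            if (scraper != null) return transformScraper(scraper);      // already scraped while loading
            return parseSource(cacheArray);                              // not parsed yet, parse from cacheArray
        }

        public static void main(String[] args) {
            System.out.println(process(new byte[]{1, 2, 3}, null));
            System.out.println(process(null, new Object()));
        }
    }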

@@ -43,7 +43,7 @@ package de.anomic.server;
 import java.io.*;
 import java.util.*;
 
-public class serverByteBuffer {
+public class serverByteBuffer extends OutputStream {
 
     public static final byte singlequote = (byte) 39;
     public static final byte doublequote = (byte) 34;
@@ -119,20 +119,37 @@ public class serverByteBuffer {
         offset = 0;
     }
 
+    public void write(int b) {
+        write((byte) (b & 0xff));
+    }
+
+    public void write(byte b) {
+        if (offset + length + 1 > buffer.length) grow();
+        buffer[offset + length++] = b;
+    }
+
+    public void write(byte[] bb) {
+        write(bb, 0, bb.length);
+    }
+
+    public void write(byte[] bb, int of, int le) {
+        while (offset + length + le > buffer.length) grow();
+        System.arraycopy(bb, of, buffer, offset + length, le);
+        length += le;
+    }
+
     public serverByteBuffer append(byte b) {
-        if (offset + length + 1 > buffer.length) grow();
-        buffer[offset + length++] = b;
+        write(b);
         return this;
     }
 
     public serverByteBuffer append(byte[] bb) {
-        return append(bb, 0, bb.length);
+        write(bb);
+        return this;
     }
 
     public serverByteBuffer append(byte[] bb, int of, int le) {
-        while (offset + length + le > buffer.length) grow();
-        System.arraycopy(bb, of, buffer, offset + length, le);
-        length += le;
+        write(bb, of, le);
         return this;
     }
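
Extending OutputStream is what lets serverByteBuffer replace ByteArrayOutputStream across the commit (httpc.writeContent above): the append() chain stays for fluent callers, while any stream-based API can now fill the buffer directly. A minimal stand-alone stand-in showing the effect; names are illustrative, and only the write()/append() shapes mirror the hunk:

    import java.io.*;

    class GrowableBuffer extends OutputStream {
        private byte[] buffer = new byte[64];
        private int length = 0;

        private void grow() {
            byte[] tmp = new byte[buffer.length * 2];
            System.arraycopy(buffer, 0, tmp, 0, length);
            buffer = tmp;
        }

        public void write(int b) { write((byte) (b & 0xff)); } // OutputStream entry point

        public void write(byte b) {
            if (length + 1 > buffer.length) grow();
            buffer[length++] = b;
        }

        public void write(byte[] bb, int of, int le) {
            while (length + le > buffer.length) grow();
            System.arraycopy(bb, of, buffer, length, le);
            length += le;
        }

        public GrowableBuffer append(byte[] bb) { write(bb, 0, bb.length); return this; }

        public byte[] getBytes() { // trimmed copy, like the getBytes() the commit relies on
            byte[] out = new byte[length];
            System.arraycopy(buffer, 0, out, 0, length);
            return out;
        }

        public static void main(String[] args) {
            GrowableBuffer sbb = new GrowableBuffer();
            PrintStream ps = new PrintStream(sbb); // any OutputStream consumer works now
            ps.print("proxied bytes");
            ps.flush();
            System.out.println(new String(sbb.append(" + appended".getBytes()).getBytes()));
        }
    }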

@@ -1 +1 @@
-testblue
+testblue