From cb17ff4aa2bde12eea32882eac6480299b54d838 Mon Sep 17 00:00:00 2001 From: theli Date: Thu, 9 Jun 2005 10:22:05 +0000 Subject: [PATCH] *) adding support for a proxy access log file (similar to Squid's access.log) *) splitting the doGet function into separate functions for fulfilling requests from the cache and from the web, to make error handling easier *) using connection property and httpHeader constants instead of hardcoded strings wherever possible *) sending back a proxy error message as the body of every response containing an HTTP error code *) correcting problems with messages received from other proxies that carry 204 or 304 status codes. *) using chunked transfer encoding if the server has not set the content length (e.g. because of gzip content encoding) but the client has established a persistent connection to yacy. This is only possible for HTTP/1.1 clients. For HTTP/1.0 clients the connection will simply be closed at the end of the message. *) removing unneeded functions (e.g. respondError) because of newly introduced functions in httpd.java *) removing hop-by-hop headers (according to the RFC) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@245 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/http/httpdProxyHandler.java | 1503 ++++++++++-------- 1 file changed, 870 insertions(+), 633 deletions(-) diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 066abfb08..8347802aa 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -60,14 +60,12 @@ package de.anomic.http; import java.io.BufferedReader; -import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; -import java.io.PrintStream; import java.io.PushbackInputStream; import java.net.MalformedURLException; import java.net.Socket; @@ -75,23 
+73,26 @@ import java.net.SocketException; import java.net.URL; import java.util.Date; import java.util.HashSet; -import java.util.Iterator; import java.util.Properties; import java.util.TreeMap; +import java.util.logging.FileHandler; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.zip.GZIPOutputStream; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentTransformer; import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.htmlFilter.htmlFilterTransformer; -import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaURL; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; -import de.anomic.server.serverLog; -import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; +import de.anomic.server.logging.serverLog; +import de.anomic.server.logging.serverMiniLogFormatter; import de.anomic.yacy.yacyCore; @@ -101,7 +102,6 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // can only be instantiated upon first instantiation of this class object private static plasmaSwitchboard switchboard = null; private static plasmaHTCache cacheManager = null; - public static serverLog log; public static HashSet yellowList = null; public static TreeMap blackListURLs = null; private static int timeout = 30000; @@ -114,6 +114,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt public static int remoteProxyPort = -1; public static String remoteProxyNoProxy = ""; public static String[] remoteProxyNoProxyPatterns = null; + private static final HashSet remoteProxyAllowProxySet = new HashSet(); private static final HashSet remoteProxyDisallowProxySet = new HashSet(); @@ -121,33 +122,66 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements 
htt private static htmlFilterTransformer transformer = null; public static final String userAgent = "yacy (" + httpc.systemOST +") yacy.net"; private File htRootPath = null; - + + private serverLog theLogger; + private Properties currentConProp = null; + + private static boolean doAccessLogging = false; + /** + * Do logging configuration for special proxy access log file + */ + static { + try { + Logger proxyLogger = Logger.getLogger("PROXY.access"); + proxyLogger.setUseParentHandlers(false); + FileHandler txtLog = new FileHandler("log/proxyAccess%u%g.log",1024*1024, 20, true); + txtLog.setFormatter(new serverMiniLogFormatter()); + txtLog.setLevel(Level.FINEST); + proxyLogger.addHandler(txtLog); + + doAccessLogging = true; + } catch (Exception e) { + System.err.println("PROXY: Unable to configure proxy access logging."); + } + } + + /** + * Special logger instance for proxy access logging much similar + * to the squid access.log file + */ + private final serverLog proxyLog = new serverLog("PROXY.access"); + + /** + * Reusable {@link StringBuffer} for logging + */ + private final StringBuffer logMessage = new StringBuffer(); + // class methods public httpdProxyHandler(serverSwitch sb) { - if (switchboard == null) { - switchboard = (plasmaSwitchboard) sb; - cacheManager = switchboard.getCacheManager(); - - isTransparentProxy = Boolean.valueOf(switchboard.getConfig("isTransparentProxy","false")).booleanValue(); - // load remote proxy data - remoteProxyHost = switchboard.getConfig("remoteProxyHost",""); + // creating a logger + this.theLogger = new serverLog("PROXY"); + + if (switchboard == null) { + switchboard = (plasmaSwitchboard) sb; + cacheManager = switchboard.getCacheManager(); + + isTransparentProxy = Boolean.valueOf(switchboard.getConfig("isTransparentProxy","false")).booleanValue(); + + // load remote proxy data + remoteProxyHost = switchboard.getConfig("remoteProxyHost",""); try { remoteProxyPort = 
Integer.parseInt(switchboard.getConfig("remoteProxyPort","3128")); } catch (NumberFormatException e) { remoteProxyPort = 3128; } - remoteProxyUse = switchboard.getConfig("remoteProxyUse","false").equals("true"); + remoteProxyUse = switchboard.getConfig("remoteProxyUse","false").equals("true"); remoteProxyNoProxy = switchboard.getConfig("remoteProxyNoProxy",""); remoteProxyNoProxyPatterns = remoteProxyNoProxy.split(","); - // set loglevel - int loglevel = Integer.parseInt(switchboard.getConfig("proxyLoglevel", "2")); - log = new serverLog("HTTPDProxy", loglevel); + // set timeout + timeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000")); - // set timeout - timeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000")); - // create a htRootPath: system pages if (htRootPath == null) { htRootPath = new File(switchboard.getRootPath(), switchboard.getConfig("htRootPath","htroot")); @@ -157,84 +191,84 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // load a transformer transformer = new htmlFilterContentTransformer(); transformer.init(new File(switchboard.getRootPath(), switchboard.getConfig("plasmaBlueList", "")).toString()); - - String f; - // load the yellow-list - f = switchboard.getConfig("proxyYellowList", null); - if (f != null) yellowList = loadSet("yellow", f); else yellowList = new HashSet(); - - // load the black-list / inspired by [AS] - f = switchboard.getConfig("proxyBlackListsActive", null); - if (f != null) blackListURLs = loadBlacklist("black", f, "/"); else blackListURLs = new TreeMap(); - log.logSystem("Proxy Handler Initialized"); - } + + String f; + // load the yellow-list + f = switchboard.getConfig("proxyYellowList", null); + if (f != null) yellowList = loadSet("yellow", f); else yellowList = new HashSet(); + + // load the black-list / inspired by [AS] + f = switchboard.getConfig("proxyBlackListsActive", null); + if (f != null) blackListURLs = loadBlacklist("black", f, "/"); else 
blackListURLs = new TreeMap(); + this.theLogger.logSystem("Proxy Handler Initialized"); + } } - - + + private static HashSet loadSet(String setname, String filename) { - HashSet set = new HashSet(); + HashSet set = new HashSet(); BufferedReader br = null; - try { - br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); - String line; - while ((line = br.readLine()) != null) { - line = line.trim(); - if ((line.length() > 0) && (!(line.startsWith("#")))) set.add(line.trim().toLowerCase()); - } - br.close(); - serverLog.logInfo("PROXY", "read " + setname + " set from file " + filename); - } catch (IOException e) { - } finally { + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + String line; + while ((line = br.readLine()) != null) { + line = line.trim(); + if ((line.length() > 0) && (!(line.startsWith("#")))) set.add(line.trim().toLowerCase()); + } + br.close(); + serverLog.logInfo("PROXY", "read " + setname + " set from file " + filename); + } catch (IOException e) { + } finally { if (br != null) try { br.close(); } catch (Exception e) {} - } - return set; + } + return set; } - + private static TreeMap loadMap(String mapname, String filename, String sep) { - TreeMap map = new TreeMap(); + TreeMap map = new TreeMap(); BufferedReader br = null; - try { - br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); - String line; - int pos; - while ((line = br.readLine()) != null) { - line = line.trim(); - if ((line.length() > 0) && (!(line.startsWith("#"))) && ((pos = line.indexOf(sep)) > 0)) - map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim()); - } - serverLog.logInfo("PROXY", "read " + mapname + " map from file " + filename); - } catch (IOException e) { - } finally { + try { + br = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + String line; + int pos; + while ((line = br.readLine()) != null) { + line = 
line.trim(); + if ((line.length() > 0) && (!(line.startsWith("#"))) && ((pos = line.indexOf(sep)) > 0)) + map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim()); + } + serverLog.logInfo("PROXY", "read " + mapname + " map from file " + filename); + } catch (IOException e) { + } finally { if (br != null) try { br.close(); } catch (Exception e) {} - } - return map; + } + return map; } - + public static TreeMap loadBlacklist(String mapname, String filenames, String sep) { - TreeMap map = new TreeMap(); - if (switchboard == null) return map; // not initialized yet - File listsPath = new File(switchboard.getRootPath(), switchboard.getConfig("listsPath", "DATA/LISTS")); - String filenamesarray[] = filenames.split(","); - - if(filenamesarray.length >0) - for(int i = 0; i < filenamesarray.length; i++) - map.putAll(loadMap(mapname, (new File(listsPath, filenamesarray[i])).toString(), sep)); - return map; + TreeMap map = new TreeMap(); + if (switchboard == null) return map; // not initialized yet + File listsPath = new File(switchboard.getRootPath(), switchboard.getConfig("listsPath", "DATA/LISTS")); + String filenamesarray[] = filenames.split(","); + + if(filenamesarray.length >0) + for(int i = 0; i < filenamesarray.length; i++) + map.putAll(loadMap(mapname, (new File(listsPath, filenamesarray[i])).toString(), sep)); + return map; } - + private static String domain(String host) { - String domain = host; - int pos = domain.lastIndexOf("."); - if (pos >= 0) { - // truncate from last part - domain = domain.substring(0, pos); - pos = domain.lastIndexOf("."); - if (pos >= 0) { - // truncate from first part - domain = domain.substring(pos + 1); - } - } - return domain; + String domain = host; + int pos = domain.lastIndexOf("."); + if (pos >= 0) { + // truncate from last part + domain = domain.substring(0, pos); + pos = domain.lastIndexOf("."); + if (pos >= 0) { + // truncate from first part + domain = domain.substring(pos + 1); + } + } + 
return domain; } private boolean blacklistedURL(String hostlow, String path) { @@ -257,7 +291,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt return true; } } - + String pp = ""; // path-pattern return (((pp = (String) blackListURLs.get(hostlow)) != null) && ((pp.equals("*")) || (path.substring(1).matches(pp)))); @@ -265,17 +299,17 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt public void handleOutgoingCookies(httpHeader requestHeader, String targethost, String clienthost) { /* - The syntax for the header is: - - cookie = "Cookie:" cookie-version - 1*((";" | ",") cookie-value) - cookie-value = NAME "=" VALUE [";" path] [";" domain] - cookie-version = "$Version" "=" value - NAME = attr - VALUE = value - path = "$Path" "=" value - domain = "$Domain" "=" value - */ + The syntax for the header is: + + cookie = "Cookie:" cookie-version + 1*((";" | ",") cookie-value) + cookie-value = NAME "=" VALUE [";" path] [";" domain] + cookie-version = "$Version" "=" value + NAME = attr + VALUE = value + path = "$Path" "=" value + domain = "$Domain" "=" value + */ if (requestHeader.containsKey(httpHeader.COOKIE)) { Object[] entry = new Object[]{new Date(), clienthost, requestHeader.getMultiple(httpHeader.COOKIE)}; switchboard.outgoingCookies.put(targethost, entry); @@ -284,297 +318,324 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt public void handleIncomingCookies(httpHeader respondHeader, String serverhost, String targetclient) { /* - The syntax for the Set-Cookie response header is - - set-cookie = "Set-Cookie:" cookies - cookies = 1#cookie - cookie = NAME "=" VALUE *(";" cookie-av) - NAME = attr - VALUE = value - cookie-av = "Comment" "=" value - | "Domain" "=" value - | "Max-Age" "=" value - | "Path" "=" value - | "Secure" - | "Version" "=" 1*DIGIT - */ + The syntax for the Set-Cookie response header is + + set-cookie = "Set-Cookie:" cookies + cookies = 1#cookie + 
cookie = NAME "=" VALUE *(";" cookie-av) + NAME = attr + VALUE = value + cookie-av = "Comment" "=" value + | "Domain" "=" value + | "Max-Age" "=" value + | "Path" "=" value + | "Secure" + | "Version" "=" 1*DIGIT + */ if (respondHeader.containsKey(httpHeader.SET_COOKIE)) { Object[] entry = new Object[]{new Date(), targetclient, respondHeader.getMultiple(httpHeader.SET_COOKIE)}; switchboard.incomingCookies.put(serverhost, entry); } } - + + /** + * @param conProp a collection of properties about the connection, like URL + * @param requestHeader The header lines of the connection from the request + * @param respond the OutputStream to the client + * @see de.anomic.http.httpdHandler#doGet(java.util.Properties, de.anomic.http.httpHeader, java.io.OutputStream) + */ public void doGet(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException { - // prepare response - // conProp : a collection of properties about the connection, like URL - // requestHeader : The header lines of the connection from the request - // args : the argument values of a connection, like &-values in GET and values within boundaries in POST - // files : files within POST boundaries, same key as in args - - if (yacyTrigger) de.anomic.yacy.yacyCore.triggerOnlineAction(); - - Date requestDate = new Date(); // remember the time... 
- String method = conProp.getProperty("METHOD"); - String host = conProp.getProperty("HOST"); - String path = conProp.getProperty("PATH"); // always starts with leading '/' - String args = conProp.getProperty("ARGS"); // may be null if no args were given - String ip = conProp.getProperty("CLIENTIP"); // the ip from the connecting peer - - int port; - int pos; - - if ((pos = host.indexOf(":")) < 0) { - port = 80; - } else { - port = Integer.parseInt(host.substring(pos + 1)); - host = host.substring(0, pos); - } - - String ext; - if ((pos = path.lastIndexOf('.')) < 0) { - ext = ""; - } else { - ext = path.substring(pos + 1).toLowerCase(); - } - - URL url = null; - try { - if (args == null) - url = new URL("http", host, port, path); - else - url = new URL("http", host, port, path + "?" + args); - } catch (MalformedURLException e) { - serverLog.logError("PROXY", "ERROR: internal error with url generation: host=" + - host + ", port=" + port + ", path=" + path + ", args=" + args); - url = null; - } - //System.out.println("GENERATED URL:" + url.toString()); // debug - - // check the blacklist - // blacklist idea inspired by [AS]: - // respond a 404 for all AGIS ("all you get is shit") servers - String hostlow = host.toLowerCase(); - if (blacklistedURL(hostlow, path)) { - try { - respondHeader(respond,"404 Not Found (AGIS)", new httpHeader(null)); - respond.write(("404 (generated): URL '" + hostlow + "' blocked by yacy proxy (blacklisted)\r\n").getBytes()); - respond.flush(); - serverLog.logInfo("PROXY", "AGIS blocking of host '" + hostlow + "'"); // debug - return; - } catch (Exception ee) {} - } - - // handle outgoing cookies - handleOutgoingCookies(requestHeader, host, ip); - // set another userAgent, if not yellowlisted - if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) { - // change the User-Agent - requestHeader.put(httpHeader.USER_AGENT, userAgent); - } - - // set a scraper and a htmlFilter - OutputStream hfos = null; - htmlFilterContentScraper 
scraper = null; - - // resolve yacy and yacyh domains - String yAddress = yacyCore.seedDB.resolveYacyAddress(host); + this.currentConProp = conProp; - // re-calc the url path - String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/' - - // attach possible yacy-sublevel-domain - if ((yAddress != null) && - ((pos = yAddress.indexOf("/")) >= 0) && - (!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level - ) remotePath = yAddress.substring(pos) + remotePath; - - // decide wether to use a cache entry or connect to the network - File cacheFile = cacheManager.getCachePath(url); - String urlHash = plasmaCrawlLURL.urlHash(url); - httpHeader cachedResponseHeader = null; - boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) && - ((cachedResponseHeader = cacheManager.getCachedResponse(urlHash)) != null)); - - // why are files unzipped upon arrival? why not zip all files in cache? - // This follows from the following premises - // (a) no file shall be unzip-ed more than once to prevent unnessesary computing time - // (b) old cache entries shall be comparable with refill-entries to detect/distiguish case 3+4 - // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later - // case b and c contradicts, if we use a scheduler, because files in a stale cache would be unzipped - // and the newly arrival would be zipped and would have to be unzipped upon load. But then the - // scheduler is superfluous. Therefore the only reminding case is - // (d) cached files shall be either all zipped or unzipped - // case d contradicts with a, because files need to be unzipped for indexing. Therefore - // the only remaining case is to unzip files right upon load. Thats what we do here. - - // finally use existing cache if appropriate - // here we must decide weather or not to save the data - // to a cache - // we distinguish four CACHE STATE cases: - // 1. cache fill - // 2. 
cache fresh - no refill - // 3. cache stale - refill - necessary - // 4. cache stale - refill - superfluous - // in two of these cases we trigger a scheduler to handle newly arrived files: - // case 1 and case 3 - plasmaHTCache.Entry cacheEntry; - if ((cacheExists) && - ((cacheEntry = cacheManager.newEntry(requestDate, 0, url, requestHeader, "200 OK", - cachedResponseHeader, null, - switchboard.defaultProxyProfile)).shallUseCache())) { - // we respond on the request by using the cache, the cache is fresh + try { + // remembering the starting time of the request + Date requestDate = new Date(); // remember the time... + this.currentConProp.put(httpd.CONNECTION_PROP_REQUEST_START,new Long(requestDate.getTime())); + if (yacyTrigger) de.anomic.yacy.yacyCore.triggerOnlineAction(); + + // using an ByteCount OutputStream to count the send bytes (needed for the logfile) + respond = new httpdByteCountOutputStream(respond,conProp.getProperty(httpd.CONNECTION_PROP_REQUESTLINE).length() + 2); + + String host = conProp.getProperty(httpd.CONNECTION_PROP_HOST); + String path = conProp.getProperty(httpd.CONNECTION_PROP_PATH); // always starts with leading '/' + String args = conProp.getProperty(httpd.CONNECTION_PROP_ARGS); // may be null if no args were given + String ip = conProp.getProperty(httpd.CONNECTION_PROP_CLIENTIP); // the ip from the connecting peer + + int port, pos; + if ((pos = host.indexOf(":")) < 0) { + port = 80; + } else { + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + String ext; + if ((pos = path.lastIndexOf('.')) < 0) { + ext = ""; + } else { + ext = path.substring(pos + 1).toLowerCase(); + } + URL url = null; try { - // replace date field in old header by actual date, this is according to RFC - cachedResponseHeader.put(httpHeader.DATE, httpc.dateString(httpc.nowDate())); - - // maybe the content length is missing - if (!(cachedResponseHeader.containsKey(httpHeader.CONTENT_LENGTH))) - 
cachedResponseHeader.put(httpHeader.CONTENT_LENGTH, Long.toString(cacheFile.length())); - - // check if we can send a 304 instead the complete content - if (requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) { - // conditional request: freshness of cache for that condition was already - // checked within shallUseCache(). Now send only a 304 response - log.logInfo("CACHE HIT/304 " + cacheFile.toString()); - - // send cached header with replaced date and added length - respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified' - - } else { - // unconditional request: send content of cache - log.logInfo("CACHE HIT/203 " + cacheFile.toString()); - - // send cached header with replaced date and added length - respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative' - - // make a transformer - if ((!(transformer.isIdentityTransformer())) && - ((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) && - ((cachedResponseHeader == null) || (plasmaParser.realtimeParsableMimeTypesContains(cachedResponseHeader.mime())))) { - hfos = new htmlFilterOutputStream(respond, null, transformer, (ext.length() == 0)); - } else { - hfos = respond; - } - - // send also the complete body now from the cache - // simply read the file and transfer to out socket - InputStream is = null; - try { - is = new FileInputStream(cacheFile); - byte[] buffer = new byte[2048]; - int l; - while ((l = is.read(buffer)) > 0) {hfos.write(buffer, 0, l);} - } finally { - if (is != null) try { is.close(); } catch (Exception e) {} - } - if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); - } - // that's it! - } catch (SocketException e) { - // this happens if the client stops loading the file - // we do nothing here - respondError(respond, "111 socket error: " + e.getMessage(), 1, url.toString()); + url = new URL("http", host, port, (args == null) ? path : path + "?" 
+ args); + } catch (MalformedURLException e) { + String errorMsg = "ERROR: internal error with url generation: host=" + + host + ", port=" + port + ", path=" + path + ", args=" + args; + serverLog.logError("PROXY", errorMsg); + httpd.sendRespondError(conProp,respond,4,501,null,errorMsg,e); + return; + } + + // check the blacklist + // blacklist idea inspired by [AS]: + // respond with a 403 for all AGIS ("all you get is shit") servers + String hostlow = host.toLowerCase(); + if (blacklistedURL(hostlow, path)) { + httpd.sendRespondError(conProp,respond,4,403,null, + "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); + this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'"); + return; + } + + // handle outgoing cookies + handleOutgoingCookies(requestHeader, host, ip); + + // set another userAgent, if not yellowlisted + if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) { + // change the User-Agent + requestHeader.put(httpHeader.USER_AGENT, userAgent); } + + // decide whether to use a cache entry or connect to the network + File cacheFile = cacheManager.getCachePath(url); + String urlHash = plasmaURL.urlHash(url); + httpHeader cachedResponseHeader = cacheManager.getCachedResponse(urlHash); + boolean cacheExists = ((cacheFile.exists()) && (cacheFile.isFile()) && (cachedResponseHeader != null)); + + // why are files unzipped upon arrival? why not zip all files in cache? + // This follows from the following premises + // (a) no file shall be unzip-ed more than once to prevent unnecessary computing time + // (b) old cache entries shall be comparable with refill-entries to detect/distinguish case 3+4 + // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later + // cases b and c contradict, if we use a scheduler, because files in a stale cache would be unzipped + // and the new arrival would be zipped and would have to be unzipped upon load. But then the + // scheduler is superfluous. 
Therefore the only remaining case is + // (d) cached files shall be either all zipped or unzipped + // case d contradicts with a, because files need to be unzipped for indexing. Therefore + // the only remaining case is to unzip files right upon load. That's what we do here. + + // finally use existing cache if appropriate + // here we must decide whether or not to save the data + // to a cache + // we distinguish four CACHE STATE cases: + // 1. cache fill + // 2. cache fresh - no refill + // 3. cache stale - refill - necessary + // 4. cache stale - refill - superfluous + // in two of these cases we trigger a scheduler to handle newly arrived files: + // case 1 and case 3 + plasmaHTCache.Entry cacheEntry = (cachedResponseHeader == null) ? null : + cacheManager.newEntry( + requestDate, // init date + 0, // crawling depth + url, // url + requestHeader, // request headers + "200 OK", // request status + cachedResponseHeader, // response headers + null, // initiator + switchboard.defaultProxyProfile // profile + ); + + if (cacheExists && cacheEntry.shallUseCache()) { + fulfillRequestFromCache(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,respond); + } else { + fulfillRequestFromWeb(conProp,url,ext,requestHeader,cachedResponseHeader,cacheFile,respond); + } + + } catch (Exception e) { + String errorMsg = "Unexpected Error. 
" + e.getClass().getName() + ": " + e.getMessage(); + System.err.println("PROXY: " + errorMsg); + this.theLogger.logError(errorMsg); + } finally { respond.flush(); - return; + if (respond instanceof httpdByteCountOutputStream) ((httpdByteCountOutputStream)respond).finish(); + + this.currentConProp.put(httpd.CONNECTION_PROP_REQUEST_END,new Long(System.currentTimeMillis())); + this.currentConProp.put(httpd.CONNECTION_PROP_PROXY_RESPOND_SIZE,new Long(((httpdByteCountOutputStream)respond).getCount())); + this.logProxyAccess(); } + } + + private void fulfillRequestFromWeb(Properties conProp, URL url,String ext, httpHeader requestHeader, httpHeader cachedResponseHeader, File cacheFile, OutputStream respond) { - // the cache does either not exist or is (supposed to be) stale - long sizeBeforeDelete = -1; - if (cacheExists) { - // delete the cache - sizeBeforeDelete = cacheFile.length(); - cacheFile.delete(); - } + GZIPOutputStream gzippedOut = null; + httpChunkedOutputStream chunkedOut = null; + OutputStream hfos = null; + htmlFilterContentScraper scraper = null; - // take a new file from the server httpc remote = null; - httpc.response res = null; - + httpc.response res = null; try { - // open the connection - if (yAddress == null) { - remote = newhttpc(host, port, timeout); + + String host = conProp.getProperty(httpd.CONNECTION_PROP_HOST); + String path = conProp.getProperty(httpd.CONNECTION_PROP_PATH); // always starts with leading '/' + String args = conProp.getProperty(httpd.CONNECTION_PROP_ARGS); // may be null if no args were given + String ip = conProp.getProperty(httpd.CONNECTION_PROP_CLIENTIP); // the ip from the connecting peer + String httpVer = conProp.getProperty(httpd.CONNECTION_PROP_HTTP_VER); // the ip from the connecting peer + + int port, pos; + if ((pos = host.indexOf(":")) < 0) { + port = 80; } else { - remote = newhttpc(yAddress, timeout); - } - //System.out.println("HEADER: CLIENT TO PROXY = " + requestHeader.toString()); // DEBUG + port = 
Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + // resolve yacy and yacyh domains + String yAddress = yacyCore.seedDB.resolveYacyAddress(host); + + // re-calc the url path + String remotePath = (args == null) ? path : (path + "?" + args); // with leading '/' + + // attach possible yacy-sublevel-domain + if ((yAddress != null) && + ((pos = yAddress.indexOf("/")) >= 0) && + (!(remotePath.startsWith("/env"))) // this is the special path, staying always at root-level + ) remotePath = yAddress.substring(pos) + remotePath; + + // open the connection + remote = (yAddress == null) ? newhttpc(host, port, timeout) : newhttpc(yAddress, timeout); + + // removing hop by hop headers + this.removeHopByHopHeaders(requestHeader); // send request res = remote.GET(remotePath, requestHeader); + conProp.put(httpd.CONNECTION_PROP_CLIENT_REQUEST_HEADER,requestHeader); long contentLength = res.responseHeader.contentLength(); + // if the content length is not set we have to use chunked transfer encoding + if (contentLength < 0) { + // according to http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html + // a 204,304 message must not contain a message body. + // Therefore we need to set the content-length to 0. + if (res.status.startsWith("204") || + res.status.startsWith("304")) { + res.responseHeader.put(httpHeader.CONTENT_LENGTH,"0"); + } else { + if (httpVer.equals("HTTP/0.9") || httpVer.equals("HTTP/1.0")) { + conProp.setProperty(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } else { + res.responseHeader.put(httpHeader.TRANSFER_ENCODING, "chunked"); + chunkedOut = new httpChunkedOutputStream(respond); + } + res.responseHeader.remove(httpHeader.CONTENT_LENGTH); + } + } + +// if (((String)requestHeader.get(httpHeader.ACCEPT_ENCODING,"")).indexOf("gzip") != -1) { +// zipped = new GZIPOutputStream((chunked != null) ? 
chunked : respond); +// res.responseHeader.put(httpHeader.CONTENT_ENCODING, "gzip"); +// res.responseHeader.remove(httpHeader.CONTENT_LENGTH); +// } + + // the cache does either not exist or is (supposed to be) stale + long sizeBeforeDelete = -1; + if ((cacheFile.exists()) && (cacheFile.isFile()) && (cachedResponseHeader != null)) { + // delete the cache + sizeBeforeDelete = cacheFile.length(); + cacheFile.delete(); + conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_MISS"); + } + // reserver cache entry - cacheEntry = cacheManager.newEntry(requestDate, 0, url, requestHeader, res.status, res.responseHeader, null, switchboard.defaultProxyProfile); + Date requestDate = new Date(((Long)conProp.get(httpd.CONNECTION_PROP_REQUEST_START)).longValue()); + plasmaHTCache.Entry cacheEntry = cacheManager.newEntry( + requestDate, + 0, + url, + requestHeader, + res.status, + res.responseHeader, + null, + switchboard.defaultProxyProfile + ); // handle file types if (((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) && - (plasmaParser.realtimeParsableMimeTypesContains(res.responseHeader.mime()))) { - // this is a file that is a possible candidate for parsing by the indexer + (plasmaParser.realtimeParsableMimeTypesContains(res.responseHeader.mime()))) { + // this is a file that is a possible candidate for parsing by the indexer if (transformer.isIdentityTransformer()) { - log.logDebug("create passthrough (parse candidate) for url " + url); - // no transformation, only passthrough - // this is especially the case if the bluelist is empty - // in that case, the content is not scraped here but later - hfos = respond; + this.theLogger.logDebug("create passthrough (parse candidate) for url " + url); + // no transformation, only passthrough + // this is especially the case if the bluelist is empty + // in that case, the content is not scraped here but later + hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? 
chunkedOut : respond); } else { // make a scraper and transformer - log.logDebug("create scraper for url " + url); + this.theLogger.logDebug("create scraper for url " + url); scraper = new htmlFilterContentScraper(url); - hfos = new htmlFilterOutputStream(respond, scraper, transformer, (ext.length() == 0)); + hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond), scraper, transformer, (ext.length() == 0)); if (((htmlFilterOutputStream) hfos).binarySuspect()) { scraper = null; // forget it, may be rubbish - log.logDebug("Content of " + url + " is probably binary. deleted scraper."); + this.theLogger.logDebug("Content of " + url + " is probably binary. deleted scraper."); } cacheEntry.scraper = scraper; } } else { - log.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); + this.theLogger.logDebug("Resource " + url + " has wrong extension (" + ext + ") or wrong mime-type (" + res.responseHeader.mime() + "). not scraped"); scraper = null; - hfos = respond; + hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); cacheEntry.scraper = scraper; } // handle incoming cookies handleIncomingCookies(res.responseHeader, host, ip); + // remove hop by hop headers + this.removeHopByHopHeaders(res.responseHeader); + // request has been placed and result has been returned. 
work off response try { - respondHeader(respond, res.status, res.responseHeader); + httpd.sendRespondHeader(conProp,respond,httpVer,Integer.parseInt(res.status.split(" ")[0]),res.status.split(" ")[1],res.responseHeader); + String storeError; if ((storeError = cacheEntry.shallStoreCache()) == null) { // we write a new cache entry - if ((contentLength > 0) && // known - (contentLength < 1048576)) {// 1 MB + if ((contentLength > 0) && (contentLength < 1048576)) // if the length is known and < 1 MB + { // ok, we don't write actually into a file, only to RAM, and schedule writing the file. byte[] cacheArray = res.writeContent(hfos); - log.logDebug("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length))); - + this.theLogger.logDebug("writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length))); + if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); if (sizeBeforeDelete == -1) { // totally fresh file cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert cacheManager.stackProcess(cacheEntry, cacheArray); + conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS"); } else if (sizeBeforeDelete == cacheArray.length) { // before we came here we deleted a cache entry cacheArray = null; cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD; cacheManager.stackProcess(cacheEntry); // unnecessary update + conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REF_FAIL_HIT"); } else { // before we came here we deleted a cache entry cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD; cacheManager.stackProcess(cacheEntry, cacheArray); // necessary update, write response header to cache - } + conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_MISS"); + } } else { // the file is too big to cache it in the ram, or the size is unknown - // write to file right here. 
+ // write to file right here. cacheFile.getParentFile().mkdirs(); res.writeContent(hfos, cacheFile); if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); - log.logDebug("for write-file of " + url + ": contentLength = " + contentLength + ", sizeBeforeDelete = " + sizeBeforeDelete); + this.theLogger.logDebug("for write-file of " + url + ": contentLength = " + contentLength + ", sizeBeforeDelete = " + sizeBeforeDelete); if (sizeBeforeDelete == -1) { // totally fresh file cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert @@ -588,12 +649,12 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD; cacheManager.stackProcess(cacheEntry); // necessary update, write response header to cache } - // beware! all these writings will not fill the cacheEntry.cacheArray - // that means they are not available for the indexer (except they are scraped before) + // beware! all these writings will not fill the cacheEntry.cacheArray + // that means they are not available for the indexer (except they are scraped before) } } else { // no caching - log.logDebug(cacheFile.toString() + " not cached: " + storeError); + this.theLogger.logDebug(cacheFile.toString() + " not cached: " + storeError); res.writeContent(hfos, null); if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); if (sizeBeforeDelete == -1) { @@ -606,6 +667,15 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt cacheManager.stackProcess(cacheEntry); } } + + if (gzippedOut != null) { + gzippedOut.finish(); + } + if (chunkedOut != null) { + chunkedOut.finish(); + chunkedOut.flush(); + } + } catch (SocketException e) { // this may happen if the client suddenly closes its connection // maybe the user has stopped loading @@ -613,18 +683,27 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // but we 
clean the cache also, since it may be only partial // and most possible corrupted if (cacheFile.exists()) cacheFile.delete(); - respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null)); + if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + httpd.sendRespondError(conProp,respond,4,404,null,"client unexpectedly closed connection",e); + } else { + conProp.put(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } } catch (IOException e) { - // can have various reasons + // can have various reasons if (cacheFile.exists()) cacheFile.delete(); - if (e.getMessage().indexOf("Corrupt GZIP trailer") >= 0) { - // just do nothing, we leave it this way - log.logDebug("ignoring bad gzip trail for URL " + url + " (" + e.getMessage() + ")"); - } else { - respondHeader(respond,"404 client unexpectedly closed connection", new httpHeader(null)); - log.logDebug("IOError for URL " + url + " (" + e.getMessage() + ") - responded 404"); - e.printStackTrace(); - } + if (e.getMessage().indexOf("Corrupt GZIP trailer") >= 0) { + // just do nothing, we leave it this way + this.theLogger.logDebug("ignoring bad gzip trail for URL " + url + " (" + e.getMessage() + ")"); + conProp.put(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } else { + if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + httpd.sendRespondError(conProp,respond,4,404,null,"client unexpectedly closed connection",e); + this.theLogger.logDebug("IOError for URL " + url + " (" + e.getMessage() + ") - responded 404"); + } else { + conProp.put(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } + } + } } catch (Exception e) { // this may happen if the targeted host does not exist or anything with the @@ -632,293 +711,419 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt // in any case, sending a 404 is appropriate try { if ((e.toString().indexOf("unknown host")) > 0) { - respondHeader(respond,"404 unknown host", new httpHeader(null)); + if 
(!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + httpd.sendRespondError(conProp,respond,4,404,null,"unknown host",e); + } else { + conProp.put(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } } else { - respondHeader(respond,"404 Not Found", new httpHeader(null)); - respond.write(("Exception occurred:\r\n").getBytes()); - respond.write((e.toString() + "\r\n").getBytes()); - respond.write(("[TRACE: ").getBytes()); - e.printStackTrace(new PrintStream(respond)); - respond.write(("]\r\n").getBytes()); + if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + httpd.sendRespondError(conProp,respond,4,404,null,"Not Found",e); + } else { + conProp.put(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } } - } catch (Exception ee) {} + } catch (Exception ee) { + conProp.put(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } } finally { if (remote != null) httpc.returnInstance(remote); - } - respond.flush(); + } } - - private void respondError(OutputStream respond, String origerror, int errorcase, String url) { - FileInputStream fis = null; + + private void fulfillRequestFromCache( + Properties conProp, + URL url, + String ext, + httpHeader requestHeader, + httpHeader cachedResponseHeader, + File cacheFile, + OutputStream respond + ) throws IOException { + + String httpVer = conProp.getProperty(httpd.CONNECTION_PROP_HTTP_VER); + + httpChunkedOutputStream chunkedOut = null; + GZIPOutputStream gzippedOut = null; + OutputStream hfos = null; + + // we respond on the request by using the cache, the cache is fresh try { - // set rewrite values - serverObjects tp = new serverObjects(); - tp.put("errormessage", errorcase); - tp.put("httperror", origerror); - tp.put("url", url); - - // rewrite the file - File file = new File(htRootPath, "/proxymsg/error.html"); - byte[] result; - ByteArrayOutputStream o = new ByteArrayOutputStream(); - fis = new FileInputStream(file); - httpTemplate.writeTemplate(fis, o, tp, "-UNRESOLVED_PATTERN-".getBytes()); - o.close(); 
- result = o.toByteArray(); + // replace date field in old header by actual date, this is according to RFC + cachedResponseHeader.put(httpHeader.DATE, httpc.dateString(httpc.nowDate())); - // return header - httpHeader header = new httpHeader(); - header.put(httpHeader.DATE, httpc.dateString(httpc.nowDate())); - header.put(httpHeader.CONTENT_TYPE, "text/html"); - header.put(httpHeader.CONTENT_LENGTH, "" + o.size()); - header.put(httpHeader.PRAGMA, "no-cache"); +// if (((String)requestHeader.get(httpHeader.ACCEPT_ENCODING,"")).indexOf("gzip") != -1) { +// chunked = new httpChunkedOutputStream(respond); +// zipped = new GZIPOutputStream(chunked); +// cachedResponseHeader.put(httpHeader.TRANSFER_ENCODING, "chunked"); +// cachedResponseHeader.put(httpHeader.CONTENT_ENCODING, "gzip"); +// } else { + // maybe the content length is missing +// if (!(cachedResponseHeader.containsKey(httpHeader.CONTENT_LENGTH))) +// cachedResponseHeader.put(httpHeader.CONTENT_LENGTH, Long.toString(cacheFile.length())); +// } - // write the array to the client - respondHeader(respond, origerror, header); - serverFileUtils.write(result, respond); - respond.flush(); - } catch (IOException e) { + // check if we can send a 304 instead of the complete content + if (requestHeader.containsKey(httpHeader.IF_MODIFIED_SINCE)) { + // conditional request: freshness of cache for that condition was already + // checked within shallUseCache(). 
Now send only a 304 response + this.theLogger.logInfo("CACHE HIT/304 " + cacheFile.toString()); + conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_HIT"); + + // setting the content length header to 0 + cachedResponseHeader.put(httpHeader.CONTENT_LENGTH, Integer.toString(0)); + + // send cached header with replaced date and added length + httpd.sendRespondHeader(conProp,respond,httpVer,304,cachedResponseHeader); + //respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified' + } else { + // unconditional request: send content of cache + this.theLogger.logInfo("CACHE HIT/203 " + cacheFile.toString()); + conProp.setProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_HIT"); + + // setting the content header to the proper length + cachedResponseHeader.put(httpHeader.CONTENT_LENGTH, Long.toString(cacheFile.length())); + + // send cached header with replaced date and added length + httpd.sendRespondHeader(conProp,respond,httpVer,203,cachedResponseHeader); + //respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative' + + // make a transformer + if ((!(transformer.isIdentityTransformer())) && + ((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) && + ((cachedResponseHeader == null) || (plasmaParser.realtimeParsableMimeTypesContains(cachedResponseHeader.mime())))) { + hfos = new htmlFilterOutputStream((chunkedOut != null) ? chunkedOut : respond, null, transformer, (ext.length() == 0)); + } else { + hfos = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); + } + + // send also the complete body now from the cache + // simply read the file and transfer to out socket + serverFileUtils.copy(cacheFile,hfos); + + if (hfos instanceof htmlFilterOutputStream) ((htmlFilterOutputStream) hfos).finalize(); + if (gzippedOut != null) gzippedOut.finish(); + if (chunkedOut != null) chunkedOut.finish(); + } + // that's it! 
+ } catch (Exception e) { + // this happens if the client stops loading the file + // we do nothing here + if (conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + this.theLogger.logWarning("Error while trying to send cached message body."); + conProp.setProperty(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } else { + httpd.sendRespondError(conProp,respond,4,503,"socket error: " + e.getMessage(),"socket error: " + e.getMessage(), e); + } } finally { - if (fis != null) try { fis.close(); } catch (Exception e) {} + respond.flush(); } + return; } - public void doHead(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException { - String method = conProp.getProperty("METHOD"); - String host = conProp.getProperty("HOST"); - String path = conProp.getProperty("PATH"); - String args = conProp.getProperty("ARGS"); // may be null if no args were given - int port; - int pos; - if ((pos = host.indexOf(":")) < 0) { - port = 80; - } else { - port = Integer.parseInt(host.substring(pos + 1)); - host = host.substring(0, pos); - } - // check the blacklist, inspired by [AS]: respond a 404 for all AGIS (all you get is shit) servers - String hostlow = host.toLowerCase(); + private void removeHopByHopHeaders(httpHeader headers) { + /* + * - Connection + - Keep-Alive + - Proxy-Authenticate + - Proxy-Authorization + - TE + - Trailers + - Transfer-Encoding + - Upgrade + */ + + headers.remove(httpHeader.CONNECTION); + headers.remove(httpHeader.PROXY_CONNECTION); + headers.remove(httpHeader.PROXY_AUTHENTICATE); + headers.remove(httpHeader.PROXY_AUTHORIZATION); + + // special headers inserted by squid + headers.remove(httpHeader.X_CACHE); + headers.remove(httpHeader.X_CACHE_LOOKUP); + } + + public void doHead(Properties conProp, httpHeader requestHeader, OutputStream respond) throws IOException { + this.currentConProp = conProp; + + String method = conProp.getProperty("METHOD"); + String host = conProp.getProperty("HOST"); + String path = 
conProp.getProperty("PATH"); + String args = conProp.getProperty("ARGS"); // may be null if no args were given + String httpVer = conProp.getProperty(httpd.CONNECTION_PROP_HTTP_VER); + + int port; + int pos; + if ((pos = host.indexOf(":")) < 0) { + port = 80; + } else { + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + // check the blacklist, inspired by [AS]: respond a 404 for all AGIS (all you get is shit) servers + String hostlow = host.toLowerCase(); if (blacklistedURL(hostlow, path)) { try { - respondHeader(respond,"404 Not Found (AGIS)", new httpHeader(null)); - respond.write(("404 (generated): URL '" + hostlow + "' blocked by yacy proxy (blacklisted)\r\n").getBytes()); - respond.flush(); - serverLog.logInfo("PROXY", "AGIS blocking of host '" + hostlow + "'"); // debug + byte[] errorMsg = ("404 (generated): URL '" + hostlow + "' blocked by yacy proxy (blacklisted)\r\n").getBytes(); + httpd.sendRespondHeader(conProp,respond,httpVer,404,"Not Found (AGIS)",0); + this.theLogger.logInfo("AGIS blocking of host '" + hostlow + "'"); // debug return; } catch (Exception ee) {} } - - // set another userAgent, if not yellowlisted - if (!(yellowList.contains(domain(hostlow)))) { - // change the User-Agent - requestHeader.put(httpHeader.USER_AGENT, userAgent); - } - + + // set another userAgent, if not yellowlisted + if (!(yellowList.contains(domain(hostlow)))) { + // change the User-Agent + requestHeader.put(httpHeader.USER_AGENT, userAgent); + } + // resolve yacy and yacyh domains String yAddress = yacyCore.seedDB.resolveYacyAddress(host); // re-calc the url path - String remotePath = (args == null) ? path : (path + "?" + args); + String remotePath = (args == null) ? path : (path + "?" 
+ args); // attach possible yacy-sublevel-domain if ((yAddress != null) && ((pos = yAddress.indexOf("/")) >= 0)) remotePath = yAddress.substring(pos) + remotePath; - - httpc remote = null; - httpc.response res = null; - - try { - // open the connection + + httpc remote = null; + httpc.response res = null; + + try { + // open the connection if (yAddress == null) { remote = newhttpc(host, port, timeout); } else { remote = newhttpc(yAddress, timeout); // with [AS] patch } - res = remote.HEAD(remotePath, requestHeader); - respondHeader(respond, res.status, res.responseHeader); - } catch (Exception e) { - try { - respondHeader(respond,"404 Not Found", new httpHeader(null)); - respond.write(("Exception occurred:\r\n").getBytes()); - respond.write((e.toString() + "\r\n").getBytes()); - respond.write(("[TRACE: ").getBytes()); - e.printStackTrace(new PrintStream(respond)); - respond.write(("]\r\n").getBytes()); - } catch (Exception ee) {} - } finally { - if (remote != null) httpc.returnInstance(remote); + res = remote.HEAD(remotePath, requestHeader); + httpd.sendRespondHeader(conProp,respond,httpVer,Integer.parseInt(res.status.split(" ")[0]),res.responseHeader); + //respondHeader(respond, res.status, res.responseHeader); + } catch (Exception e) { + try { + if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + httpd.sendRespondError(conProp,respond,4,404,null,"Not Found",e); + } else { + conProp.setProperty(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } + } catch (Exception ee) { + conProp.setProperty(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } + } finally { + if (remote != null) httpc.returnInstance(remote); + } + + respond.flush(); } - respond.flush(); - } - public void doPost(Properties conProp, httpHeader requestHeader, OutputStream respond, PushbackInputStream body) throws IOException { - String host = conProp.getProperty("HOST"); - String path = conProp.getProperty("PATH"); - String args = conProp.getProperty("ARGS"); // may be null if no args 
were given - int port; - int pos; - if ((pos = host.indexOf(":")) < 0) { - port = 80; - } else { - port = Integer.parseInt(host.substring(pos + 1)); - host = host.substring(0, pos); - } - - // set another userAgent, if not yellowlisted - if (!(yellowList.contains(domain(host).toLowerCase()))) { - // change the User-Agent - requestHeader.put(httpHeader.USER_AGENT, userAgent); - } - - // resolve yacy and yacyh domains - String yAddress = yacyCore.seedDB.resolveYacyAddress(host); - // re-calc the url path - String remotePath = (args == null) ? path : (path + "?" + args); + this.currentConProp = conProp; - // attach possible yacy-sublevel-domain - if ((yAddress != null) && ((pos = yAddress.indexOf("/")) >= 0)) remotePath = yAddress.substring(pos) + remotePath; - - httpc remote = null; - httpc.response res = null; - - try { - if (yAddress == null) { - remote = newhttpc(host, port, timeout); + try { + // remembering the starting time of the request + Date requestDate = new Date(); // remember the time... 
+ this.currentConProp.put(httpd.CONNECTION_PROP_REQUEST_START,new Long(requestDate.getTime())); + + // using a ByteCount OutputStream to count the sent bytes + respond = new httpdByteCountOutputStream(respond,conProp.getProperty(httpd.CONNECTION_PROP_REQUESTLINE).length() + 2); + + String host = conProp.getProperty(httpd.CONNECTION_PROP_HOST); + String path = conProp.getProperty(httpd.CONNECTION_PROP_PATH); + String args = conProp.getProperty(httpd.CONNECTION_PROP_ARGS); // may be null if no args were given + String httpVer = conProp.getProperty(httpd.CONNECTION_PROP_HTTP_VER); + + int port, pos; + if ((pos = host.indexOf(":")) < 0) { + port = 80; } else { - remote = newhttpc(yAddress, timeout); + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); } - res = remote.POST(remotePath, requestHeader, body); - respondHeader(respond, res.status, res.responseHeader); - res.writeContent(respond, null); - remote.close(); - } catch (Exception e) { - try { - respondHeader(respond,"404 Not Found", new httpHeader(null)); - respond.write(("Exception occurred:\r\n").getBytes()); - respond.write((e.toString() + "\r\n").getBytes()); - respond.write(("[TRACE: ").getBytes()); - e.printStackTrace(new PrintStream(respond)); - respond.write(("]\r\n").getBytes()); - } catch (Exception ee) {} - } finally { - if (remote != null) httpc.returnInstance(remote); - } - respond.flush(); + + // set another userAgent, if not yellowlisted + if (!(yellowList.contains(domain(host).toLowerCase()))) { + // change the User-Agent + requestHeader.put(httpHeader.USER_AGENT, userAgent); + } + + // resolve yacy and yacyh domains + String yAddress = yacyCore.seedDB.resolveYacyAddress(host); + + // re-calc the url path + String remotePath = (args == null) ? path : (path + "?" 
+ args); + + // attach possible yacy-sublevel-domain + if ((yAddress != null) && ((pos = yAddress.indexOf("/")) >= 0)) remotePath = yAddress.substring(pos) + remotePath; + + httpc remote = null; + httpc.response res = null; + try { + remote = (yAddress == null) ? newhttpc(host, port, timeout) : newhttpc(yAddress, timeout); + res = remote.POST(remotePath, requestHeader, body); + + // if the content length is not set we need to use chunked content encoding + long contentLength = res.responseHeader.contentLength(); + httpChunkedOutputStream chunked = null; + if (contentLength <= 0) { + res.responseHeader.put(httpHeader.TRANSFER_ENCODING, "chunked"); + res.responseHeader.remove(httpHeader.CONTENT_LENGTH); + chunked = new httpChunkedOutputStream(respond); + } + + // filtering out unwanted headers + this.removeHopByHopHeaders(res.responseHeader); + + httpd.sendRespondHeader(conProp,respond,httpVer,Integer.parseInt(res.status.split(" ")[0]),res.responseHeader); + // respondHeader(respond, res.status, res.responseHeader); + res.writeContent((chunked != null) ? chunked : respond, null); + if (chunked != null) chunked.finish(); + + remote.close(); + } catch (Exception e) { + try { + if (!conProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + httpd.sendRespondError(conProp,respond,4,404,null,"Not Found",e); + } else { + conProp.setProperty(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } + } catch (Exception ee) { + conProp.setProperty(httpd.CONNECTION_PROP_PERSISTENT,"close"); + } + } finally { + if (remote != null) httpc.returnInstance(remote); + } + respond.flush(); + } catch (Exception e) { + String errorMsg = "Unexpected Error. 
" + e.getClass().getName() + ": " + e.getMessage(); + System.err.println("PROXY: " + errorMsg); + this.theLogger.logError(errorMsg); + } finally { + respond.flush(); + if (respond instanceof httpdByteCountOutputStream) ((httpdByteCountOutputStream)respond).finish(); + + this.currentConProp.put(httpd.CONNECTION_PROP_REQUEST_END,new Long(System.currentTimeMillis())); + this.currentConProp.put(httpd.CONNECTION_PROP_PROXY_RESPOND_SIZE,new Long(((httpdByteCountOutputStream)respond).getCount())); + this.logProxyAccess(); + } } - + public void doConnect(Properties conProp, de.anomic.http.httpHeader requestHeader, InputStream clientIn, OutputStream clientOut) throws IOException { + this.currentConProp = conProp; + String host = conProp.getProperty("HOST"); - int port = Integer.parseInt(conProp.getProperty("PORT")); - String httpVersion = conProp.getProperty("HTTP"); - int timeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000")); - - // possibly branch into PROXY-PROXY connection - if (remoteProxyUse) { + int port = Integer.parseInt(conProp.getProperty("PORT")); + String httpVersion = conProp.getProperty("HTTP"); + int timeout = Integer.parseInt(switchboard.getConfig("clientTimeout", "10000")); + + // possibly branch into PROXY-PROXY connection + if (remoteProxyUse) { httpc remoteProxy = null; try { - remoteProxy = httpc.getInstance(host, port, timeout, false, remoteProxyHost, remoteProxyPort); - httpc.response response = remoteProxy.CONNECT(host, port, requestHeader); - response.print(); - if (response.success()) { - // replace connection details - host = remoteProxyHost; - port = remoteProxyPort; - // go on (see below) + remoteProxy = httpc.getInstance(host, port, timeout, false, remoteProxyHost, remoteProxyPort); + httpc.response response = remoteProxy.CONNECT(host, port, requestHeader); + response.print(); + if (response.success()) { + // replace connection details + host = remoteProxyHost; + port = remoteProxyPort; + // go on (see below) } else { - 
// pass error response back to client - respondHeader(clientOut, response.status, response.responseHeader); - return; + // pass error response back to client + httpd.sendRespondHeader(conProp,clientOut,httpVersion,Integer.parseInt(response.status.split(" ")[0]),response.responseHeader); + //respondHeader(clientOut, response.status, response.responseHeader); + return; } } catch (Exception e) { throw new IOException(e.getMessage()); } finally { if (remoteProxy != null) httpc.returnInstance(remoteProxy); } - } - - // try to establish connection to remote host - Socket sslSocket = new Socket(host, port); - sslSocket.setSoTimeout(timeout); // waiting time for write - sslSocket.setSoLinger(true, timeout); // waiting time for read - InputStream promiscuousIn = sslSocket.getInputStream(); - OutputStream promiscuousOut = sslSocket.getOutputStream(); - - // now then we can return a success message - clientOut.write((httpVersion + " 200 Connection established" + serverCore.crlfString + - "Proxy-agent: YACY" + serverCore.crlfString + - serverCore.crlfString).getBytes()); - - log.logInfo("SSL CONNECTION TO " + host + ":" + port + " ESTABLISHED"); - - // start stream passing with mediate processes - try { - Mediate cs = new Mediate(sslSocket, clientIn, promiscuousOut); - Mediate sc = new Mediate(sslSocket, promiscuousIn, clientOut); - cs.start(); - sc.start(); - while ((sslSocket != null) && - (sslSocket.isBound()) && - (!(sslSocket.isClosed())) && - (sslSocket.isConnected()) && - ((cs.isAlive()) || (sc.isAlive()))) { - // idle - try {Thread.currentThread().sleep(1000);} catch (InterruptedException e) {} // wait a while - } - // set stop mode - cs.pleaseTerminate(); - sc.pleaseTerminate(); - // wake up thread - cs.interrupt(); - sc.interrupt(); - // ...hope they have terminated... 
- } catch (IOException e) { - //System.out.println("promiscuous termination: " + e.getMessage()); - } - + } + + // try to establish connection to remote host + Socket sslSocket = new Socket(host, port); + sslSocket.setSoTimeout(timeout); // waiting time for write + sslSocket.setSoLinger(true, timeout); // waiting time for read + InputStream promiscuousIn = sslSocket.getInputStream(); + OutputStream promiscuousOut = sslSocket.getOutputStream(); + + // now then we can return a success message + clientOut.write((httpVersion + " 200 Connection established" + serverCore.crlfString + + "Proxy-agent: YACY" + serverCore.crlfString + + serverCore.crlfString).getBytes()); + + this.theLogger.logInfo("SSL CONNECTION TO " + host + ":" + port + " ESTABLISHED"); + + // start stream passing with mediate processes + try { + Mediate cs = new Mediate(sslSocket, clientIn, promiscuousOut); + Mediate sc = new Mediate(sslSocket, promiscuousIn, clientOut); + cs.start(); + sc.start(); + while ((sslSocket != null) && + (sslSocket.isBound()) && + (!(sslSocket.isClosed())) && + (sslSocket.isConnected()) && + ((cs.isAlive()) || (sc.isAlive()))) { + // idle + try {Thread.currentThread().sleep(1000);} catch (InterruptedException e) {} // wait a while + } + // set stop mode + cs.pleaseTerminate(); + sc.pleaseTerminate(); + // wake up thread + cs.interrupt(); + sc.interrupt(); + // ...hope they have terminated... 
+ } catch (IOException e) { + //System.out.println("promiscuous termination: " + e.getMessage()); + } + } - + public class Mediate extends Thread { - - boolean terminate; - Socket socket; - InputStream in; - OutputStream out; - - public Mediate(Socket socket, InputStream in, OutputStream out) throws IOException { - this.terminate = false; - this.in = in; - this.out = out; - this.socket = socket; - } - - public void run() { - byte[] buffer = new byte[512]; - int len; - try { - while ((socket != null) && - (socket.isBound()) && - (!(socket.isClosed())) && - (socket.isConnected()) && - (!(terminate)) && - (in != null) && - (out != null) && - ((len = in.read(buffer)) >= 0) - ) { - out.write(buffer, 0, len); - } - } catch (IOException e) {} - } - - public void pleaseTerminate() { - terminate = true; - } + + boolean terminate; + Socket socket; + InputStream in; + OutputStream out; + + public Mediate(Socket socket, InputStream in, OutputStream out) throws IOException { + this.terminate = false; + this.in = in; + this.out = out; + this.socket = socket; + } + + public void run() { + byte[] buffer = new byte[512]; + int len; + try { + while ((socket != null) && + (socket.isBound()) && + (!(socket.isClosed())) && + (socket.isConnected()) && + (!(terminate)) && + (in != null) && + (out != null) && + ((len = in.read(buffer)) >= 0) + ) { + out.write(buffer, 0, len); + } + } catch (IOException e) {} + } + + public void pleaseTerminate() { + terminate = true; + } } - + private httpc newhttpc(String server, int port, int timeout) throws IOException { - // a new httpc connection, combined with possible remote proxy + // a new httpc connection, combined with possible remote proxy boolean useProxy = remoteProxyUse; // check no-proxy rule if ((useProxy) && (!(remoteProxyAllowProxySet.contains(server)))) { @@ -947,12 +1152,12 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt if (useProxy) { return httpc.getInstance(server, port, timeout, false, 
remoteProxyHost, remoteProxyPort); } else { - return httpc.getInstance(server, port, timeout, false); + return httpc.getInstance(server, port, timeout, false); } } - + private httpc newhttpc(String address, int timeout) throws IOException { - // a new httpc connection for :/ syntax + // a new httpc connection for :/ syntax // this is called when a '.yacy'-domain is used int p = address.indexOf(":"); if (p < 0) return null; @@ -966,74 +1171,106 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt return newhttpc(server, port, timeout); } - private void respondHeader(OutputStream respond, String status, httpHeader header) throws IOException, SocketException { - - // prepare header - //header.put("Server", "AnomicHTTPD (www.anomic.de)"); - if (!(header.containsKey(httpHeader.DATE))) header.put(httpHeader.DATE, httpc.dateString(httpc.nowDate())); - if (!(header.containsKey(httpHeader.CONTENT_TYPE))) header.put(httpHeader.CONTENT_TYPE, "text/html"); // fix this - - StringBuffer headerStringBuffer = new StringBuffer(200); - - // write status line - headerStringBuffer.append("HTTP/1.1 ").append(status).append("\r\n"); - - //System.out.println("HEADER: PROXY TO CLIENT = " + header.toString()); // DEBUG - - // write header - Iterator i = header.keySet().iterator(); - String key; - String value; - char tag; - int count; - //System.out.println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"); - while (i.hasNext()) { - key = (String) i.next(); - tag = key.charAt(0); - if ((tag != '*') && (tag != '#')) { // '#' in key is reserved for proxy attributes as artificial header values - count = header.keyCount(key); - for (int j = 0; j < count; j++) { - headerStringBuffer.append(key).append(": ").append((String) header.getSingle(key, j)).append("\r\n"); - } - //System.out.println("#" + key + ": " + value); - } - } - headerStringBuffer.append("\r\n"); - - // end header - respond.write(headerStringBuffer.toString().getBytes()); - respond.flush(); - } - - private void 
textMessage(OutputStream out, String body) throws IOException { - out.write(("HTTP/1.1 200 OK\r\n").getBytes()); - out.write((httpHeader.SERVER + ": AnomicHTTPD (www.anomic.de)\r\n").getBytes()); - out.write((httpHeader.DATE + ": " + httpc.dateString(httpc.nowDate()) + "\r\n").getBytes()); - out.write((httpHeader.CONTENT_TYPE + ": text/plain\r\n").getBytes()); - out.write((httpHeader.CONTENT_LENGTH + ": " + body.length() +"\r\n").getBytes()); - out.write(("\r\n").getBytes()); - out.flush(); - out.write(body.getBytes()); - out.flush(); + out.write(("HTTP/1.1 200 OK\r\n").getBytes()); + out.write((httpHeader.SERVER + ": AnomicHTTPD (www.anomic.de)\r\n").getBytes()); + out.write((httpHeader.DATE + ": " + httpc.dateString(httpc.nowDate()) + "\r\n").getBytes()); + out.write((httpHeader.CONTENT_TYPE + ": text/plain\r\n").getBytes()); + out.write((httpHeader.CONTENT_LENGTH + ": " + body.length() +"\r\n").getBytes()); + out.write(("\r\n").getBytes()); + out.flush(); + out.write(body.getBytes()); + out.flush(); } - private void transferFile(OutputStream out, File f) throws IOException { - InputStream source = new FileInputStream(f); - byte[] buffer = new byte[4096]; - int bytes_read; - while ((bytes_read = source.read(buffer)) > 0) out.write(buffer, 0, bytes_read); - out.flush(); - source.close(); + /** + * This function is used to generate a logging message according to the + * squid logging format.

+ * e.g.
+ * 1117528623.857 178 192.168.1.201 TCP_MISS/200 1069 GET http://www.yacy.de/ - DIRECT/81.169.145.74 text/html + */ + private final void logProxyAccess() { + + if (!doAccessLogging) return; + + this.logMessage.setLength(0); + + // Timestamp + String currentTimestamp = Long.toString(System.currentTimeMillis()); + int offset = currentTimestamp.length()-3; + + this.logMessage.append(currentTimestamp.substring(0,offset)); + this.logMessage.append('.'); + this.logMessage.append(currentTimestamp.substring(offset)); + this.logMessage.append(' '); + + // Elapsed time + Long requestStart = (Long) this.currentConProp.get(httpd.CONNECTION_PROP_REQUEST_START); + Long requestEnd = (Long) this.currentConProp.get(httpd.CONNECTION_PROP_REQUEST_END); + String elapsed = Long.toString(requestEnd.longValue()-requestStart.longValue()); + + for (int i=0; i<6-elapsed.length(); i++) this.logMessage.append(' '); + this.logMessage.append(elapsed); + this.logMessage.append(' '); + + // Remote Host + String clientIP = this.currentConProp.getProperty(httpd.CONNECTION_PROP_CLIENTIP); + this.logMessage.append(clientIP); + this.logMessage.append(' '); + + // Code/Status + String respondStatus = this.currentConProp.getProperty(httpd.CONNECTION_PROP_PROXY_RESPOND_STATUS); + this.logMessage.append("UNKNOWN/"); + this.logMessage.append(respondStatus); + this.logMessage.append(' '); + + // Bytes + Long bytes = (Long) this.currentConProp.get(httpd.CONNECTION_PROP_PROXY_RESPOND_SIZE); + this.logMessage.append(bytes.toString()); + this.logMessage.append(' '); + + // Method + String requestMethod = this.currentConProp.getProperty(httpd.CONNECTION_PROP_METHOD); + this.logMessage.append(requestMethod); + this.logMessage.append(' '); + + // URL + String requestURL = this.currentConProp.getProperty(httpd.CONNECTION_PROP_URL); + this.logMessage.append(requestURL); + this.logMessage.append(' '); + + // Rfc931 + this.logMessage.append("-"); + this.logMessage.append(' '); + + // Peerstatus/Peerhost + String host 
= this.currentConProp.getProperty(httpd.CONNECTION_PROP_HOST); + this.logMessage.append("DIRECT/"); + this.logMessage.append(host); + this.logMessage.append(' '); + + // Type + String mime = "-"; + if (this.currentConProp.containsKey(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + httpHeader proxyRespondHeader = (httpHeader) this.currentConProp.get(httpd.CONNECTION_PROP_PROXY_RESPOND_HEADER); + mime = proxyRespondHeader.mime(); + if (mime.indexOf(";") != -1) { + mime = mime.substring(0,mime.indexOf(";")); + } + } + this.logMessage.append(mime); + + // sending the logging message to the logger + this.proxyLog.logDebug(this.logMessage.toString()); } - + } /* -proxy test: - -http://www.chipchapin.com/WebTools/cookietest.php? -http://xlists.aza.org/moderator/cookietest/cookietest1.php -http://vancouver-webpages.com/proxy/cache-test.html - -*/ + proxy test: + + http://www.chipchapin.com/WebTools/cookietest.php? + http://xlists.aza.org/moderator/cookietest/cookietest1.php + http://vancouver-webpages.com/proxy/cache-test.html + + */