diff --git a/htroot/robots.java b/htroot/robots.java
index 3432a2c76..4b29a60e3 100644
--- a/htroot/robots.java
+++ b/htroot/robots.java
@@ -10,18 +10,17 @@ import java.util.ArrayList; import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; - import de.anomic.http.server.RobotsTxtConfig; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; public class robots { - + public static servletProperties respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final servletProperties prop = new servletProperties(); final RobotsTxtConfig rbc = ((Switchboard)env).robotstxtConfig; - + if (rbc.isAllDisallowed()) { prop.put(RobotsTxtConfig.ALL, 1); } else {
@@ -35,7 +34,7 @@ public class robots { if (rbc.isSurftipsDisallowed()) prop.put(RobotsTxtConfig.ALL + "_" + RobotsTxtConfig.SURFTIPS, "1"); if (rbc.isWikiDisallowed()) prop.put(RobotsTxtConfig.ALL + "_" + RobotsTxtConfig.WIKI, "1"); if (rbc.isProfileDisallowed()) prop.put(RobotsTxtConfig.ALL + "_" + RobotsTxtConfig.PROFILE, "1"); - + if (rbc.isLockedDisallowed() || rbc.isDirsDisallowed()) { final ArrayList[] p = getFiles(env.getConfig(SwitchboardConstants.HTROOT_PATH, SwitchboardConstants.HTROOT_PATH_DEFAULT)); if (rbc.isLockedDisallowed()) {
@@ -50,10 +49,10 @@ public class robots { } } } - + return prop; } - + @SuppressWarnings("unchecked") private static ArrayList[] getFiles(final String htrootPath) { final File htroot = new File(htrootPath);
diff --git a/source/de/anomic/http/server/AugmentedHtmlStream.java b/source/de/anomic/http/server/AugmentedHtmlStream.java
new file mode 100644
index 000000000..82d944e9c
--- /dev/null
+++ 
b/source/de/anomic/http/server/AugmentedHtmlStream.java
@@ -0,0 +1,122 @@
+package de.anomic.http.server;
+
+import java.io.BufferedWriter;
+import java.io.ByteArrayOutputStream;
+import java.io.FilterOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+
+import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.interaction.AugmentHtmlStream;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.search.Switchboard;
+
+/**
+ * Output stream that keeps everything written to it in memory and, on
+ * {@link #close()}, decodes the collected bytes with the given charset,
+ * passes the text through {@link #process(StringBuffer)} and only then
+ * writes the result to the wrapped stream.
+ */
+public class AugmentedHtmlStream extends FilterOutputStream {
+
+    /** Character writer on top of the wrapped stream; used only by close(). */
+    private final Writer writer;
+    /** Collects all bytes handed to the write methods. */
+    private final ByteArrayOutputStream buffer;
+    private final Charset charset;
+    private final DigestURI url;
+    /** Kept as passed to the constructor; not read anywhere in this class. */
+    private final byte[] urlhash;
+    private final RequestHeader requestHeader;
+    /** Set by the first close() so that repeated calls do nothing. */
+    private boolean closed = false;
+
+    /**
+     * @param out stream that receives the processed text on close
+     * @param charset charset for decoding the buffered bytes and encoding the result
+     * @param url URL of the document; passed on to the augmentation step
+     * @param urlhash stored only, not used by this class
+     * @param requestHeader passed on to the augmentation step
+     */
+    public AugmentedHtmlStream(OutputStream out, Charset charset, DigestURI url, byte[] urlhash, RequestHeader requestHeader) {
+        super(out);
+        this.writer = new BufferedWriter(new OutputStreamWriter(out, charset));
+        this.buffer = new ByteArrayOutputStream();
+        this.charset = charset;
+        this.url = url;
+        this.urlhash = urlhash;
+        this.requestHeader = requestHeader;
+    }
+
+    /** Buffers one byte; nothing reaches the wrapped stream before close(). */
+    @Override
+    public void write(int b) throws IOException {
+        this.buffer.write(b);
+    }
+
+    /** Buffers {@code len} bytes of {@code b}, starting at {@code off}. */
+    @Override
+    public void write(byte[] b, int off, int len) throws IOException {
+        this.buffer.write(b, off, len);
+    }
+
+    /**
+     * Decodes the buffered bytes, processes the text, writes it to the
+     * wrapped stream and closes that stream. Further calls have no effect.
+     *
+     * @throws IOException if decoding, writing or closing fails
+     */
+    @Override
+    public void close() throws IOException {
+        if (this.closed) {
+            return;
+        }
+        this.closed = true;
+        try {
+            final StringBuffer content = new StringBuffer(this.buffer.toString(this.charset.name()));
+            this.writer.write(process(content).toString());
+        } finally {
+            // release the wrapped stream even if processing or writing failed
+            this.writer.close();
+        }
+    }
+
+    /**
+     * Returns {@code data} untouched unless the {@code proxyAugmentation}
+     * setting is enabled and the normalized URL does not contain
+     * {@code currentyacypeer/}; in that case the text is handed to
+     * {@link AugmentHtmlStream#process}.
+     *
+     * @param data the complete buffered document
+     * @return the text that {@link #close()} writes to the wrapped stream
+     */
+    public StringBuffer process(StringBuffer data) {
+        if (!Switchboard.getSwitchboard().getConfigBool("proxyAugmentation", false)) {
+            return data;
+        }
+        if (this.url.toNormalform(false, true).contains("currentyacypeer/")) {
+            return data;
+        }
+        return AugmentHtmlStream.process(data, this.charset, this.url, this.requestHeader);
+    }
+
+    /**
+     * Tells whether a MIME type string denotes HTML. Anything after the
+     * first {@code ';'} is ignored, as are case and surrounding blanks.
+     *
+     * @param mime MIME type, optionally followed by ';'-separated parameters; may be {@code null}
+     * @return {@code true} only if the media type is {@code text/html}
+     */
+    public static boolean supportsMime(String mime) {
+        if (mime == null) {
+            return false;
+        }
+        // indexOf instead of split(";")[0]: split yields an empty array for ";"
+        final int semicolon = mime.indexOf(';');
+        final String type = (semicolon < 0) ? mime : mime.substring(0, semicolon);
+        return "text/html".equalsIgnoreCase(type.trim());
+    }
+}
diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java
index 53aefa7d0..b19df0d07 100644
--- a/source/de/anomic/http/server/HTTPDProxyHandler.java
+++ b/source/de/anomic/http/server/HTTPDProxyHandler.java
@@ -1,1648 +1,1666 @@ -// HTTPDProxyHandler.java -// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 2004 on http://yacy.net -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -// Contributions: -// [AS] Alexander Schier: Blacklist (404 response for AGIS hosts) -// [TL] Timo Leise: url-wildcards for blacklists - -/* - Class documentation: - This class is a servlet to the httpd daemon. It is accessed each time - an URL in a GET, HEAD or POST command contains the whole host information - or a host is given in the header host field of an HTTP/1.0 / HTTP/1.1 - command. - Transparency is maintained, whenever appropriate. We change header - attributes if necessary for the indexing mechanism; i.e. we do not - support gzip-ed encoding. 
We also do not support unrealistic - 'expires' values that would force a cache to be flushed immediately - pragma non-cache attributes are supported -*/ - - -package de.anomic.http.server; - -import java.io.BufferedReader; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.net.BindException; -import java.net.ConnectException; -import java.net.InetAddress; -import java.net.MalformedURLException; -import java.net.NoRouteToHostException; -import java.net.Socket; -import java.net.SocketException; -import java.net.SocketTimeoutException; -import java.net.UnknownHostException; -import java.util.Arrays; -import java.util.Date; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.logging.FileHandler; -import java.util.logging.Level; -import java.util.logging.LogManager; -import java.util.logging.Logger; - -import net.yacy.cora.document.UTF8; -import net.yacy.cora.protocol.ClientIdentification; -import net.yacy.cora.protocol.Domains; -import net.yacy.cora.protocol.HeaderFramework; -import net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.cora.protocol.http.HTTPClient; -import net.yacy.cora.protocol.http.ProxySettings; -import net.yacy.cora.util.NumberTools; -import net.yacy.document.TextParser; -import net.yacy.document.parser.html.ContentTransformer; -import net.yacy.document.parser.html.Transformer; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.io.ByteCountOutputStream; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; -import net.yacy.repository.Blacklist; -import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; -import de.anomic.crawler.Cache; -import de.anomic.crawler.retrieval.Request; -import 
de.anomic.crawler.retrieval.Response; -import de.anomic.server.serverCore; -import de.anomic.server.serverObjects; - -public final class HTTPDProxyHandler { - - - private static final String yacyProxyUserAgent = "yacyproxy (" + ClientIdentification.yacySystem +") http://yacy.net/bot.html"; - - // static variables - // can only be instantiated upon first instantiation of this class object - private static Switchboard sb = null; - private static final HashSet yellowList; - private static int timeout = 60000; - private static boolean yacyTrigger = true; - public static boolean isTransparentProxy = false; - private static Process redirectorProcess = null; - private static boolean redirectorEnabled = false; - private static PrintWriter redirectorWriter = null; - private static BufferedReader redirectorReader = null; - - private static Transformer transformer = null; - private static File htRootPath = null; - - //private Properties connectionProperties = null; - // creating a logger - private static final Log log = new Log("PROXY"); - - private static boolean doAccessLogging = false; - /** - * Do logging configuration for special proxy access log file - */ - static { - - // get a switchboard - sb = Switchboard.getSwitchboard(); - if (sb != null) { - - isTransparentProxy = Boolean.parseBoolean(sb.getConfig("isTransparentProxy","false")); - - // set timeout - timeout = Integer.parseInt(sb.getConfig("proxy.clientTimeout", "10000")); - - // create a htRootPath: system pages - htRootPath = new File(sb.getAppPath(), sb.getConfig("htRootPath","htroot")); - if (!(htRootPath.exists())) { - if(!htRootPath.mkdir()) - Log.logSevere("PROXY", "could not create htRoot "+ htRootPath); - } - - // do logger initialization - try { - log.logInfo("Configuring proxy access logging ..."); - - // getting the logging manager - final LogManager manager = LogManager.getLogManager(); - final String className = HTTPDProxyHandler.class.getName(); - - // determining if proxy access logging is enabled 
- final String enabled = manager.getProperty(className + ".logging.enabled"); - if ("true".equalsIgnoreCase(enabled)) { - - // reading out some needed configuration properties - int limit = 1024*1024, count = 20; - String pattern = manager.getProperty(className + ".logging.FileHandler.pattern"); - if (pattern == null) pattern = "DATA/LOG/proxyAccess%u%g.log"; - // make pattern absolute - if (!new File(pattern).isAbsolute()) pattern = new File(sb.getDataPath(), pattern).getAbsolutePath(); - - final String limitStr = manager.getProperty(className + ".logging.FileHandler.limit"); - if (limitStr != null) try { limit = Integer.parseInt(limitStr); } catch (final NumberFormatException e) {} - - final String countStr = manager.getProperty(className + ".logging.FileHandler.count"); - if (countStr != null) try { count = Integer.parseInt(countStr); } catch (final NumberFormatException e) {} - - // creating the proxy access logger - final Logger proxyLogger = Logger.getLogger("PROXY.access"); - proxyLogger.setUseParentHandlers(false); - proxyLogger.setLevel(Level.FINEST); - - final FileHandler txtLog = new FileHandler(pattern, limit, count, true); - txtLog.setFormatter(new ProxyLogFormatter()); - txtLog.setLevel(Level.FINEST); - proxyLogger.addHandler(txtLog); - - doAccessLogging = true; - log.logInfo("Proxy access logging configuration done." 
+ - "\n\tFilename: " + pattern + - "\n\tLimit: " + limitStr + - "\n\tCount: " + countStr); - } else { - log.logInfo("Proxy access logging is deactivated."); - } - } catch (final Exception e) { - log.logSevere("Unable to configure proxy access logging.",e); - } - - // load a transformer - transformer = new ContentTransformer(); - transformer.init(new File(sb.getAppPath(), sb.getConfig(SwitchboardConstants.LIST_BLUE, "")).toString()); - - // load the yellow-list - final String f = sb.getConfig("proxyYellowList", null); - if (f != null) { - yellowList = FileUtils.loadList(new File(f)); - log.logConfig("loaded yellow-list from file " + f + ", " + yellowList.size() + " entries"); - } else { - yellowList = new HashSet(); - } - - final String redirectorPath = sb.getConfig("externalRedirector", ""); - if (redirectorPath.length() > 0 && !redirectorEnabled) { - try { - redirectorProcess=Runtime.getRuntime().exec(redirectorPath); - redirectorWriter = new PrintWriter(redirectorProcess.getOutputStream()); - redirectorReader = new BufferedReader(new InputStreamReader(redirectorProcess.getInputStream())); - redirectorEnabled=true; - } catch (final IOException e) { - System.out.println("redirector not Found"); - } - } - } else { - yellowList = null; - } - } - - /** - * Special logger instance for proxy access logging much similar - * to the squid access.log file - */ - private static final Log proxyLog = new Log("PROXY.access"); - - /** - * Reusable {@link StringBuilder} for logging - */ - private static final StringBuilder logMessage = new StringBuilder(); - - /** - * Reusable {@link StringBuilder} to generate the useragent string - */ - private static final StringBuilder userAgentStr = new StringBuilder(); - - private static void handleOutgoingCookies(final RequestHeader requestHeader, final String targethost, final String clienthost) { - /* - The syntax for the header is: - - cookie = "Cookie:" cookie-version - 1*((";" | ",") cookie-value) - cookie-value = NAME "=" VALUE [";" 
path] [";" domain] - cookie-version = "$Version" "=" value - NAME = attr - VALUE = value - path = "$Path" "=" value - domain = "$Domain" "=" value - */ - if (sb.getConfigBool("proxy.monitorCookies", false)) { - if (requestHeader.containsKey(RequestHeader.COOKIE)) { - final Object[] entry = new Object[]{new Date(), clienthost, requestHeader.getMultiple(RequestHeader.COOKIE)}; - synchronized(sb.outgoingCookies) { - sb.outgoingCookies.put(targethost, entry); - } - } - } - } - - private static void handleIncomingCookies(final ResponseHeader respondHeader, final String serverhost, final String targetclient) { - /* - The syntax for the Set-Cookie response header is - - set-cookie = "Set-Cookie:" cookies - cookies = 1#cookie - cookie = NAME "=" VALUE *(";" cookie-av) - NAME = attr - VALUE = value - cookie-av = "Comment" "=" value - | "Domain" "=" value - | "Max-Age" "=" value - | "Path" "=" value - | "Secure" - | "Version" "=" 1*DIGIT - */ - if (sb.getConfigBool("proxy.monitorCookies", false)) { - if (respondHeader.containsKey(HeaderFramework.SET_COOKIE)) { - final Object[] entry = new Object[]{new Date(), targetclient, respondHeader.getMultiple(HeaderFramework.SET_COOKIE)}; - synchronized(sb.incomingCookies) { - sb.incomingCookies.put(serverhost, entry); - } - } - } - } - - /** - * @param conProp a collection of properties about the connection, like URL - * @param requestHeader The header lines of the connection from the request - * @param respond the OutputStream to the client - * @see de.anomic.http.httpdHandler#doGet(java.util.Properties, net.yacy.cora.protocol.HeaderFramework, java.io.OutputStream) - */ - public static void doGet(final HashMap conProp, final RequestHeader requestHeader, final OutputStream respond) { - ByteCountOutputStream countedRespond = null; - try { - final int reqID = requestHeader.hashCode(); - // remembering the starting time of the request - final Date requestDate = new Date(); // remember the time... 
- conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_START, Long.valueOf(requestDate.getTime())); - if (yacyTrigger) net.yacy.peers.Network.triggerOnlineAction(); - sb.proxyLastAccess = System.currentTimeMillis(); - - // using an ByteCount OutputStream to count the send bytes (needed for the logfile) - countedRespond = new ByteCountOutputStream(respond,((String) conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE)).length() + 2,"PROXY"); - - String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); - String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/' - final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given - final String ip = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); // the ip from the connecting peer - int pos=0; - int port=0; - - DigestURI url = null; - try { - url = new DigestURI(HeaderFramework.getRequestURL(conProp)); - if (log.isFine()) log.logFine(reqID +" GET "+ url); - if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); - - //redirector - if (redirectorEnabled){ - synchronized(redirectorProcess){ - redirectorWriter.println(url.toNormalform(false, true)); - redirectorWriter.flush(); - } - final String newUrl = redirectorReader.readLine(); - if (!newUrl.equals("")) { - try { - url = new DigestURI(newUrl); - } catch(final MalformedURLException e){}//just keep the old one - } - if (log.isFinest()) log.logFinest(reqID +" using redirector to "+ url); - conProp.put(HeaderFramework.CONNECTION_PROP_HOST, url.getHost()+":"+url.getPort()); - conProp.put(HeaderFramework.CONNECTION_PROP_PATH, url.getPath()); - requestHeader.put(HeaderFramework.HOST, url.getHost()+":"+url.getPort()); - requestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, url.getPath()); - } - } catch (final MalformedURLException e) { - final String errorMsg = "ERROR: internal error with url generation: host=" + - host + ", 
port=" + port + ", path=" + path + ", args=" + args; - log.logSevere(errorMsg); - HTTPDemon.sendRespondError(conProp,countedRespond,4,501,null,errorMsg,e); - return; - } - - if ((pos = host.indexOf(':')) < 0) { - port = 80; - } else { - port = NumberTools.parseIntDecSubstring(host, pos + 1); - host = host.substring(0, pos); - } - - // check the blacklist - // blacklist idea inspired by [AS]: - // respond a 404 for all AGIS ("all you get is shit") servers - final String hostlow = host.toLowerCase(); - if (args != null) { path = path + "?" + args; } - if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_PROXY, hostlow, path)) { - log.logInfo("AGIS blocking of host '" + hostlow + "'"); - HTTPDemon.sendRespondError(conProp,countedRespond,4,403,null, - "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); - return; - } - - // handle outgoing cookies - handleOutgoingCookies(requestHeader, host, ip); - prepareRequestHeader(conProp, requestHeader, hostlow); - final ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash()); - - // why are files unzipped upon arrival? why not zip all files in cache? - // This follows from the following premises - // (a) no file shall be unzip-ed more than once to prevent unnecessary computing time - // (b) old cache entries shall be comparable with refill-entries to detect/distinguish case 3+4 - // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later - // case b and c contradicts, if we use a scheduler, because files in a stale cache would be unzipped - // and the newly arrival would be zipped and would have to be unzipped upon load. But then the - // scheduler is superfluous. Therefore the only reminding case is - // (d) cached files shall be either all zipped or unzipped - // case d contradicts with a, because files need to be unzipped for indexing. Therefore - // the only remaining case is to unzip files right upon load. Thats what we do here. 
- - // finally use existing cache if appropriate - // here we must decide weather or not to save the data - // to a cache - // we distinguish four CACHE STATE cases: - // 1. cache fill - // 2. cache fresh - no refill - // 3. cache stale - refill - necessary - // 4. cache stale - refill - superfluous - // in two of these cases we trigger a scheduler to handle newly arrived files: - // case 1 and case 3 - if (cachedResponseHeader == null) { - if (log.isFinest()) log.logFinest(reqID + " page not in cache: fulfill request from web"); - fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond); - } else { - final Request request = new Request( - null, - url, - requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), - "", - cachedResponseHeader.lastModified(), - sb.crawler.defaultProxyProfile.handle(), - 0, - 0, - 0, - 0); - final Response response = new Response( - request, - requestHeader, - cachedResponseHeader, - "200 OK", - sb.crawler.defaultProxyProfile, - false - ); - final byte[] cacheContent = Cache.getContent(url.hash()); - if (cacheContent != null && response.isFreshForProxy()) { - if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache"); - fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond); - } else { - if (log.isFinest()) log.logFinest(reqID + " fulfill request from web"); - fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond); - } - } - - - } catch (final Exception e) { - try { - final String exTxt = e.getMessage(); - if ((exTxt!=null)&&(exTxt.startsWith("Socket closed"))) { - forceConnectionClose(conProp); - } else if (!conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { - final String errorMsg = "Unexpected Error. 
" + e.getClass().getName() + ": " + e.getMessage(); - HTTPDemon.sendRespondError(conProp,countedRespond,4,501,null,errorMsg,e); - log.logSevere(errorMsg); - } else { - forceConnectionClose(conProp); - } - } catch (final Exception ee) { - forceConnectionClose(conProp); - } - } finally { - try { if(countedRespond != null) countedRespond.flush(); else if(respond != null) respond.flush(); } catch (final Exception e) {} - if (countedRespond != null) countedRespond.finish(); - - conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_END, Long.valueOf(System.currentTimeMillis())); - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE,(countedRespond != null) ? Long.toString(countedRespond.getCount()) : -1L); - logProxyAccess(conProp); - } - } - - private static void fulfillRequestFromWeb(final HashMap conProp, final DigestURI url, final RequestHeader requestHeader, final ResponseHeader cachedResponseHeader, final OutputStream respond) { - try { - final int reqID = requestHeader.hashCode(); - - String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); - final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/' - final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given - final String ip = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); // the ip from the connecting peer - final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); // the ip from the connecting peer - - int port, pos; - if ((pos = host.indexOf(':')) < 0) { - port = 80; - } else { - port = NumberTools.parseIntDecSubstring(host, pos + 1); - host = host.substring(0, pos); - } - - // resolve yacy and yacyh domains - String yAddress = resolveYacyDomains(host); - - // re-calc the url path - final String remotePath = (args == null) ? path : (path + "?" 
+ args); // with leading '/' - - // remove yacy-subdomain-path, when accessing /env - if ( (yAddress != null) - && (remotePath.startsWith("/env")) - && ((pos = yAddress.indexOf('/')) != -1) - ) yAddress = yAddress.substring(0, yAddress.indexOf('/')); - - modifyProxyHeaders(requestHeader, httpVer); - - final String connectHost = hostPart(host, port, yAddress); - final String getUrl = "http://"+ connectHost + remotePath; - - final HTTPClient client = setupHttpClient(requestHeader, connectHost); - - // send request - try { - client.GET(getUrl); - if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); - conProp.put(HeaderFramework.CONNECTION_PROP_CLIENT_REQUEST_HEADER, requestHeader); - - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - // determine if it's an internal error of the httpc - if (responseHeader.isEmpty()) { - throw new Exception(client.getHttpResponse().getStatusLine().toString()); - } - - final ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), respond); - - // the cache does either not exist or is (supposed to be) stale - long sizeBeforeDelete = -1; - if (cachedResponseHeader != null) { - // delete the cache - final ResponseHeader rh = Cache.getResponseHeader(url.hash()); - if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) { - final byte[] b = Cache.getContent(url.hash()); - if (b != null) sizeBeforeDelete = b.length; - } - Cache.delete(url.hash()); - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); - } - - // reserver cache entry - final Request request = new Request( - null, - url, - requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), - "", - responseHeader.lastModified(), - sb.crawler.defaultProxyProfile.handle(), - 0, - 0, - 0, - sizeBeforeDelete < 0 ? 
0 : sizeBeforeDelete); - - - // handle incoming cookies - handleIncomingCookies(responseHeader, host, ip); - -// prepareResponseHeader(responseHeader, res.getHttpVer()); - prepareResponseHeader(responseHeader, client.getHttpResponse().getProtocolVersion().toString()); - - // sending the respond header back to the client - if (chunkedOut != null) { - responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); - } - - if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); - HTTPDemon.sendRespondHeader( - conProp, - respond, - httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), - client.getHttpResponse().getStatusLine().toString(), // status text - responseHeader); - - if (hasBody(client.getHttpResponse().getStatusLine().getStatusCode())) { - - final OutputStream outStream = chunkedOut != null ? chunkedOut : respond; - final Response response = new Response( - request, - requestHeader, - responseHeader, - Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()), - sb.crawler.defaultProxyProfile, - false - ); - final String storeError = response.shallStoreCacheForProxy(); - final boolean storeHTCache = response.profile().storeHTCache(); - final String supportError = TextParser.supports(response.url(), response.getMimeType()); - if ( - /* - * Now we store the response into the htcache directory if - * a) the response is cacheable AND - */ - (storeError == null) && - /* - * b) the user has configured to use the htcache OR - * c) the content should be indexed - */ - ((storeHTCache) || (supportError != null)) - ) { - // we don't write actually into a file, only to RAM, and schedule writing the file. -// int l = res.getResponseHeader().size(); - final int l = responseHeader.size(); - final ByteArrayOutputStream byteStream = new ByteArrayOutputStream((l < 32) ? 
32 : l); - - final OutputStream toClientAndMemory = new MultiOutputStream(new OutputStream[] {outStream, byteStream}); -// FileUtils.copy(res.getDataAsStream(), toClientAndMemory); - client.writeTo(toClientAndMemory); - // cached bytes - byte[] cacheArray; - if (byteStream.size() > 0) { - cacheArray = byteStream.toByteArray(); - } else { - cacheArray = null; - } - if (log.isFine()) log.logFine(reqID +" writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length))); - - if (sizeBeforeDelete == -1) { - // totally fresh file - response.setContent(cacheArray); - try { - Cache.store(response.url(), response.getResponseHeader(), cacheArray); - sb.toIndexer(response); - } catch (final IOException e) { - log.logWarning("cannot write " + response.url() + " to Cache (1): " + e.getMessage(), e); - } - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_MISS"); - } else if (cacheArray != null && sizeBeforeDelete == cacheArray.length) { - // before we came here we deleted a cache entry - cacheArray = null; - //cacheManager.push(cacheEntry); // unnecessary update - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REF_FAIL_HIT"); - } else { - // before we came here we deleted a cache entry - response.setContent(cacheArray); - try { - Cache.store(response.url(), response.getResponseHeader(), cacheArray); - sb.toIndexer(response); - } catch (final IOException e) { - log.logWarning("cannot write " + response.url() + " to Cache (2): " + e.getMessage(), e); - } - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); - } - } else { - // no caching - if (log.isFine()) log.logFine(reqID +" "+ url.toString() + " not cached." 
+ - " StoreError=" + ((storeError==null)?"None":storeError) + - " StoreHTCache=" + storeHTCache + - " SupportError=" + supportError); - -// FileUtils.copy(res.getDataAsStream(), outStream); - client.writeTo(outStream); - - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS"); - } - - if (chunkedOut != null) { - chunkedOut.finish(); - chunkedOut.flush(); - } - } // end hasBody - } catch(final SocketException se) { - // if opened ... -// if(res != null) { -// // client cut proxy connection, abort download -// res.abort(); -// } - client.finish(); - handleProxyException(se,conProp,respond,url); - } finally { - // if opened ... -// if(res != null) { -// // ... close connection -// res.closeStream(); -// } - client.finish(); - } - } catch (final Exception e) { - handleProxyException(e,conProp,respond,url); - } - } - - /** - * determines if the response should have a body - * - * @param statusCode - * @param responseHeader - * @return - */ - private static boolean hasBody(final int statusCode) { - // "All 1xx (informational), 204 (no content), and 304 (not modified) responses MUST NOT - // include a message-body." - // [RFC 2616 HTTP/1.1, Sect. 4.3] and like [RFC 1945 HTTP/1.0, Sect. 
7.2] - if((statusCode >= 100 && statusCode < 200) || statusCode == 204 || statusCode == 304) { - return false; - } - return true; - } - - private static void fulfillRequestFromCache( - final HashMap conProp, - final DigestURI url, - final RequestHeader requestHeader, - final ResponseHeader cachedResponseHeader, - final byte[] cacheEntry, - final OutputStream respond - ) throws IOException { - - final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); - - // we respond on the request by using the cache, the cache is fresh - try { - prepareResponseHeader(cachedResponseHeader, httpVer); - - // replace date field in old header by actual date, this is according to RFC - cachedResponseHeader.put(HeaderFramework.DATE, HeaderFramework.formatRFC1123(new Date())); - - // check if we can send a 304 instead the complete content - if (requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) { - // conditional request: freshness of cache for that condition was already - // checked within shallUseCache(). 
Now send only a 304 response - log.logInfo("CACHE HIT/304 " + url.toString()); - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_HIT"); - - // setting the content length header to 0 - cachedResponseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(0)); - - // send cached header with replaced date and added length - HTTPDemon.sendRespondHeader(conProp,respond,httpVer,304,cachedResponseHeader); - //respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified' - } else { - // unconditional request: send content of cache - log.logInfo("CACHE HIT/203 " + url.toString()); - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_HIT"); - - // setting the content header to the proper length - cachedResponseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(cacheEntry.length)); - - // send cached header with replaced date and added length - HTTPDemon.sendRespondHeader(conProp,respond,httpVer,203,cachedResponseHeader); - //respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative' - - // send also the complete body now from the cache - // simply read the file and transfer to out socket - FileUtils.copy(cacheEntry, respond); - } - // that's it! 
- } catch (final Exception e) { - // this happens if the client stops loading the file - // we do nothing here - if (conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { - log.logWarning("Error while trying to send cached message body."); - conProp.put(HeaderFramework.CONNECTION_PROP_PERSISTENT,"close"); - } else { - HTTPDemon.sendRespondError(conProp,respond,4,503,"socket error: " + e.getMessage(),"socket error: " + e.getMessage(), e); - } - } finally { - try { respond.flush(); } catch (final Exception e) {} - } - return; - } - - public static void doHead(final HashMap conProp, final RequestHeader requestHeader, OutputStream respond) { - -// ResponseContainer res = null; - DigestURI url = null; - try { - final int reqID = requestHeader.hashCode(); - // remembering the starting time of the request - final Date requestDate = new Date(); // remember the time... - conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_START, Long.valueOf(requestDate.getTime())); - if (yacyTrigger) net.yacy.peers.Network.triggerOnlineAction(); - sb.proxyLastAccess = System.currentTimeMillis(); - - // using an ByteCount OutputStream to count the send bytes - respond = new ByteCountOutputStream(respond,((String) conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE)).length() + 2,"PROXY"); - - String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); - final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); - final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); - final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); - - int port, pos; - if ((pos = host.indexOf(':')) < 0) { - port = 80; - } else { - port = NumberTools.parseIntDecSubstring(host, pos + 1); - host = host.substring(0, pos); - } - - try { - url = new DigestURI("http", host, port, (args == null) ? path : path + "?" 
+ args); - } catch (final MalformedURLException e) { - final String errorMsg = "ERROR: internal error with url generation: host=" + - host + ", port=" + port + ", path=" + path + ", args=" + args; - log.logSevere(errorMsg); - HTTPDemon.sendRespondError(conProp,respond,4,501,null,errorMsg,e); - return; - } - if (log.isFine()) log.logFine(reqID +" HEAD "+ url); - if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); - - // check the blacklist, inspired by [AS]: respond a 404 for all AGIS (all you get is shit) servers - final String hostlow = host.toLowerCase(); - - // re-calc the url path - final String remotePath = (args == null) ? path : (path + "?" + args); - - if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_PROXY, hostlow, remotePath)) { - HTTPDemon.sendRespondError(conProp,respond,4,403,null, - "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); - log.logInfo("AGIS blocking of host '" + hostlow + "'"); - return; - } - - prepareRequestHeader(conProp, requestHeader, hostlow); - - // resolve yacy and yacyh domains - String yAddress = resolveYacyDomains(host); - - // remove yacy-subdomain-path, when accessing /env - if ( (yAddress != null) - && (remotePath.startsWith("/env")) - && ((pos = yAddress.indexOf('/')) != -1) - ) yAddress = yAddress.substring(0, yAddress.indexOf('/')); - - modifyProxyHeaders(requestHeader, httpVer); - - // generate request-url - final String connectHost = hostPart(host, port, yAddress); - final String getUrl = "http://"+ connectHost + remotePath; - if (log.isFinest()) log.logFinest(reqID +" using url: "+ getUrl); - - final HTTPClient client = setupHttpClient(requestHeader, connectHost); - - // send request -// try { -// res = client.HEAD(getUrl); -// if (log.isFinest()) log.logFinest(reqID +" response status: "+ res.getStatusLine()); - client.HEADResponse(getUrl); - if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); - - // determine if it's an 
internal error of the httpc -// final ResponseHeader responseHeader = res.getResponseHeader(); -// if (responseHeader.isEmpty()) { -// throw new Exception(res.getStatusLine()); -// } - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - if (responseHeader.isEmpty()) { - throw new Exception(client.getHttpResponse().getStatusLine().toString()); - } - -// prepareResponseHeader(responseHeader, res.getHttpVer()); - prepareResponseHeader(responseHeader, client.getHttpResponse().getStatusLine().getProtocolVersion().toString()); - - // sending the server respond back to the client - if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); -// HTTPDemon.sendRespondHeader(conProp,respond,httpVer,res.getStatusCode(),res.getStatusLine().substring(4),responseHeader); - HTTPDemon.sendRespondHeader( - conProp, - respond, - httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), - client.getHttpResponse().getStatusLine().toString(), - responseHeader); - respond.flush(); -// } finally { -// if(res != null) { -// // ... close connection -// res.closeStream(); -// } -// } - } catch (final Exception e) { - handleProxyException(e,conProp,respond,url); - } - } - - public static void doPost(final HashMap conProp, final RequestHeader requestHeader, final OutputStream respond, final InputStream body) throws IOException { - assert conProp != null : "precondition violated: conProp != null"; - assert requestHeader != null : "precondition violated: requestHeader != null"; - assert body != null : "precondition violated: body != null"; - DigestURI url = null; - ByteCountOutputStream countedRespond = null; - try { - final int reqID = requestHeader.hashCode(); - // remembering the starting time of the request - final Date requestDate = new Date(); // remember the time... 
- conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_START, Long.valueOf(requestDate.getTime())); - if (yacyTrigger) net.yacy.peers.Network.triggerOnlineAction(); - sb.proxyLastAccess = System.currentTimeMillis(); - - // using an ByteCount OutputStream to count the send bytes - countedRespond = new ByteCountOutputStream(respond,((String) conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE)).length() + 2,"PROXY"); - - String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); - final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); - final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given - final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); - - int port, pos; - if ((pos = host.indexOf(':')) < 0) { - port = 80; - } else { - port = NumberTools.parseIntDecSubstring(host, pos + 1); - host = host.substring(0, pos); - } - - try { - url = new DigestURI("http", host, port, (args == null) ? path : path + "?" + args); - } catch (final MalformedURLException e) { - final String errorMsg = "ERROR: internal error with url generation: host=" + - host + ", port=" + port + ", path=" + path + ", args=" + args; - log.logSevere(errorMsg); - HTTPDemon.sendRespondError(conProp,countedRespond,4,501,null,errorMsg,e); - return; - } - if (log.isFine()) log.logFine(reqID +" POST "+ url); - if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); - - prepareRequestHeader(conProp, requestHeader, host.toLowerCase()); - - String yAddress = resolveYacyDomains(host); - - // re-calc the url path - final String remotePath = (args == null) ? path : (path + "?" 
+ args); - - // remove yacy-subdomain-path, when accessing /env - if ( (yAddress != null) - && (remotePath.startsWith("/env")) - && ((pos = yAddress.indexOf('/')) != -1) - ) yAddress = yAddress.substring(0, yAddress.indexOf('/')); - - modifyProxyHeaders(requestHeader, httpVer); - - final String connectHost = hostPart(host, port, yAddress); - final String getUrl = "http://"+ connectHost + remotePath; - if (log.isFinest()) log.logFinest(reqID +" using url: "+ getUrl); - - // the CONTENT_LENGTH will be added by entity and cause a ClientProtocolException if set - final int contentLength = requestHeader.getContentLength(); - requestHeader.remove(HeaderFramework.CONTENT_LENGTH); - - final HTTPClient client = setupHttpClient(requestHeader, connectHost); - - // check input - if(body == null) { - log.logSevere("no body to POST!"); - } - try { - // sending the request - client.POST(getUrl, body, contentLength); - if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); - - final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); - // determine if it's an internal error of the httpc - if (responseHeader.isEmpty()) { - throw new Exception(client.getHttpResponse().getStatusLine().toString()); - } - - final ChunkedOutputStream chunked = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), countedRespond); - - prepareResponseHeader(responseHeader, client.getHttpResponse().getProtocolVersion().toString()); - - // sending the respond header back to the client - if (chunked != null) { - responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); - } - - // sending response headers - if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); - HTTPDemon.sendRespondHeader(conProp, - countedRespond, - httpVer, - client.getHttpResponse().getStatusLine().getStatusCode(), - 
client.getHttpResponse().getStatusLine().toString(), // status text - responseHeader); - - final OutputStream outStream = (chunked != null) ? chunked : countedRespond; - client.writeTo(outStream); - - if (chunked != null) { - chunked.finish(); - } - outStream.flush(); - } catch(final SocketException se) { - // connection closed by client, abort download - client.finish(); - } finally { - client.finish(); - } - } catch (final Exception e) { - handleProxyException(e,conProp,countedRespond,url); - } finally { - if(countedRespond != null) { - countedRespond.flush(); - countedRespond.finish(); - } - if(respond != null) { - respond.flush(); - } - - conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_END, Long.valueOf(System.currentTimeMillis())); - conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE,(countedRespond != null) ? Long.toString(countedRespond.getCount()) : "-1"); - logProxyAccess(conProp); - } - } - - /** - * resolve yacy and yacyh domains - * - * @param host - * @return - */ - private static String resolveYacyDomains(final String host) { - return (HTTPDemon.getAlternativeResolver() == null) ? null : HTTPDemon.getAlternativeResolver().resolve(host); - } - - /** - * @param host - * @param port - * @param yAddress - * @return - */ - private static String hostPart(final String host, final int port, final String yAddress) { - final String connectHost = (yAddress == null) ? host +":"+ port : yAddress; - return connectHost; - } - - /** - * @param conProp - * @param requestHeader - * @param hostlow - */ - private static void prepareRequestHeader(final HashMap conProp, final RequestHeader requestHeader, final String hostlow) { - // set another userAgent, if not yellow-listed - if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) { - // change the User-Agent - requestHeader.put(HeaderFramework.USER_AGENT, generateUserAgent(requestHeader)); - } - - // only gzip-encoding is supported, remove other encodings (e. g. 
deflate) - if ((requestHeader.get(HeaderFramework.ACCEPT_ENCODING,"")).indexOf("gzip",0) != -1) { - requestHeader.put(HeaderFramework.ACCEPT_ENCODING, "gzip"); - } else { - requestHeader.put(HeaderFramework.ACCEPT_ENCODING, ""); - } - - addXForwardedForHeader(conProp, requestHeader); - } - - private static String domain(final String host) { - String domain = host; - int pos = domain.lastIndexOf('.'); - if (pos >= 0) { - // truncate from last part - domain = domain.substring(0, pos); - pos = domain.lastIndexOf('.'); - if (pos >= 0) { - // truncate from first part - domain = domain.substring(pos + 1); - } - } - return domain; - } - - /** - * creates a new HttpClient and sets parameters according to proxy needs - * - * @param requestHeader - * @param connectHost may be 'host:port' or 'host:port/path' - * @return - */ - private static HTTPClient setupHttpClient(final RequestHeader requestHeader, final String connectHost) { - // setup HTTP-client - final HTTPClient client = new HTTPClient(); - client.setTimout(timeout); - client.setHeader(requestHeader.entrySet()); - client.setRedirecting(false); - return client; - } - - /** - * determines in which form the response should be send and sets header accordingly - * if the content length is not set we need to use chunked content encoding - * Implemented: - * if !content-length - * switch httpVer - * case 0.9: - * case 1.0: - * close connection after transfer - * break; - * default: - * new ChunkedStream around respond - * end if - * - * @param conProp - * @param responseHeader - * @param statusCode - * @param respond - * @return - */ - private static ChunkedOutputStream setTransferEncoding( - final HashMap conProp, final ResponseHeader responseHeader, - final int statusCode, final OutputStream respond) { - final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); - ChunkedOutputStream chunkedOut = null; - // gzipped response is ungzipped an therefor the length is unknown - if 
(responseHeader.gzip() || responseHeader.getContentLength() < 0) { - // according to http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html - // a 204,304 message must not contain a message body. - // Therefore we need to set the content-length to 0. - if (statusCode == 204 || statusCode == 304) { - responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0"); - } else { - if (httpVer.equals(HeaderFramework.HTTP_VERSION_0_9) || httpVer.equals(HeaderFramework.HTTP_VERSION_1_0)) { - forceConnectionClose(conProp); - } else { - chunkedOut = new ChunkedOutputStream(respond); - } - responseHeader.remove(HeaderFramework.CONTENT_LENGTH); - } - } - return chunkedOut; - } - - /** - * @param res - * @param responseHeader - */ - private static void prepareResponseHeader(final ResponseHeader responseHeader, final String httpVer) { - modifyProxyHeaders(responseHeader, httpVer); - correctContentEncoding(responseHeader); - } - - /** - * @param responseHeader - */ - private static void correctContentEncoding(final ResponseHeader responseHeader) { - // TODO gzip again? set "correct" encoding? 
- if(responseHeader.gzip()) { - responseHeader.remove(HeaderFramework.CONTENT_ENCODING); - responseHeader.remove(HeaderFramework.CONTENT_LENGTH); // remove gziped length - } - } - - /** - * adds the client-IP of conProp to the requestHeader - * - * @param conProp - * @param requestHeader - */ - private static void addXForwardedForHeader(final HashMap conProp, final RequestHeader requestHeader) { - // setting the X-Forwarded-For Header - if (sb.getConfigBool("proxy.sendXForwardedForHeader", true)) { - requestHeader.put(HeaderFramework.X_FORWARDED_FOR, (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP)); - } - } - - /** - * removing hop by hop headers and adding additional headers - * - * @param requestHeader - * @param httpVer - */ - private static void modifyProxyHeaders(final HeaderFramework requestHeader, final String httpVer) { - removeHopByHopHeaders(requestHeader); - setViaHeader(requestHeader, httpVer); - } - - private static void removeHopByHopHeaders(final HeaderFramework headers) { - /* - - Trailers - */ - - headers.remove(RequestHeader.CONNECTION); - headers.remove(RequestHeader.KEEP_ALIVE); - headers.remove(RequestHeader.UPGRADE); - headers.remove(RequestHeader.TE); - headers.remove(RequestHeader.PROXY_CONNECTION); - headers.remove(RequestHeader.PROXY_AUTHENTICATE); - headers.remove(RequestHeader.PROXY_AUTHORIZATION); - - // special headers inserted by squid - headers.remove(RequestHeader.X_CACHE); - headers.remove(RequestHeader.X_CACHE_LOOKUP); - - // remove transfer encoding header - headers.remove(HeaderFramework.TRANSFER_ENCODING); - - //removing yacy status headers - headers.remove(HeaderFramework.X_YACY_KEEP_ALIVE_REQUEST_COUNT); - headers.remove(HeaderFramework.X_YACY_ORIGINAL_REQUEST_LINE); - } - - private static void setViaHeader(final HeaderFramework header, final String httpVer) { - if (!sb.getConfigBool("proxy.sendViaHeader", true)) return; - final String myAddress = (HTTPDemon.getAlternativeResolver() == null) ? 
null : HTTPDemon.getAlternativeResolver().myAlternativeAddress(); - if (myAddress != null) { - - // getting header set by other proxies in the chain - final StringBuilder viaValue = new StringBuilder(80); - if (header.containsKey(HeaderFramework.VIA)) viaValue.append(header.get(HeaderFramework.VIA)); - if (viaValue.length() > 0) viaValue.append(", "); - - // appending info about this peer - viaValue - .append(httpVer).append(" ") - .append(myAddress).append(" ") - .append("(YaCy ").append(sb.getConfig("vString", "0.0")).append(")"); - - // storing header back - header.put(HeaderFramework.VIA, viaValue.toString()); - } - } - - public static void doConnect(final HashMap conProp, final RequestHeader requestHeader, final InputStream clientIn, final OutputStream clientOut) throws IOException { - - sb.proxyLastAccess = System.currentTimeMillis(); - - String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); - final String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); - String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); - final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); - if (args != null) { path = path + "?" 
+ args; } - - int port, pos; - if ((pos = host.indexOf(':')) < 0) { - port = 80; - } else { - port = NumberTools.parseIntDecSubstring(host, pos + 1); - host = host.substring(0, pos); - } - - // check the blacklist - // blacklist idea inspired by [AS]: - // respond a 404 for all AGIS ("all you get is shit") servers - final String hostlow = host.toLowerCase(); - if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_PROXY, hostlow, path)) { - HTTPDemon.sendRespondError(conProp,clientOut,4,403,null, - "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); - log.logInfo("AGIS blocking of host '" + hostlow + "'"); - forceConnectionClose(conProp); - return; - } - - // possibly branch into PROXY-PROXY connection - if (ProxySettings.use && ProxySettings.use4ssl) { - final HTTPClient remoteProxy = setupHttpClient(requestHeader, host); - - try { - remoteProxy.HEADResponse("http://" + host + ":" + port); - final ResponseHeader header = new ResponseHeader(remoteProxy.getHttpResponse().getAllHeaders()); - - // outputs a logline to the serverlog with the current status - log.logInfo("CONNECT-RESPONSE: status=" + remoteProxy.getHttpResponse().getStatusLine() + ", header=" + header.toString()); - final boolean success = remoteProxy.getHttpResponse().getStatusLine().getStatusCode() >= 200 && remoteProxy.getHttpResponse().getStatusLine().getStatusCode() <= 399; - if (success) { - // replace connection details - host = ProxySettings.host; - port = ProxySettings.port; - // go on (see below) - } else { - // pass error response back to client - HTTPDemon.sendRespondHeader( - conProp, - clientOut, - httpVersion, - remoteProxy.getHttpResponse().getStatusLine().getStatusCode(), - remoteProxy.getHttpResponse().getStatusLine().toString(), - header); - //respondHeader(clientOut, response.status, response.responseHeader); - forceConnectionClose(conProp); - return; - } - } catch (final Exception e) { - throw new IOException(e.getMessage()); - } - } - - // try to establish 
connection to remote host - final Socket sslSocket = new Socket(host, port); - sslSocket.setSoTimeout(timeout); // waiting time for write - sslSocket.setSoLinger(true, timeout); // waiting time for read - final InputStream promiscuousIn = sslSocket.getInputStream(); - final OutputStream promiscuousOut = sslSocket.getOutputStream(); - - // now then we can return a success message - clientOut.write(UTF8.getBytes(httpVersion + " 200 Connection established" + serverCore.CRLF_STRING + - "Proxy-agent: YACY" + serverCore.CRLF_STRING + - serverCore.CRLF_STRING)); - - log.logInfo("SSL connection to " + host + ":" + port + " established."); - - // start stream passing with mediate processes - final Mediate cs = new Mediate(sslSocket, clientIn, promiscuousOut); - final Mediate sc = new Mediate(sslSocket, promiscuousIn, clientOut); - cs.start(); - sc.start(); - while ((sslSocket != null) && - (sslSocket.isBound()) && - (!(sslSocket.isClosed())) && - (sslSocket.isConnected()) && - ((cs.isAlive()) || (sc.isAlive()))) { - // idle - try {Thread.sleep(1000);} catch (final InterruptedException e) {} // wait a while - } - // set stop mode - cs.pleaseTerminate(); - sc.pleaseTerminate(); - // wake up thread - cs.interrupt(); - sc.interrupt(); - // ...hope they have terminated... 
- } - - public static class Mediate extends Thread { - - boolean terminate; - Socket socket; - InputStream in; - OutputStream out; - - public Mediate(final Socket socket, final InputStream in, final OutputStream out) { - this.terminate = false; - this.in = in; - this.out = out; - this.socket = socket; - } - - @Override - public void run() { - final byte[] buffer = new byte[512]; - int len; - try { - while ((this.socket != null) && - (this.socket.isBound()) && - (!(this.socket.isClosed())) && - (this.socket.isConnected()) && - (!(this.terminate)) && - (this.in != null) && - (this.out != null) && - ((len = this.in.read(buffer)) >= 0) - ) { - this.out.write(buffer, 0, len); - } - } catch (final IOException e) { - // do nothing - } catch (final Exception e) { - Log.logException(e); - } - } - - public void pleaseTerminate() { - this.terminate = true; - } - } - - private static void handleProxyException(final Exception e, final HashMap conProp, final OutputStream respond, final DigestURI url) { - // this may happen if - // - the targeted host does not exist - // - anything with the remote server was wrong. - // - the client unexpectedly closed the connection ... - try { - - - // doing some errorhandling ... 
- int httpStatusCode = 404; - String httpStatusText = null; - String errorMessage = null; - Exception errorExc = null; - boolean unknownError = false; - - // for customized error messages - boolean detailedErrorMsg = false; - String detailedErrorMsgFile = null; - serverObjects detailedErrorMsgMap = null; - - if (e instanceof ConnectException) { - httpStatusCode = 403; httpStatusText = "Connection refused"; - errorMessage = "Connection refused by destination host"; - } else if (e instanceof BindException) { - errorMessage = "Unable to establish a connection to the destination host"; - } else if (e instanceof NoRouteToHostException) { - errorMessage = "No route to destination host"; - } else if (e instanceof UnknownHostException) { - //errorMessage = "IP address of the destination host could not be determined"; - try { - detailedErrorMsgMap = unknownHostHandling(conProp); - httpStatusText = "Unknown Host"; - detailedErrorMsg = true; - detailedErrorMsgFile = "proxymsg/unknownHost.inc"; - } catch (final Exception e1) { - errorMessage = "IP address of the destination host could not be determined"; - } - } else if (e instanceof SocketTimeoutException) { - errorMessage = "Unable to establish a connection to the destination host. 
Connect timed out."; - } else { - final String exceptionMsg = e.getMessage(); - if ((exceptionMsg != null) && (exceptionMsg.indexOf("Corrupt GZIP trailer",0) >= 0)) { - // just do nothing, we leave it this way - if (log.isFine()) log.logFine("ignoring bad gzip trail for URL " + url + " (" + e.getMessage() + ")"); - forceConnectionClose(conProp); - } else if ((exceptionMsg != null) && (exceptionMsg.indexOf("Connection reset",0)>= 0)) { - errorMessage = "Connection reset"; - } else if ((exceptionMsg != null) && (exceptionMsg.indexOf("unknown host",0)>=0)) { - try { - detailedErrorMsgMap = unknownHostHandling(conProp); - httpStatusText = "Unknown Host"; - detailedErrorMsg = true; - detailedErrorMsgFile = "proxymsg/unknownHost.inc"; - } catch (final Exception e1) { - errorMessage = "IP address of the destination host could not be determined"; - } - } else if ((exceptionMsg != null) && - ( - (exceptionMsg.indexOf("socket write error",0)>=0) || - (exceptionMsg.indexOf("Read timed out",0) >= 0) || - (exceptionMsg.indexOf("Broken pipe",0) >= 0) || - (exceptionMsg.indexOf("server has closed connection",0) >= 0) - )) { - errorMessage = exceptionMsg; - Log.logException(e); - } else { - errorMessage = "Unexpected Error. 
" + e.getClass().getName() + ": " + e.getMessage(); - unknownError = true; - errorExc = e; - } - } - - // sending back an error message to the client - if (!conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { - if (detailedErrorMsg) { - HTTPDemon.sendRespondError(conProp,respond, httpStatusCode, httpStatusText, new File(detailedErrorMsgFile), detailedErrorMsgMap, errorExc); - } else { - HTTPDemon.sendRespondError(conProp,respond,4,httpStatusCode,httpStatusText,errorMessage,errorExc); - } - } else { - if (unknownError) { - log.logSevere("Unknown Error while processing request '" + - conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE) + "':" + - "\n" + Thread.currentThread().getName() + - "\n" + errorMessage,e); - } else { - log.logWarning("Error while processing request '" + - conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE) + "':" + - "\n" + Thread.currentThread().getName() + - "\n" + errorMessage); - } - forceConnectionClose(conProp); - } - } catch (final Exception ee) { - forceConnectionClose(conProp); - } - - } - - private static void forceConnectionClose(final HashMap conProp) { - if (conProp != null) { - conProp.put(HeaderFramework.CONNECTION_PROP_PERSISTENT,"close"); - } - } - - private static serverObjects unknownHostHandling(final HashMap conProp) throws Exception { - final serverObjects detailedErrorMsgMap = new serverObjects(); - - // generic toplevel domains - final HashSet topLevelDomains = new HashSet(Arrays.asList(new String[]{ - "aero", // Fluggesellschaften/Luftfahrt - "arpa", // Einrichtung des ARPANet - "biz", // Business - "com", // Commercial - "coop", // genossenschaftliche Unternehmen - "edu", // Education - "gov", // Government - "info", // Informationsangebote - "int", // International - "jobs", // Jobangebote von Unternemen - "mil", // Military (US-Militaer) - // "museum", // Museen - "name", // Privatpersonen - "nato", // NATO (veraltet) - "net", // Net (Netzwerkbetreiber) - "org", // Organization 
(Nichtkommerzielle Organisation) - "pro", // Professionals - "travel", // Touristikindustrie - - // some country tlds - "de", - "at", - "ch", - "it", - "uk" - })); - - // getting some connection properties - String orgHostPort = "80"; - String orgHostName = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); - if (orgHostName == null) orgHostName = "unknown"; - orgHostName = orgHostName.toLowerCase(); - int pos = orgHostName.indexOf(':'); - if (pos != -1) { - orgHostPort = orgHostName.substring(pos+1); - orgHostName = orgHostName.substring(0,pos); - } - String orgHostPath = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); if (orgHostPath == null) orgHostPath = ""; - String orgHostArgs = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); if (orgHostArgs == null) orgHostArgs = ""; - if (orgHostArgs.length() > 0) orgHostArgs = "?" + orgHostArgs; - detailedErrorMsgMap.put("hostName", orgHostName); - - // guessing hostnames - final HashSet testHostNames = new HashSet(); - String testHostName = null; - if (!orgHostName.startsWith("www.")) { - testHostName = "www." + orgHostName; - final InetAddress addr = Domains.dnsResolve(testHostName); - if (addr != null) testHostNames.add(testHostName); - } else if (orgHostName.startsWith("www.")) { - testHostName = orgHostName.substring(4); - final InetAddress addr = Domains.dnsResolve(testHostName); - if (addr != null) if (addr != null) testHostNames.add(testHostName); - } - if (orgHostName.length()>4 && orgHostName.startsWith("www") && (orgHostName.charAt(3) != '.')) { - testHostName = orgHostName.substring(0,3) + "." 
+ orgHostName.substring(3); - final InetAddress addr = Domains.dnsResolve(testHostName); - if (addr != null) if (addr != null) testHostNames.add(testHostName); - } - - pos = orgHostName.lastIndexOf('.'); - if (pos != -1) { - final Iterator iter = topLevelDomains.iterator(); - while (iter.hasNext()) { - final String topLevelDomain = iter.next(); - testHostName = orgHostName.substring(0,pos) + "." + topLevelDomain; - final InetAddress addr = Domains.dnsResolve(testHostName); - if (addr != null) if (addr != null) testHostNames.add(testHostName); - } - } - - int hostNameCount = 0; - final Iterator iter = testHostNames.iterator(); - while (iter.hasNext()) { - testHostName = iter.next(); - detailedErrorMsgMap.put("list_" + hostNameCount + "_hostName",testHostName); - detailedErrorMsgMap.put("list_" + hostNameCount + "_hostPort",orgHostPort); - detailedErrorMsgMap.put("list_" + hostNameCount + "_hostPath",orgHostPath); - detailedErrorMsgMap.put("list_" + hostNameCount + "_hostArgs",orgHostArgs); - hostNameCount++; - } - - detailedErrorMsgMap.put("list", hostNameCount); - - if (hostNameCount != 0) { - detailedErrorMsgMap.put("showList", 1); - } else { - detailedErrorMsgMap.put("showList", 0); - } - - return detailedErrorMsgMap; - } - - private static synchronized String generateUserAgent(final HeaderFramework requestHeaders) { - userAgentStr.setLength(0); - - final String browserUserAgent = requestHeaders.get(HeaderFramework.USER_AGENT, yacyProxyUserAgent); - final int pos = browserUserAgent.lastIndexOf(')'); - if (pos >= 0) { - userAgentStr - .append(browserUserAgent.substring(0,pos)) - .append("; YaCy ") - .append(sb.getConfig("vString","0.1")) - .append("; yacy.net") - .append(browserUserAgent.substring(pos)); - } else { - userAgentStr.append(browserUserAgent); - } - - return userAgentStr.toString(); - } - - /** - * This function is used to generate a logging message according to the - * squid logging format.

- * e.g.
- * 1117528623.857 178 192.168.1.201 TCP_MISS/200 1069 GET http://www.yacy.de/ - DIRECT/81.169.145.74 text/html - */ - private final static synchronized void logProxyAccess(final HashMap conProp) { - - if (!doAccessLogging) return; - - logMessage.setLength(0); - - // Timestamp - final String currentTimestamp = Long.toString(System.currentTimeMillis()); - final int offset = currentTimestamp.length()-3; - - logMessage.append(currentTimestamp.substring(0,offset)); - logMessage.append('.'); - logMessage.append(currentTimestamp.substring(offset)); - logMessage.append(' '); - - // Elapsed time - final Long requestStart = (Long) conProp.get(HeaderFramework.CONNECTION_PROP_REQUEST_START); - final Long requestEnd = (Long) conProp.get(HeaderFramework.CONNECTION_PROP_REQUEST_END); - final String elapsed = Long.toString(requestEnd.longValue()-requestStart.longValue()); - - for (int i=0; i<6-elapsed.length(); i++) logMessage.append(' '); - logMessage.append(elapsed); - logMessage.append(' '); - - // Remote Host - final String clientIP = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); - logMessage.append(clientIP); - logMessage.append(' '); - - // Code/Status - final String respondStatus = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS); - String respondCode = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE); - if (respondCode == null) respondCode = "UNKNOWN"; - logMessage.append(respondCode); - logMessage.append("/"); - logMessage.append(respondStatus); - logMessage.append(' '); - - // Bytes - final String bytes = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE); - logMessage.append(bytes.toString()); - logMessage.append(' '); - - // Method - final String requestMethod = (String) conProp.get(HeaderFramework.CONNECTION_PROP_METHOD); - logMessage.append(requestMethod); - logMessage.append(' '); - - // URL - final String requestURL = (String) conProp.get(HeaderFramework.CONNECTION_PROP_URL); 
- final String requestArgs = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); - logMessage.append(requestURL); - if (requestArgs != null) { - logMessage.append("?") - .append(requestArgs); - } - logMessage.append(' '); - - // Rfc931 - logMessage.append("-"); - logMessage.append(' '); - - // Peerstatus/Peerhost - final String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); - logMessage.append("DIRECT/"); - logMessage.append(host); - logMessage.append(' '); - - // Type - String mime = "-"; - if (conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { - final HeaderFramework proxyRespondHeader = (HeaderFramework) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER); - mime = proxyRespondHeader.mime(); - if (mime.indexOf(';') != -1) { - mime = mime.substring(0,mime.indexOf(';')); - } - } - logMessage.append(mime); - - // sending the logging message to the logger - if (proxyLog.isFine()) proxyLog.logFine(logMessage.toString()); - } - -} - -/* - proxy test: - - http://www.chipchapin.com/WebTools/cookietest.php? - http://xlists.aza.org/moderator/cookietest/cookietest1.php - http://vancouver-webpages.com/proxy/cache-test.html - - */ +// HTTPDProxyHandler.java +// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 2004 on http://yacy.net +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +// Contributions: +// [AS] Alexander Schier: Blacklist (404 response for AGIS hosts) +// [TL] Timo Leise: url-wildcards for blacklists + +/* + Class documentation: + This class is a servlet to the httpd daemon. It is accessed each time + an URL in a GET, HEAD or POST command contains the whole host information + or a host is given in the header host field of an HTTP/1.0 / HTTP/1.1 + command. + Transparency is maintained, whenever appropriate. We change header + attributes if necessary for the indexing mechanism; i.e. we do not + support gzip-ed encoding. We also do not support unrealistic + 'expires' values that would force a cache to be flushed immediately + pragma non-cache attributes are supported +*/ + + +package de.anomic.http.server; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.net.BindException; +import java.net.ConnectException; +import java.net.InetAddress; +import java.net.MalformedURLException; +import java.net.NoRouteToHostException; +import java.net.Socket; +import java.net.SocketException; +import java.net.SocketTimeoutException; +import java.net.UnknownHostException; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.logging.FileHandler; +import java.util.logging.Level; +import java.util.logging.LogManager; +import java.util.logging.Logger; + +import net.yacy.cora.document.UTF8; +import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.HeaderFramework; +import 
net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.cora.protocol.http.ProxySettings; +import net.yacy.document.TextParser; +import net.yacy.document.parser.html.ContentTransformer; +import net.yacy.document.parser.html.Transformer; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.io.ByteCountOutputStream; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.FileUtils; +import net.yacy.repository.Blacklist; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; +import de.anomic.crawler.Cache; +import de.anomic.crawler.retrieval.Request; +import de.anomic.crawler.retrieval.Response; +import de.anomic.server.serverCore; +import de.anomic.server.serverObjects; + +public final class HTTPDProxyHandler { + + + private static final String yacyProxyUserAgent = "yacyproxy (" + ClientIdentification.yacySystem +") http://yacy.net/bot.html"; + + // static variables + // can only be instantiated upon first instantiation of this class object + private static Switchboard sb = null; + private static final HashSet yellowList; + private static int timeout = 60000; + private static boolean yacyTrigger = true; + public static boolean isTransparentProxy = false; + private static Process redirectorProcess = null; + private static boolean redirectorEnabled = false; + private static PrintWriter redirectorWriter = null; + private static BufferedReader redirectorReader = null; + + private static Transformer transformer = null; + private static File htRootPath = null; + + //private Properties connectionProperties = null; + // creating a logger + private static final Log log = new Log("PROXY"); + + private static boolean doAccessLogging = false; + /** + * Do logging configuration for special proxy access log file + */ + static { + + // get a switchboard + sb = Switchboard.getSwitchboard(); + if (sb != null) { + + 
isTransparentProxy = Boolean.parseBoolean(sb.getConfig("isTransparentProxy","false")); + + // set timeout + timeout = Integer.parseInt(sb.getConfig("proxy.clientTimeout", "10000")); + + // create a htRootPath: system pages + htRootPath = new File(sb.getAppPath(), sb.getConfig("htRootPath","htroot")); + if (!(htRootPath.exists())) { + if(!htRootPath.mkdir()) + Log.logSevere("PROXY", "could not create htRoot "+ htRootPath); + } + + // do logger initialization + try { + log.logInfo("Configuring proxy access logging ..."); + + // getting the logging manager + final LogManager manager = LogManager.getLogManager(); + final String className = HTTPDProxyHandler.class.getName(); + + // determining if proxy access logging is enabled + final String enabled = manager.getProperty(className + ".logging.enabled"); + if ("true".equalsIgnoreCase(enabled)) { + + // reading out some needed configuration properties + int limit = 1024*1024, count = 20; + String pattern = manager.getProperty(className + ".logging.FileHandler.pattern"); + if (pattern == null) pattern = "DATA/LOG/proxyAccess%u%g.log"; + // make pattern absolute + if (!new File(pattern).isAbsolute()) pattern = new File(sb.getDataPath(), pattern).getAbsolutePath(); + + final String limitStr = manager.getProperty(className + ".logging.FileHandler.limit"); + if (limitStr != null) try { limit = Integer.parseInt(limitStr); } catch (final NumberFormatException e) {} + + final String countStr = manager.getProperty(className + ".logging.FileHandler.count"); + if (countStr != null) try { count = Integer.parseInt(countStr); } catch (final NumberFormatException e) {} + + // creating the proxy access logger + final Logger proxyLogger = Logger.getLogger("PROXY.access"); + proxyLogger.setUseParentHandlers(false); + proxyLogger.setLevel(Level.FINEST); + + final FileHandler txtLog = new FileHandler(pattern, limit, count, true); + txtLog.setFormatter(new ProxyLogFormatter()); + txtLog.setLevel(Level.FINEST); + 
proxyLogger.addHandler(txtLog); + + doAccessLogging = true; + log.logInfo("Proxy access logging configuration done." + + "\n\tFilename: " + pattern + + "\n\tLimit: " + limitStr + + "\n\tCount: " + countStr); + } else { + log.logInfo("Proxy access logging is deactivated."); + } + } catch (final Exception e) { + log.logSevere("Unable to configure proxy access logging.",e); + } + + // load a transformer + transformer = new ContentTransformer(); + transformer.init(new File(sb.getAppPath(), sb.getConfig(SwitchboardConstants.LIST_BLUE, "")).toString()); + + // load the yellow-list + final String f = sb.getConfig("proxyYellowList", null); + if (f != null) { + yellowList = FileUtils.loadList(new File(f)); + log.logConfig("loaded yellow-list from file " + f + ", " + yellowList.size() + " entries"); + } else { + yellowList = new HashSet(); + } + + final String redirectorPath = sb.getConfig("externalRedirector", ""); + if (redirectorPath.length() > 0 && !redirectorEnabled) { + try { + redirectorProcess=Runtime.getRuntime().exec(redirectorPath); + redirectorWriter = new PrintWriter(redirectorProcess.getOutputStream()); + redirectorReader = new BufferedReader(new InputStreamReader(redirectorProcess.getInputStream())); + redirectorEnabled=true; + } catch (final IOException e) { + System.out.println("redirector not Found"); + } + } + } else { + yellowList = null; + } + } + + /** + * Special logger instance for proxy access logging much similar + * to the squid access.log file + */ + private static final Log proxyLog = new Log("PROXY.access"); + + /** + * Reusable {@link StringBuilder} for logging + */ + private static final StringBuilder logMessage = new StringBuilder(); + + /** + * Reusable {@link StringBuilder} to generate the useragent string + */ + private static final StringBuilder userAgentStr = new StringBuilder(); + + private static void handleOutgoingCookies(final RequestHeader requestHeader, final String targethost, final String clienthost) { + /* + The syntax for the 
header is: + + cookie = "Cookie:" cookie-version + 1*((";" | ",") cookie-value) + cookie-value = NAME "=" VALUE [";" path] [";" domain] + cookie-version = "$Version" "=" value + NAME = attr + VALUE = value + path = "$Path" "=" value + domain = "$Domain" "=" value + */ + if (sb.getConfigBool("proxy.monitorCookies", false)) { + if (requestHeader.containsKey(RequestHeader.COOKIE)) { + final Object[] entry = new Object[]{new Date(), clienthost, requestHeader.getMultiple(RequestHeader.COOKIE)}; + synchronized(sb.outgoingCookies) { + sb.outgoingCookies.put(targethost, entry); + } + } + } + } + + private static void handleIncomingCookies(final ResponseHeader respondHeader, final String serverhost, final String targetclient) { + /* + The syntax for the Set-Cookie response header is + + set-cookie = "Set-Cookie:" cookies + cookies = 1#cookie + cookie = NAME "=" VALUE *(";" cookie-av) + NAME = attr + VALUE = value + cookie-av = "Comment" "=" value + | "Domain" "=" value + | "Max-Age" "=" value + | "Path" "=" value + | "Secure" + | "Version" "=" 1*DIGIT + */ + if (sb.getConfigBool("proxy.monitorCookies", false)) { + if (respondHeader.containsKey(HeaderFramework.SET_COOKIE)) { + final Object[] entry = new Object[]{new Date(), targetclient, respondHeader.getMultiple(HeaderFramework.SET_COOKIE)}; + synchronized(sb.incomingCookies) { + sb.incomingCookies.put(serverhost, entry); + } + } + } + } + + /** + * @param conProp a collection of properties about the connection, like URL + * @param requestHeader The header lines of the connection from the request + * @param respond the OutputStream to the client + * @see de.anomic.http.httpdHandler#doGet(java.util.Properties, net.yacy.cora.protocol.HeaderFramework, java.io.OutputStream) + */ + public static void doGet(final HashMap conProp, final RequestHeader requestHeader, final OutputStream respond) { + ByteCountOutputStream countedRespond = null; + try { + final int reqID = requestHeader.hashCode(); + // remembering the starting time of 
the request + final Date requestDate = new Date(); // remember the time... + conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_START, Long.valueOf(requestDate.getTime())); + if (yacyTrigger) net.yacy.peers.Network.triggerOnlineAction(); + sb.proxyLastAccess = System.currentTimeMillis(); + + // using an ByteCount OutputStream to count the send bytes (needed for the logfile) + countedRespond = new ByteCountOutputStream(respond,((String) conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE)).length() + 2,"PROXY"); + + String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); + String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/' + final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given + final String ip = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); // the ip from the connecting peer + int pos=0; + int port=0; + + DigestURI url = null; + try { + url = new DigestURI(HeaderFramework.getRequestURL(conProp)); + if (log.isFine()) log.logFine(reqID +" GET "+ url); + if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); + + //redirector + if (redirectorEnabled){ + synchronized(redirectorProcess){ + redirectorWriter.println(url.toNormalform(false, true)); + redirectorWriter.flush(); + } + final String newUrl = redirectorReader.readLine(); + if (!newUrl.equals("")) { + try { + url = new DigestURI(newUrl); + } catch(final MalformedURLException e){}//just keep the old one + } + if (log.isFinest()) log.logFinest(reqID +" using redirector to "+ url); + conProp.put(HeaderFramework.CONNECTION_PROP_HOST, url.getHost()+":"+url.getPort()); + conProp.put(HeaderFramework.CONNECTION_PROP_PATH, url.getPath()); + requestHeader.put(HeaderFramework.HOST, url.getHost()+":"+url.getPort()); + requestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, url.getPath()); + } + } catch (final MalformedURLException e) { + final String 
errorMsg = "ERROR: internal error with url generation: host=" + + host + ", port=" + port + ", path=" + path + ", args=" + args; + log.logSevere(errorMsg); + HTTPDemon.sendRespondError(conProp,countedRespond,4,501,null,errorMsg,e); + return; + } + + if ((pos = host.indexOf(':')) < 0) { + port = 80; + } else { + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + // check the blacklist + // blacklist idea inspired by [AS]: + // respond a 404 for all AGIS ("all you get is shit") servers + final String hostlow = host.toLowerCase(); + if (args != null) { path = path + "?" + args; } + if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_PROXY, hostlow, path)) { + log.logInfo("AGIS blocking of host '" + hostlow + "'"); + HTTPDemon.sendRespondError(conProp,countedRespond,4,403,null, + "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); + return; + } + + // handle outgoing cookies + handleOutgoingCookies(requestHeader, host, ip); + prepareRequestHeader(conProp, requestHeader, hostlow); + final ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash()); + + // why are files unzipped upon arrival? why not zip all files in cache? + // This follows from the following premises + // (a) no file shall be unzip-ed more than once to prevent unnecessary computing time + // (b) old cache entries shall be comparable with refill-entries to detect/distinguish case 3+4 + // (c) the indexing mechanism needs files unzip-ed, a schedule could do that later + // case b and c contradicts, if we use a scheduler, because files in a stale cache would be unzipped + // and the newly arrival would be zipped and would have to be unzipped upon load. But then the + // scheduler is superfluous. Therefore the only reminding case is + // (d) cached files shall be either all zipped or unzipped + // case d contradicts with a, because files need to be unzipped for indexing. 
Therefore + // the only remaining case is to unzip files right upon load. Thats what we do here. + + // finally use existing cache if appropriate + // here we must decide weather or not to save the data + // to a cache + // we distinguish four CACHE STATE cases: + // 1. cache fill + // 2. cache fresh - no refill + // 3. cache stale - refill - necessary + // 4. cache stale - refill - superfluous + // in two of these cases we trigger a scheduler to handle newly arrived files: + // case 1 and case 3 + if (cachedResponseHeader == null) { + if (log.isFinest()) log.logFinest(reqID + " page not in cache: fulfill request from web"); + fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond); + } else { + final Request request = new Request( + null, + url, + requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), + "", + cachedResponseHeader.lastModified(), + sb.crawler.defaultProxyProfile.handle(), + 0, + 0, + 0, + 0); + final Response response = new Response( + request, + requestHeader, + cachedResponseHeader, + "200 OK", + sb.crawler.defaultProxyProfile, + true + ); + final byte[] cacheContent = Cache.getContent(url.hash()); + if (cacheContent != null && response.isFreshForProxy()) { + if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache"); + fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond); + } else { + if (log.isFinest()) log.logFinest(reqID + " fulfill request from web"); + fulfillRequestFromWeb(conProp, url, requestHeader, cachedResponseHeader, countedRespond); + } + } + + + } catch (final Exception e) { + try { + final String exTxt = e.getMessage(); + if ((exTxt!=null)&&(exTxt.startsWith("Socket closed"))) { + forceConnectionClose(conProp); + } else if (!conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + final String errorMsg = "Unexpected Error. 
" + e.getClass().getName() + ": " + e.getMessage(); + HTTPDemon.sendRespondError(conProp,countedRespond,4,501,null,errorMsg,e); + log.logSevere(errorMsg); + } else { + forceConnectionClose(conProp); + } + } catch (final Exception ee) { + forceConnectionClose(conProp); + } + } finally { + try { if(countedRespond != null) countedRespond.flush(); else if(respond != null) respond.flush(); } catch (final Exception e) {} + if (countedRespond != null) countedRespond.finish(); + + conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_END, Long.valueOf(System.currentTimeMillis())); + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE,(countedRespond != null) ? Long.toString(countedRespond.getCount()) : -1L); + logProxyAccess(conProp); + } + } + + private static void fulfillRequestFromWeb(final HashMap conProp, final DigestURI url, final RequestHeader requestHeader, final ResponseHeader cachedResponseHeader, final OutputStream respond) { + try { + final int reqID = requestHeader.hashCode(); + + String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); + final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); // always starts with leading '/' + final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given + final String ip = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); // the ip from the connecting peer + final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); // the ip from the connecting peer + + int port, pos; + if ((pos = host.indexOf(':')) < 0) { + port = 80; + } else { + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + // resolve yacy and yacyh domains + String yAddress = resolveYacyDomains(host); + + // re-calc the url path + final String remotePath = (args == null) ? path : (path + "?" 
+ args); // with leading '/' + + // remove yacy-subdomain-path, when accessing /env + if ( (yAddress != null) + && (remotePath.startsWith("/env")) + && ((pos = yAddress.indexOf('/')) != -1) + ) yAddress = yAddress.substring(0, yAddress.indexOf('/')); + + modifyProxyHeaders(requestHeader, httpVer); + + final String connectHost = hostPart(host, port, yAddress); + final String getUrl = "http://"+ connectHost + remotePath; + + requestHeader.remove(HeaderFramework.HOST); + + final HTTPClient client = setupHttpClient(requestHeader, connectHost); + + // send request + try { + client.GET(getUrl); + if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); + conProp.put(HeaderFramework.CONNECTION_PROP_CLIENT_REQUEST_HEADER, requestHeader); + + final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + // determine if it's an internal error of the httpc + if (responseHeader.isEmpty()) { + throw new Exception(client.getHttpResponse().getStatusLine().toString()); + } + + if(AugmentedHtmlStream.supportsMime(responseHeader.mime())) { + // enable chunk encoding, because we don't know the length after annotating + responseHeader.remove(HeaderFramework.CONTENT_LENGTH); + responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); + + } + + ChunkedOutputStream chunkedOut = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), respond); + + // the cache does either not exist or is (supposed to be) stale + long sizeBeforeDelete = -1; + if (cachedResponseHeader != null) { + // delete the cache + final ResponseHeader rh = Cache.getResponseHeader(url.hash()); + if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) { + final byte[] b = Cache.getContent(url.hash()); + if (b != null) sizeBeforeDelete = b.length; + } + Cache.delete(url.hash()); + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); + } + 
+ // reserver cache entry + final Request request = new Request( + null, + url, + requestHeader.referer() == null ? null : new DigestURI(requestHeader.referer()).hash(), + "", + responseHeader.lastModified(), + sb.crawler.defaultProxyProfile.handle(), + 0, + 0, + 0, + sizeBeforeDelete < 0 ? 0 : sizeBeforeDelete); + + + // handle incoming cookies + handleIncomingCookies(responseHeader, host, ip); + +// prepareResponseHeader(responseHeader, res.getHttpVer()); + prepareResponseHeader(responseHeader, client.getHttpResponse().getProtocolVersion().toString()); + + if(AugmentedHtmlStream.supportsMime(responseHeader.mime())) { + // chunked encoding disables somewhere, add it again + responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); + } + + if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); + HTTPDemon.sendRespondHeader( + conProp, + respond, + httpVer, + client.getHttpResponse().getStatusLine().getStatusCode(), + client.getHttpResponse().getStatusLine().toString(), // status text + responseHeader); + + if (hasBody(client.getHttpResponse().getStatusLine().getStatusCode())) { + + OutputStream outStream = chunkedOut != null ? 
chunkedOut : respond; + final Response response = new Response( + request, + requestHeader, + responseHeader, + Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()), + sb.crawler.defaultProxyProfile, + true + ); + final String storeError = response.shallStoreCacheForProxy(); + final boolean storeHTCache = response.profile().storeHTCache(); + final String supportError = TextParser.supports(response.url(), response.getMimeType()); + + if(AugmentedHtmlStream.supportsMime(responseHeader.mime())) { + outStream = new AugmentedHtmlStream(outStream, responseHeader.getCharSet(), url, url.hash(), requestHeader); + } + if ( + /* + * Now we store the response into the htcache directory if + * a) the response is cacheable AND + */ + (storeError == null) && + /* + * b) the user has configured to use the htcache OR + * c) the content should be indexed + */ + ((storeHTCache) || (supportError != null)) + ) { + // we don't write actually into a file, only to RAM, and schedule writing the file. +// int l = res.getResponseHeader().size(); + final int l = responseHeader.size(); + final ByteArrayOutputStream byteStream = new ByteArrayOutputStream((l < 32) ? 32 : l); + + final OutputStream toClientAndMemory = new MultiOutputStream(new OutputStream[] {outStream, byteStream}); +// FileUtils.copy(res.getDataAsStream(), toClientAndMemory); + client.writeTo(toClientAndMemory); + // cached bytes + byte[] cacheArray; + if (byteStream.size() > 0) { + cacheArray = byteStream.toByteArray(); + } else { + cacheArray = null; + } + if (log.isFine()) log.logFine(reqID +" writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? 
"null" : ("size=" + cacheArray.length))); + + if (sizeBeforeDelete == -1) { + // totally fresh file + response.setContent(cacheArray); + try { + Cache.store(response.url(), response.getResponseHeader(), cacheArray); + sb.toIndexer(response); + } catch (final IOException e) { + log.logWarning("cannot write " + response.url() + " to Cache (1): " + e.getMessage(), e); + } + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_MISS"); + } else if (cacheArray != null && sizeBeforeDelete == cacheArray.length) { + // before we came here we deleted a cache entry + cacheArray = null; + //cacheManager.push(cacheEntry); // unnecessary update + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REF_FAIL_HIT"); + } else { + // before we came here we deleted a cache entry + response.setContent(cacheArray); + try { + Cache.store(response.url(), response.getResponseHeader(), cacheArray); + sb.toIndexer(response); + } catch (final IOException e) { + log.logWarning("cannot write " + response.url() + " to Cache (2): " + e.getMessage(), e); + } + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); + } + } else { + // no caching + if (log.isFine()) log.logFine(reqID +" "+ url.toString() + " not cached." + + " StoreError=" + ((storeError==null)?"None":storeError) + + " StoreHTCache=" + storeHTCache + + " SupportError=" + supportError); + +// FileUtils.copy(res.getDataAsStream(), outStream); + client.writeTo(outStream); + + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS"); + } + + outStream.close(); + if (chunkedOut != null) { + chunkedOut.finish(); + chunkedOut.flush(); + } + } // end hasBody + } catch(final SocketException se) { + // if opened ... +// if(res != null) { +// // client cut proxy connection, abort download +// res.abort(); +// } + client.finish(); + handleProxyException(se,conProp,respond,url); + } finally { + // if opened ... +// if(res != null) { +// // ... 
close connection +// res.closeStream(); +// } + client.finish(); + } + } catch (final Exception e) { + handleProxyException(e,conProp,respond,url); + } + } + + /** + * determines if the response should have a body + * + * @param statusCode + * @param responseHeader + * @return + */ + private static boolean hasBody(final int statusCode) { + // "All 1xx (informational), 204 (no content), and 304 (not modified) responses MUST NOT + // include a message-body." + // [RFC 2616 HTTP/1.1, Sect. 4.3] and like [RFC 1945 HTTP/1.0, Sect. 7.2] + if((statusCode >= 100 && statusCode < 200) || statusCode == 204 || statusCode == 304) { + return false; + } + return true; + } + + private static void fulfillRequestFromCache( + final HashMap conProp, + final DigestURI url, + final RequestHeader requestHeader, + final ResponseHeader cachedResponseHeader, + final byte[] cacheEntry, + OutputStream respond + ) throws IOException { + + final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); + + // we respond on the request by using the cache, the cache is fresh + try { + prepareResponseHeader(cachedResponseHeader, httpVer); + + // replace date field in old header by actual date, this is according to RFC + cachedResponseHeader.put(HeaderFramework.DATE, HeaderFramework.formatRFC1123(new Date())); + + // check if we can send a 304 instead the complete content + if (requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) { + // conditional request: freshness of cache for that condition was already + // checked within shallUseCache(). 
Now send only a 304 response + log.logInfo("CACHE HIT/304 " + url.toString()); + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_HIT"); + + // setting the content length header to 0 + cachedResponseHeader.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(0)); + + // send cached header with replaced date and added length + HTTPDemon.sendRespondHeader(conProp,respond,httpVer,304,cachedResponseHeader); + //respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified' + } else { + // unconditional request: send content of cache + log.logInfo("CACHE HIT/203 " + url.toString()); + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_HIT"); + + // setting the content header to the proper length + cachedResponseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(cacheEntry.length)); + + // send cached header with replaced date and added length + HTTPDemon.sendRespondHeader(conProp,respond,httpVer,203,cachedResponseHeader); + //respondHeader(respond, "203 OK", cachedResponseHeader); // respond with 'non-authoritative' + + if(AugmentedHtmlStream.supportsMime(cachedResponseHeader.mime())) { + respond = new AugmentedHtmlStream(respond, cachedResponseHeader.getCharSet(), url, url.hash(), requestHeader); + } + + // send also the complete body now from the cache + // simply read the file and transfer to out socket + FileUtils.copy(cacheEntry, respond); + } + // that's it! 
+ } catch (final Exception e) { + // this happens if the client stops loading the file + // we do nothing here + if (conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + log.logWarning("Error while trying to send cached message body."); + conProp.put(HeaderFramework.CONNECTION_PROP_PERSISTENT,"close"); + } else { + HTTPDemon.sendRespondError(conProp,respond,4,503,"socket error: " + e.getMessage(),"socket error: " + e.getMessage(), e); + } + } finally { + try { respond.flush(); } catch (final Exception e) {} + } + return; + } + + public static void doHead(final HashMap conProp, final RequestHeader requestHeader, OutputStream respond) { + +// ResponseContainer res = null; + DigestURI url = null; + try { + final int reqID = requestHeader.hashCode(); + // remembering the starting time of the request + final Date requestDate = new Date(); // remember the time... + conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_START, Long.valueOf(requestDate.getTime())); + if (yacyTrigger) net.yacy.peers.Network.triggerOnlineAction(); + sb.proxyLastAccess = System.currentTimeMillis(); + + // using an ByteCount OutputStream to count the send bytes + respond = new ByteCountOutputStream(respond,((String) conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE)).length() + 2,"PROXY"); + + String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); + final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); + final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); + final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); + + int port, pos; + if ((pos = host.indexOf(':')) < 0) { + port = 80; + } else { + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + try { + url = new DigestURI("http", host, port, (args == null) ? path : path + "?" 
+ args); + } catch (final MalformedURLException e) { + final String errorMsg = "ERROR: internal error with url generation: host=" + + host + ", port=" + port + ", path=" + path + ", args=" + args; + log.logSevere(errorMsg); + HTTPDemon.sendRespondError(conProp,respond,4,501,null,errorMsg,e); + return; + } + if (log.isFine()) log.logFine(reqID +" HEAD "+ url); + if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); + + // check the blacklist, inspired by [AS]: respond a 404 for all AGIS (all you get is shit) servers + final String hostlow = host.toLowerCase(); + + // re-calc the url path + final String remotePath = (args == null) ? path : (path + "?" + args); + + if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_PROXY, hostlow, remotePath)) { + HTTPDemon.sendRespondError(conProp,respond,4,403,null, + "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); + log.logInfo("AGIS blocking of host '" + hostlow + "'"); + return; + } + + prepareRequestHeader(conProp, requestHeader, hostlow); + + // resolve yacy and yacyh domains + String yAddress = resolveYacyDomains(host); + + // remove yacy-subdomain-path, when accessing /env + if ( (yAddress != null) + && (remotePath.startsWith("/env")) + && ((pos = yAddress.indexOf('/')) != -1) + ) yAddress = yAddress.substring(0, yAddress.indexOf('/')); + + modifyProxyHeaders(requestHeader, httpVer); + + // generate request-url + final String connectHost = hostPart(host, port, yAddress); + final String getUrl = "http://"+ connectHost + remotePath; + if (log.isFinest()) log.logFinest(reqID +" using url: "+ getUrl); + + final HTTPClient client = setupHttpClient(requestHeader, connectHost); + + // send request +// try { +// res = client.HEAD(getUrl); +// if (log.isFinest()) log.logFinest(reqID +" response status: "+ res.getStatusLine()); + client.HEADResponse(getUrl); + if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); + + // determine if it's an 
internal error of the httpc +// final ResponseHeader responseHeader = res.getResponseHeader(); +// if (responseHeader.isEmpty()) { +// throw new Exception(res.getStatusLine()); +// } + final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + if (responseHeader.isEmpty()) { + throw new Exception(client.getHttpResponse().getStatusLine().toString()); + } + +// prepareResponseHeader(responseHeader, res.getHttpVer()); + prepareResponseHeader(responseHeader, client.getHttpResponse().getStatusLine().getProtocolVersion().toString()); + + // sending the server respond back to the client + if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); +// HTTPDemon.sendRespondHeader(conProp,respond,httpVer,res.getStatusCode(),res.getStatusLine().substring(4),responseHeader); + HTTPDemon.sendRespondHeader( + conProp, + respond, + httpVer, + client.getHttpResponse().getStatusLine().getStatusCode(), + client.getHttpResponse().getStatusLine().toString(), + responseHeader); + respond.flush(); +// } finally { +// if(res != null) { +// // ... close connection +// res.closeStream(); +// } +// } + } catch (final Exception e) { + handleProxyException(e,conProp,respond,url); + } + } + + public static void doPost(final HashMap conProp, final RequestHeader requestHeader, final OutputStream respond, final InputStream body) throws IOException { + assert conProp != null : "precondition violated: conProp != null"; + assert requestHeader != null : "precondition violated: requestHeader != null"; + assert body != null : "precondition violated: body != null"; + DigestURI url = null; + ByteCountOutputStream countedRespond = null; + try { + final int reqID = requestHeader.hashCode(); + // remembering the starting time of the request + final Date requestDate = new Date(); // remember the time... 
+ conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_START, Long.valueOf(requestDate.getTime())); + if (yacyTrigger) net.yacy.peers.Network.triggerOnlineAction(); + sb.proxyLastAccess = System.currentTimeMillis(); + + // using an ByteCount OutputStream to count the send bytes + countedRespond = new ByteCountOutputStream(respond,((String) conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE)).length() + 2,"PROXY"); + + String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); + final String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); + final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); // may be null if no args were given + final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); + + int port, pos; + if ((pos = host.indexOf(':')) < 0) { + port = 80; + } else { + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + try { + url = new DigestURI("http", host, port, (args == null) ? path : path + "?" + args); + } catch (final MalformedURLException e) { + final String errorMsg = "ERROR: internal error with url generation: host=" + + host + ", port=" + port + ", path=" + path + ", args=" + args; + log.logSevere(errorMsg); + HTTPDemon.sendRespondError(conProp,countedRespond,4,501,null,errorMsg,e); + return; + } + if (log.isFine()) log.logFine(reqID +" POST "+ url); + if (log.isFinest()) log.logFinest(reqID +" header: "+ requestHeader); + + prepareRequestHeader(conProp, requestHeader, host.toLowerCase()); + + String yAddress = resolveYacyDomains(host); + + // re-calc the url path + final String remotePath = (args == null) ? path : (path + "?" 
+ args); + + // remove yacy-subdomain-path, when accessing /env + if ( (yAddress != null) + && (remotePath.startsWith("/env")) + && ((pos = yAddress.indexOf('/')) != -1) + ) yAddress = yAddress.substring(0, yAddress.indexOf('/')); + + modifyProxyHeaders(requestHeader, httpVer); + + final String connectHost = hostPart(host, port, yAddress); + final String getUrl = "http://"+ connectHost + remotePath; + if (log.isFinest()) log.logFinest(reqID +" using url: "+ getUrl); + + // the CONTENT_LENGTH will be added by entity and cause a ClientProtocolException if set + final int contentLength = requestHeader.getContentLength(); + requestHeader.remove(HeaderFramework.CONTENT_LENGTH); + + final HTTPClient client = setupHttpClient(requestHeader, connectHost); + + // check input + if(body == null) { + log.logSevere("no body to POST!"); + } + try { + // sending the request + client.POST(getUrl, body, contentLength); + if (log.isFinest()) log.logFinest(reqID +" response status: "+ client.getHttpResponse().getStatusLine()); + + final ResponseHeader responseHeader = new ResponseHeader(client.getHttpResponse().getAllHeaders()); + // determine if it's an internal error of the httpc + if (responseHeader.isEmpty()) { + throw new Exception(client.getHttpResponse().getStatusLine().toString()); + } + + final ChunkedOutputStream chunked = setTransferEncoding(conProp, responseHeader, client.getHttpResponse().getStatusLine().getStatusCode(), countedRespond); + + prepareResponseHeader(responseHeader, client.getHttpResponse().getProtocolVersion().toString()); + + // sending the respond header back to the client + if (chunked != null) { + responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); + } + + // sending response headers + if (log.isFinest()) log.logFinest(reqID +" sending response header: "+ responseHeader); + HTTPDemon.sendRespondHeader(conProp, + countedRespond, + httpVer, + client.getHttpResponse().getStatusLine().getStatusCode(), + 
client.getHttpResponse().getStatusLine().toString(), // status text + responseHeader); + + final OutputStream outStream = (chunked != null) ? chunked : countedRespond; + client.writeTo(outStream); + + if (chunked != null) { + chunked.finish(); + } + outStream.flush(); + } catch(final SocketException se) { + // connection closed by client, abort download + client.finish(); + } finally { + client.finish(); + } + } catch (final Exception e) { + handleProxyException(e,conProp,countedRespond,url); + } finally { + if(countedRespond != null) { + countedRespond.flush(); + countedRespond.finish(); + } + if(respond != null) { + respond.flush(); + } + + conProp.put(HeaderFramework.CONNECTION_PROP_REQUEST_END, Long.valueOf(System.currentTimeMillis())); + conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE,(countedRespond != null) ? Long.toString(countedRespond.getCount()) : "-1"); + logProxyAccess(conProp); + } + } + + /** + * resolve yacy and yacyh domains + * + * @param host + * @return + */ + private static String resolveYacyDomains(final String host) { + return (HTTPDemon.getAlternativeResolver() == null) ? null : HTTPDemon.getAlternativeResolver().resolve(host); + } + + /** + * @param host + * @param port + * @param yAddress + * @return + */ + private static String hostPart(final String host, final int port, final String yAddress) { + final String connectHost = (yAddress == null) ? host +":"+ port : yAddress; + return connectHost; + } + + /** + * @param conProp + * @param requestHeader + * @param hostlow + */ + private static void prepareRequestHeader(final HashMap conProp, final RequestHeader requestHeader, final String hostlow) { + // set another userAgent, if not yellow-listed + if ((yellowList != null) && (!(yellowList.contains(domain(hostlow))))) { + // change the User-Agent + requestHeader.put(HeaderFramework.USER_AGENT, generateUserAgent(requestHeader)); + } + + // only gzip-encoding is supported, remove other encodings (e. g. 
deflate) + if ((requestHeader.get(HeaderFramework.ACCEPT_ENCODING,"")).indexOf("gzip",0) != -1) { + requestHeader.put(HeaderFramework.ACCEPT_ENCODING, "gzip"); + } else { + requestHeader.put(HeaderFramework.ACCEPT_ENCODING, ""); + } + + addXForwardedForHeader(conProp, requestHeader); + } + + private static String domain(final String host) { + String domain = host; + int pos = domain.lastIndexOf('.'); + if (pos >= 0) { + // truncate from last part + domain = domain.substring(0, pos); + pos = domain.lastIndexOf('.'); + if (pos >= 0) { + // truncate from first part + domain = domain.substring(pos + 1); + } + } + return domain; + } + + /** + * creates a new HttpClient and sets parameters according to proxy needs + * + * @param requestHeader + * @param connectHost may be 'host:port' or 'host:port/path' + * @return + */ + private static HTTPClient setupHttpClient(final RequestHeader requestHeader, final String connectHost) { + // setup HTTP-client + final HTTPClient client = new HTTPClient(); + client.setTimout(timeout); + client.setHeader(requestHeader.entrySet()); + client.setRedirecting(false); + return client; + } + + /** + * determines in which form the response should be send and sets header accordingly + * if the content length is not set we need to use chunked content encoding + * Implemented: + * if !content-length + * switch httpVer + * case 0.9: + * case 1.0: + * close connection after transfer + * break; + * default: + * new ChunkedStream around respond + * end if + * + * @param conProp + * @param responseHeader + * @param statusCode + * @param respond + * @return + */ + private static ChunkedOutputStream setTransferEncoding( + final HashMap conProp, final ResponseHeader responseHeader, + final int statusCode, final OutputStream respond) { + final String httpVer = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); + ChunkedOutputStream chunkedOut = null; + // gzipped response is ungzipped an therefor the length is unknown + if 
(responseHeader.gzip() || responseHeader.getContentLength() < 0) { + // according to http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html + // a 204,304 message must not contain a message body. + // Therefore we need to set the content-length to 0. + if (statusCode == 204 || statusCode == 304) { + responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0"); + } else { + if (httpVer.equals(HeaderFramework.HTTP_VERSION_0_9) || httpVer.equals(HeaderFramework.HTTP_VERSION_1_0)) { + forceConnectionClose(conProp); + } else { + chunkedOut = new ChunkedOutputStream(respond); + responseHeader.put(HeaderFramework.TRANSFER_ENCODING, "chunked"); + } + responseHeader.remove(HeaderFramework.CONTENT_LENGTH); + } + } + return chunkedOut; + } + + /** + * @param res + * @param responseHeader + */ + private static void prepareResponseHeader(final ResponseHeader responseHeader, final String httpVer) { + modifyProxyHeaders(responseHeader, httpVer); + correctContentEncoding(responseHeader); + } + + /** + * @param responseHeader + */ + private static void correctContentEncoding(final ResponseHeader responseHeader) { + // TODO gzip again? set "correct" encoding? 
+ if(responseHeader.gzip()) { + responseHeader.remove(HeaderFramework.CONTENT_ENCODING); + responseHeader.remove(HeaderFramework.CONTENT_LENGTH); // remove gziped length + } + } + + /** + * adds the client-IP of conProp to the requestHeader + * + * @param conProp + * @param requestHeader + */ + private static void addXForwardedForHeader(final HashMap conProp, final RequestHeader requestHeader) { + // setting the X-Forwarded-For Header + if (sb.getConfigBool("proxy.sendXForwardedForHeader", true)) { + requestHeader.put(HeaderFramework.X_FORWARDED_FOR, (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP)); + } + } + + /** + * removing hop by hop headers and adding additional headers + * + * @param requestHeader + * @param httpVer + */ + private static void modifyProxyHeaders(final HeaderFramework requestHeader, final String httpVer) { + removeHopByHopHeaders(requestHeader); + setViaHeader(requestHeader, httpVer); + } + + private static void removeHopByHopHeaders(final HeaderFramework headers) { + /* + - Trailers + */ + + headers.remove(RequestHeader.CONNECTION); + headers.remove(RequestHeader.KEEP_ALIVE); + headers.remove(RequestHeader.UPGRADE); + headers.remove(RequestHeader.TE); + headers.remove(RequestHeader.PROXY_CONNECTION); + headers.remove(RequestHeader.PROXY_AUTHENTICATE); + headers.remove(RequestHeader.PROXY_AUTHORIZATION); + + // special headers inserted by squid + headers.remove(RequestHeader.X_CACHE); + headers.remove(RequestHeader.X_CACHE_LOOKUP); + + // remove transfer encoding header + // headers.remove(HeaderFramework.TRANSFER_ENCODING); + + //removing yacy status headers + headers.remove(HeaderFramework.X_YACY_KEEP_ALIVE_REQUEST_COUNT); + headers.remove(HeaderFramework.X_YACY_ORIGINAL_REQUEST_LINE); + } + + private static void setViaHeader(final HeaderFramework header, final String httpVer) { + if (!sb.getConfigBool("proxy.sendViaHeader", true)) return; + final String myAddress = (HTTPDemon.getAlternativeResolver() == null) ? 
null : HTTPDemon.getAlternativeResolver().myAlternativeAddress(); + if (myAddress != null) { + + // getting header set by other proxies in the chain + final StringBuilder viaValue = new StringBuilder(80); + if (header.containsKey(HeaderFramework.VIA)) viaValue.append(header.get(HeaderFramework.VIA)); + if (viaValue.length() > 0) viaValue.append(", "); + + // appending info about this peer + viaValue + .append(httpVer).append(" ") + .append(myAddress).append(" ") + .append("(YaCy ").append(sb.getConfig("vString", "0.0")).append(")"); + + // storing header back + header.put(HeaderFramework.VIA, viaValue.toString()); + } + } + + public static void doConnect(final HashMap conProp, final RequestHeader requestHeader, final InputStream clientIn, final OutputStream clientOut) throws IOException { + + sb.proxyLastAccess = System.currentTimeMillis(); + + String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); + final String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER); + String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); + final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); + if (args != null) { path = path + "?" 
+ args; } + + int port, pos; + if ((pos = host.indexOf(':')) < 0) { + port = 80; + } else { + port = Integer.parseInt(host.substring(pos + 1)); + host = host.substring(0, pos); + } + + // check the blacklist + // blacklist idea inspired by [AS]: + // respond a 404 for all AGIS ("all you get is shit") servers + final String hostlow = host.toLowerCase(); + if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_PROXY, hostlow, path)) { + HTTPDemon.sendRespondError(conProp,clientOut,4,403,null, + "URL '" + hostlow + "' blocked by yacy proxy (blacklisted)",null); + log.logInfo("AGIS blocking of host '" + hostlow + "'"); + forceConnectionClose(conProp); + return; + } + + // possibly branch into PROXY-PROXY connection + if (ProxySettings.use && ProxySettings.use4ssl) { + final HTTPClient remoteProxy = setupHttpClient(requestHeader, host); + + try { + remoteProxy.HEADResponse("http://" + host + ":" + port); + final ResponseHeader header = new ResponseHeader(remoteProxy.getHttpResponse().getAllHeaders()); + + // outputs a logline to the serverlog with the current status + log.logInfo("CONNECT-RESPONSE: status=" + remoteProxy.getHttpResponse().getStatusLine() + ", header=" + header.toString()); + final boolean success = remoteProxy.getHttpResponse().getStatusLine().getStatusCode() >= 200 && remoteProxy.getHttpResponse().getStatusLine().getStatusCode() <= 399; + if (success) { + // replace connection details + host = ProxySettings.host; + port = ProxySettings.port; + // go on (see below) + } else { + // pass error response back to client + HTTPDemon.sendRespondHeader( + conProp, + clientOut, + httpVersion, + remoteProxy.getHttpResponse().getStatusLine().getStatusCode(), + remoteProxy.getHttpResponse().getStatusLine().toString(), + header); + //respondHeader(clientOut, response.status, response.responseHeader); + forceConnectionClose(conProp); + return; + } + } catch (final Exception e) { + throw new IOException(e.getMessage()); + } + } + + // try to establish connection to 
remote host + final Socket sslSocket = new Socket(host, port); + sslSocket.setSoTimeout(timeout); // waiting time for write + sslSocket.setSoLinger(true, timeout); // waiting time for read + final InputStream promiscuousIn = sslSocket.getInputStream(); + final OutputStream promiscuousOut = sslSocket.getOutputStream(); + + // now then we can return a success message + clientOut.write(UTF8.getBytes(httpVersion + " 200 Connection established" + serverCore.CRLF_STRING + + "Proxy-agent: YACY" + serverCore.CRLF_STRING + + serverCore.CRLF_STRING)); + + log.logInfo("SSL connection to " + host + ":" + port + " established."); + + // start stream passing with mediate processes + final Mediate cs = new Mediate(sslSocket, clientIn, promiscuousOut); + final Mediate sc = new Mediate(sslSocket, promiscuousIn, clientOut); + cs.start(); + sc.start(); + while ((sslSocket != null) && + (sslSocket.isBound()) && + (!(sslSocket.isClosed())) && + (sslSocket.isConnected()) && + ((cs.isAlive()) || (sc.isAlive()))) { + // idle + try {Thread.sleep(1000);} catch (final InterruptedException e) {} // wait a while + } + // set stop mode + cs.pleaseTerminate(); + sc.pleaseTerminate(); + // wake up thread + cs.interrupt(); + sc.interrupt(); + // ...hope they have terminated... 
+ } + + public static class Mediate extends Thread { + + boolean terminate; + Socket socket; + InputStream in; + OutputStream out; + + public Mediate(final Socket socket, final InputStream in, final OutputStream out) { + this.terminate = false; + this.in = in; + this.out = out; + this.socket = socket; + } + + @Override + public void run() { + final byte[] buffer = new byte[512]; + int len; + try { + while ((this.socket != null) && + (this.socket.isBound()) && + (!(this.socket.isClosed())) && + (this.socket.isConnected()) && + (!(this.terminate)) && + (this.in != null) && + (this.out != null) && + ((len = this.in.read(buffer)) >= 0) + ) { + this.out.write(buffer, 0, len); + } + } catch (final IOException e) { + // do nothing + } catch (final Exception e) { + Log.logException(e); + } + } + + public void pleaseTerminate() { + this.terminate = true; + } + } + + private static void handleProxyException(final Exception e, final HashMap conProp, final OutputStream respond, final DigestURI url) { + // this may happen if + // - the targeted host does not exist + // - anything with the remote server was wrong. + // - the client unexpectedly closed the connection ... + try { + + + // doing some errorhandling ... 
+ int httpStatusCode = 404; + String httpStatusText = null; + String errorMessage = null; + Exception errorExc = null; + boolean unknownError = false; + + // for customized error messages + boolean detailedErrorMsg = false; + String detailedErrorMsgFile = null; + serverObjects detailedErrorMsgMap = null; + + if (e instanceof ConnectException) { + httpStatusCode = 403; httpStatusText = "Connection refused"; + errorMessage = "Connection refused by destination host"; + } else if (e instanceof BindException) { + errorMessage = "Unable to establish a connection to the destination host"; + } else if (e instanceof NoRouteToHostException) { + errorMessage = "No route to destination host"; + } else if (e instanceof UnknownHostException) { + //errorMessage = "IP address of the destination host could not be determined"; + try { + detailedErrorMsgMap = unknownHostHandling(conProp); + httpStatusText = "Unknown Host"; + detailedErrorMsg = true; + detailedErrorMsgFile = "proxymsg/unknownHost.inc"; + } catch (final Exception e1) { + errorMessage = "IP address of the destination host could not be determined"; + } + } else if (e instanceof SocketTimeoutException) { + errorMessage = "Unable to establish a connection to the destination host. 
Connect timed out."; + } else { + final String exceptionMsg = e.getMessage(); + if ((exceptionMsg != null) && (exceptionMsg.indexOf("Corrupt GZIP trailer",0) >= 0)) { + // just do nothing, we leave it this way + if (log.isFine()) log.logFine("ignoring bad gzip trail for URL " + url + " (" + e.getMessage() + ")"); + forceConnectionClose(conProp); + } else if ((exceptionMsg != null) && (exceptionMsg.indexOf("Connection reset",0)>= 0)) { + errorMessage = "Connection reset"; + } else if ((exceptionMsg != null) && (exceptionMsg.indexOf("unknown host",0)>=0)) { + try { + detailedErrorMsgMap = unknownHostHandling(conProp); + httpStatusText = "Unknown Host"; + detailedErrorMsg = true; + detailedErrorMsgFile = "proxymsg/unknownHost.inc"; + } catch (final Exception e1) { + errorMessage = "IP address of the destination host could not be determined"; + } + } else if ((exceptionMsg != null) && + ( + (exceptionMsg.indexOf("socket write error",0)>=0) || + (exceptionMsg.indexOf("Read timed out",0) >= 0) || + (exceptionMsg.indexOf("Broken pipe",0) >= 0) || + (exceptionMsg.indexOf("server has closed connection",0) >= 0) + )) { + errorMessage = exceptionMsg; + Log.logException(e); + } else { + errorMessage = "Unexpected Error. 
" + e.getClass().getName() + ": " + e.getMessage(); + unknownError = true; + errorExc = e; + } + } + + // sending back an error message to the client + if (!conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + if (detailedErrorMsg) { + HTTPDemon.sendRespondError(conProp,respond, httpStatusCode, httpStatusText, new File(detailedErrorMsgFile), detailedErrorMsgMap, errorExc); + } else { + HTTPDemon.sendRespondError(conProp,respond,4,httpStatusCode,httpStatusText,errorMessage,errorExc); + } + } else { + if (unknownError) { + log.logSevere("Unknown Error while processing request '" + + conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE) + "':" + + "\n" + Thread.currentThread().getName() + + "\n" + errorMessage,e); + } else { + log.logWarning("Error while processing request '" + + conProp.get(HeaderFramework.CONNECTION_PROP_REQUESTLINE) + "':" + + "\n" + Thread.currentThread().getName() + + "\n" + errorMessage); + } + forceConnectionClose(conProp); + } + } catch (final Exception ee) { + forceConnectionClose(conProp); + } + + } + + private static void forceConnectionClose(final HashMap conProp) { + if (conProp != null) { + conProp.put(HeaderFramework.CONNECTION_PROP_PERSISTENT,"close"); + } + } + + private static serverObjects unknownHostHandling(final HashMap conProp) throws Exception { + final serverObjects detailedErrorMsgMap = new serverObjects(); + + // generic toplevel domains + final HashSet topLevelDomains = new HashSet(Arrays.asList(new String[]{ + "aero", // Fluggesellschaften/Luftfahrt + "arpa", // Einrichtung des ARPANet + "biz", // Business + "com", // Commercial + "coop", // genossenschaftliche Unternehmen + "edu", // Education + "gov", // Government + "info", // Informationsangebote + "int", // International + "jobs", // Jobangebote von Unternemen + "mil", // Military (US-Militaer) + // "museum", // Museen + "name", // Privatpersonen + "nato", // NATO (veraltet) + "net", // Net (Netzwerkbetreiber) + "org", // Organization 
(Nichtkommerzielle Organisation) + "pro", // Professionals + "travel", // Touristikindustrie + + // some country tlds + "de", + "at", + "ch", + "it", + "uk" + })); + + // getting some connection properties + String orgHostPort = "80"; + String orgHostName = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); + if (orgHostName == null) orgHostName = "unknown"; + orgHostName = orgHostName.toLowerCase(); + int pos = orgHostName.indexOf(':'); + if (pos != -1) { + orgHostPort = orgHostName.substring(pos+1); + orgHostName = orgHostName.substring(0,pos); + } + String orgHostPath = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH); if (orgHostPath == null) orgHostPath = ""; + String orgHostArgs = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); if (orgHostArgs == null) orgHostArgs = ""; + if (orgHostArgs.length() > 0) orgHostArgs = "?" + orgHostArgs; + detailedErrorMsgMap.put("hostName", orgHostName); + + // guessing hostnames + final HashSet testHostNames = new HashSet(); + String testHostName = null; + if (!orgHostName.startsWith("www.")) { + testHostName = "www." + orgHostName; + final InetAddress addr = Domains.dnsResolve(testHostName); + if (addr != null) testHostNames.add(testHostName); + } else if (orgHostName.startsWith("www.")) { + testHostName = orgHostName.substring(4); + final InetAddress addr = Domains.dnsResolve(testHostName); + if (addr != null) if (addr != null) testHostNames.add(testHostName); + } + if (orgHostName.length()>4 && orgHostName.startsWith("www") && (orgHostName.charAt(3) != '.')) { + testHostName = orgHostName.substring(0,3) + "." 
+ orgHostName.substring(3); + final InetAddress addr = Domains.dnsResolve(testHostName); + if (addr != null) if (addr != null) testHostNames.add(testHostName); + } + + pos = orgHostName.lastIndexOf('.'); + if (pos != -1) { + final Iterator iter = topLevelDomains.iterator(); + while (iter.hasNext()) { + final String topLevelDomain = iter.next(); + testHostName = orgHostName.substring(0,pos) + "." + topLevelDomain; + final InetAddress addr = Domains.dnsResolve(testHostName); + if (addr != null) if (addr != null) testHostNames.add(testHostName); + } + } + + int hostNameCount = 0; + final Iterator iter = testHostNames.iterator(); + while (iter.hasNext()) { + testHostName = iter.next(); + detailedErrorMsgMap.put("list_" + hostNameCount + "_hostName",testHostName); + detailedErrorMsgMap.put("list_" + hostNameCount + "_hostPort",orgHostPort); + detailedErrorMsgMap.put("list_" + hostNameCount + "_hostPath",orgHostPath); + detailedErrorMsgMap.put("list_" + hostNameCount + "_hostArgs",orgHostArgs); + hostNameCount++; + } + + detailedErrorMsgMap.put("list", hostNameCount); + + if (hostNameCount != 0) { + detailedErrorMsgMap.put("showList", 1); + } else { + detailedErrorMsgMap.put("showList", 0); + } + + return detailedErrorMsgMap; + } + + private static synchronized String generateUserAgent(final HeaderFramework requestHeaders) { + userAgentStr.setLength(0); + + final String browserUserAgent = requestHeaders.get(HeaderFramework.USER_AGENT, yacyProxyUserAgent); + final int pos = browserUserAgent.lastIndexOf(')'); + if (pos >= 0) { + userAgentStr + .append(browserUserAgent.substring(0,pos)) + .append("; YaCy ") + .append(sb.getConfig("vString","0.1")) + .append("; yacy.net") + .append(browserUserAgent.substring(pos)); + } else { + userAgentStr.append(browserUserAgent); + } + + return userAgentStr.toString(); + } + + /** + * This function is used to generate a logging message according to the + * squid logging format.

+ * e.g.
+ * 1117528623.857 178 192.168.1.201 TCP_MISS/200 1069 GET http://www.yacy.de/ - DIRECT/81.169.145.74 text/html + */ + private final static synchronized void logProxyAccess(final HashMap conProp) { + + if (!doAccessLogging) return; + + logMessage.setLength(0); + + // Timestamp + final String currentTimestamp = Long.toString(System.currentTimeMillis()); + final int offset = currentTimestamp.length()-3; + + logMessage.append(currentTimestamp.substring(0,offset)); + logMessage.append('.'); + logMessage.append(currentTimestamp.substring(offset)); + logMessage.append(' '); + + // Elapsed time + final Long requestStart = (Long) conProp.get(HeaderFramework.CONNECTION_PROP_REQUEST_START); + final Long requestEnd = (Long) conProp.get(HeaderFramework.CONNECTION_PROP_REQUEST_END); + final String elapsed = Long.toString(requestEnd.longValue()-requestStart.longValue()); + + for (int i=0; i<6-elapsed.length(); i++) logMessage.append(' '); + logMessage.append(elapsed); + logMessage.append(' '); + + // Remote Host + final String clientIP = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); + logMessage.append(clientIP); + logMessage.append(' '); + + // Code/Status + final String respondStatus = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS); + String respondCode = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE); + if (respondCode == null) respondCode = "UNKNOWN"; + logMessage.append(respondCode); + logMessage.append("/"); + logMessage.append(respondStatus); + logMessage.append(' '); + + // Bytes + final String bytes = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE); + logMessage.append(bytes.toString()); + logMessage.append(' '); + + // Method + final String requestMethod = (String) conProp.get(HeaderFramework.CONNECTION_PROP_METHOD); + logMessage.append(requestMethod); + logMessage.append(' '); + + // URL + final String requestURL = (String) conProp.get(HeaderFramework.CONNECTION_PROP_URL); 
+ final String requestArgs = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS); + logMessage.append(requestURL); + if (requestArgs != null) { + logMessage.append("?") + .append(requestArgs); + } + logMessage.append(' '); + + // Rfc931 + logMessage.append("-"); + logMessage.append(' '); + + // Peerstatus/Peerhost + final String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST); + logMessage.append("DIRECT/"); + logMessage.append(host); + logMessage.append(' '); + + // Type + String mime = "-"; + if (conProp.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER)) { + final HeaderFramework proxyRespondHeader = (HeaderFramework) conProp.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER); + mime = proxyRespondHeader.mime(); + if (mime.indexOf(';') != -1) { + mime = mime.substring(0,mime.indexOf(';')); + } + } + logMessage.append(mime); + + // sending the logging message to the logger + if (proxyLog.isFine()) proxyLog.logFine(logMessage.toString()); + } + +} + +/* + proxy test: + + http://www.chipchapin.com/WebTools/cookietest.php? 
+ http://xlists.aza.org/moderator/cookietest/cookietest1.php + http://vancouver-webpages.com/proxy/cache-test.html + + */ diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 08ae5d827..516eb6739 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -87,6 +87,7 @@ public class genericImageParser extends AbstractParser implements Parser { super("Generic Image Parser"); } + @Override public Document[] parse( final MultiProtocolURI location, final String mimeType, @@ -211,10 +212,12 @@ public class genericImageParser extends AbstractParser implements Parser { false)}; // images } + @Override public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + @Override public Set supportedExtensions() { return SUPPORTED_EXTENSIONS; } diff --git a/source/net/yacy/interaction/AugmentHtmlStream.java b/source/net/yacy/interaction/AugmentHtmlStream.java new file mode 100644 index 000000000..ddb7aeb81 --- /dev/null +++ b/source/net/yacy/interaction/AugmentHtmlStream.java @@ -0,0 +1,25 @@ +package net.yacy.interaction; + +import java.nio.charset.Charset; + +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.data.meta.DigestURI; + + +public class AugmentHtmlStream { + + public static StringBuffer process (StringBuffer data, Charset charset, DigestURI url, RequestHeader requestHeader) { + + boolean augmented = false; + + String Doc = data.toString(); + + if (augmented) { + + return (new StringBuffer (Doc)); + } else { + return (data); + } + } + +} diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java index e5823db7a..d4b46cf35 100644 --- a/source/net/yacy/kelondro/blob/ArrayStack.java +++ b/source/net/yacy/kelondro/blob/ArrayStack.java @@ -628,7 +628,7 @@ public class ArrayStack implements BLOB { */ @Override public byte[] 
get(final byte[] key) throws IOException, RowSpaceExceededException { - if (this.blobs.size() == 0) return null; + if (this.blobs == null || this.blobs.size() == 0) return null; if (this.blobs.size() == 1) { final blobItem bi = this.blobs.get(0); return bi.blob.get(key);