diff --git a/htroot/IndexCreateLoaderQueue_p.java b/htroot/IndexCreateLoaderQueue_p.java index 3a19c37fb..758dbbd3c 100644 --- a/htroot/IndexCreateLoaderQueue_p.java +++ b/htroot/IndexCreateLoaderQueue_p.java @@ -53,7 +53,7 @@ public class IndexCreateLoaderQueue_p { for (int i = 0; i < w.length; i++) { if (w[i] == null) continue; - initiator = sb.peers.getConnected(new String(w[i].initiator())); + initiator = sb.peers.getConnected((w[i].initiator() == null) ? "" : new String(w[i].initiator())); prop.put("loader-set_list_"+count+"_dark", dark ? "1" : "0"); prop.putHTML("loader-set_list_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("loader-set_list_"+count+"_depth", w[i].depth()); diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index b5e5d6ef7..0591654eb 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -99,7 +99,7 @@ public class IndexCreateWWWGlobalQueue_p { for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { urle = crawlerList.get(i); if (urle != null && urle.url() != null) { - initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator())); + initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator())); profileHandle = urle.profileHandle(); profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); prop.put("crawler-queue_list_"+showNum+"_dark", dark ? 
"1" : "0"); diff --git a/htroot/IndexCreateWWWRemoteQueue_p.java b/htroot/IndexCreateWWWRemoteQueue_p.java index 02efe8145..14de3b7c2 100644 --- a/htroot/IndexCreateWWWRemoteQueue_p.java +++ b/htroot/IndexCreateWWWRemoteQueue_p.java @@ -96,7 +96,7 @@ public class IndexCreateWWWRemoteQueue_p { for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) { urle = crawlerList.get(i); if (urle != null && urle.url() != null) { - initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator())); + initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator())); profileHandle = urle.profileHandle(); profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle); prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0"); diff --git a/htroot/Network.java b/htroot/Network.java index fff4f8c09..9dbb61cdf 100644 --- a/htroot/Network.java +++ b/htroot/Network.java @@ -36,11 +36,12 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import net.yacy.cora.protocol.Client; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.MapTools; import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.http.client.Client; +//import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; diff --git a/htroot/api/queues_p.java b/htroot/api/queues_p.java index 56397b1f0..12faf9518 100755 --- a/htroot/api/queues_p.java +++ b/htroot/api/queues_p.java @@ -57,7 +57,7 @@ public class queues_p { for (int i = 0; i < w.length; i++) { if (w[i] == null) continue; prop.put("list-loader_"+count+"_profile", w[i].profileHandle()); - initiator = sb.peers.getConnected(new String(w[i].initiator())); + initiator = sb.peers.getConnected((w[i].initiator() == null) ? 
"" : new String(w[i].initiator())); prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put("list-loader_"+count+"_depth", w[i].depth()); prop.putXML("list-loader_"+count+"_url", w[i].url().toString()); @@ -101,7 +101,7 @@ public class queues_p { for (int i = 0; i < crawlerList.size(); i++) { urle = crawlerList.get(i); if ((urle != null) && (urle.url() != null)) { - initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator())); + initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator())); prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle()); prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName())); prop.put(tableName + "_" + showNum + "_depth", urle.depth()); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index efb633c87..209f391c5 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -263,7 +263,7 @@ public class CrawlQueues { if (this.log.isFine()) log.logFine(stats + ": URL=" + urlEntry.url() - + ", initiator=" + new String(urlEntry.initiator()) + + ", initiator=" + ((urlEntry.initiator() == null) ? "" : new String(urlEntry.initiator())) + ", crawlOrder=" + ((profile.remoteIndexing()) ? 
"true" : "false") + ", depth=" + urlEntry.depth() + ", crawlDepth=" + profile.depth() diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index c9bf36f14..b6bef6f63 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -155,7 +155,7 @@ public final class CrawlStacker { public void enqueueEntry(final Request entry) { // DEBUG - if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + new String(entry.initiator()) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth()); + if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : new String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth()); if (prefetchHost(entry.url().getHost())) { try { diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 939407ff3..ac0d212e4 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -26,8 +26,8 @@ package de.anomic.crawler; -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; +//import java.io.BufferedInputStream; +//import java.io.BufferedOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; @@ -36,18 +36,20 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.Client; import net.yacy.kelondro.blob.BEncodedHeap; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.ByteBuffer; +//import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.DateFormatter; -import net.yacy.kelondro.util.FileUtils; 
+//import net.yacy.kelondro.util.FileUtils; import de.anomic.crawler.retrieval.HTTPLoader; -import de.anomic.http.client.Client; +//import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; -import de.anomic.http.server.ResponseContainer; +//import de.anomic.http.server.ResponseContainer; +import de.anomic.http.server.ResponseHeader; public class RobotsTxt { @@ -341,25 +343,35 @@ public class RobotsTxt { // setup http-client //TODO: adding Traffic statistic for robots download? - final Client client = new Client(10000, reqHeaders); - ResponseContainer res = null; +// final Client client = new Client(10000, reqHeaders); +// ResponseContainer res = null; + final Client client = new Client(); + client.setHeader(reqHeaders.entrySet()); try { - // sending the get request - res = client.GET(robotsURL.toString()); - // check for interruption if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress."); + // sending the get request +// res = client.GET(robotsURL.toString()); + robotsTxt = client.GETbytes(robotsURL.toString()); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader header = new ResponseHeader(null, client.getHeaderHashMap()); + // check the response status - if (res.getStatusLine().startsWith("2")) { - if (!res.getResponseHeader().mime().startsWith("text/plain")) { +// if (res.getStatusLine().startsWith("2")) { + if (code > 199 && code < 300) { +// if (!res.getResponseHeader().mime().startsWith("text/plain")) { + if (!header.mime().startsWith("text/plain")) { robotsTxt = null; - if (log.isFinest()) log.logFinest("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + res.getResponseHeader().mime() + "'."); +// if (log.isFinest()) log.logFinest("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + res.getResponseHeader().mime() + "'."); + if (log.isFinest()) log.logFinest("Robots.txt from 
URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'."); } else { // getting some metadata - eTag = res.getResponseHeader().containsKey(HeaderFramework.ETAG)?(res.getResponseHeader().get(HeaderFramework.ETAG)).trim():null; - lastMod = res.getResponseHeader().lastModified(); +// eTag = res.getResponseHeader().containsKey(HeaderFramework.ETAG)?(res.getResponseHeader().get(HeaderFramework.ETAG)).trim():null; +// lastMod = res.getResponseHeader().lastModified(); + eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null; + lastMod = header.lastModified(); // if the robots.txt file was not changed we break here if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) { @@ -367,25 +379,30 @@ public class RobotsTxt { return null; } - // downloading the content - final ByteBuffer sbb = new ByteBuffer(); - try { - FileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), new BufferedOutputStream(sbb)); - } finally { - res.closeStream(); - } - robotsTxt = sbb.getBytes(); +// // downloading the content +// final ByteBuffer sbb = new ByteBuffer(); +// try { +// FileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), new BufferedOutputStream(sbb)); +// } finally { +// res.closeStream(); +// } +// robotsTxt = sbb.getBytes(); downloadEnd = System.currentTimeMillis(); if (log.isFinest()) log.logFinest("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms."); } - } else if (res.getStatusCode() == 304) { +// } else if (res.getStatusCode() == 304) { + } else if (code == 304) { return null; - } else if (res.getStatusLine().startsWith("3")) { +// } else if (res.getStatusLine().startsWith("3")) { + } else if (code > 299 && code < 400) { // getting redirection URL - String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); +// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); + String 
redirectionUrlString = header.get(HeaderFramework.LOCATION); if (redirectionUrlString==null) { - if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + res.getStatusLine() + "]."); +// if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + res.getStatusLine() + "]."); + if (log.isFinest()) + log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "]."); robotsTxt = null; } else { @@ -399,20 +416,23 @@ public class RobotsTxt { "\nRedirecting request to: " + redirectionUrl); return downloadRobotsTxt(redirectionUrl,redirectionCount,entry); } - } else if (res.getStatusCode() == 401 || res.getStatusCode() == 403) { +// } else if (res.getStatusCode() == 401 || res.getStatusCode() == 403) { + } else if (code == 401 || code == 403) { accessCompletelyRestricted = true; if (log.isFinest()) log.logFinest("Access to Robots.txt not allowed on URL '" + robotsURL + "'."); } else { - if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + res.getStatusLine() + "]."); +// if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + res.getStatusLine() + "]."); + if (log.isFinest()) + log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "'. 
[" + client.getHttpResponse().getStatusLine() + "]."); robotsTxt = null; } } catch (final Exception e) { throw e; - } finally { - if(res != null) { - // release connection - res.closeStream(); - } +// } finally { +// if(res != null) { +// // release connection +// res.closeStream(); +// } } return new Object[]{Boolean.valueOf(accessCompletelyRestricted),robotsTxt,eTag,lastMod}; } diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 32e21bb88..08672c031 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -28,15 +28,17 @@ import java.io.IOException; import java.util.Date; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.protocol.Client; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.repository.Blacklist; import de.anomic.crawler.Latency; -import de.anomic.http.client.Client; +//import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; -import de.anomic.http.server.ResponseContainer; +//import de.anomic.http.server.ResponseContainer; +import de.anomic.http.server.ResponseHeader; import de.anomic.search.Segments; import de.anomic.search.Switchboard; @@ -116,21 +118,27 @@ public final class HTTPLoader { requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); // HTTP-Client - final Client client = new Client(socketTimeout, requestHeader); - - ResponseContainer res = null; - try { +// final Client client = new Client(socketTimeout, requestHeader); +// ResponseContainer res = null; + final Client client = new Client(); + client.setTimout(socketTimeout); + client.setHeader(requestHeader.entrySet()); +// try { // send request - res = client.GET(request.url().toString(), maxFileSize); +// res = client.GET(request.url().toString(), 
maxFileSize); + final byte[] responseBody = client.GETbytes(request.url().toString(), maxFileSize); + final ResponseHeader header = new ResponseHeader(null, client.getHeaderHashMap()); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true - if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { +// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { + if (responseBody != null && (code == 200 || code == 203)) { // the transfer is ok // we write the new cache entry to file system directly - res.setAccountingName("CRAWLER"); - final byte[] responseBody = res.getData(); +// TODO: res.setAccountingName("CRAWLER"); +// final byte[] responseBody = res.getData(); long contentLength = responseBody.length; // check length again in case it was not possible to get the length before loading @@ -143,17 +151,22 @@ public final class HTTPLoader { response = new Response( request, requestHeader, - res.getResponseHeader(), - res.getStatusLine(), +// res.getResponseHeader(), +// res.getStatusLine(), + header, + Integer.toString(code), sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), responseBody ); return response; - } else if (res.getStatusLine().startsWith("30")) { - if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) { +// } else if (res.getStatusLine().startsWith("30")) { +// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) { + } else if (code > 299 && code < 310) { + if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL - String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); +// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); + String redirectionUrlString = header.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); if 
(redirectionUrlString.length() == 0) { @@ -165,7 +178,8 @@ public final class HTTPLoader { final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); // restart crawling with new url - this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString()); +// this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString()); + this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString()); this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl); // if we are already doing a shutdown we don't need to retry crawling @@ -187,15 +201,17 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + res.getStatusCode() + ")"); - throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString()); - } - } finally { - if(res != null) { - // release connection - res.closeStream(); +// sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + res.getStatusCode() + ")"); +// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString()); + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")"); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } - } +// } finally { +// if(res != null) { +// // release connection +// res.closeStream(); +// } +// } return response; } @@ -233,37 +249,48 @@ public final class HTTPLoader { 
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING); // HTTP-Client - final Client client = new Client(20000, requestHeader); - - ResponseContainer res = null; - try { +// final Client client = new Client(20000, requestHeader); +// ResponseContainer res = null; + final Client client = new Client(); + client.setTimout(20000); + client.setHeader(requestHeader.entrySet()); +// try { // send request - res = client.GET(request.url().toString(), Long.MAX_VALUE); +// res = client.GET(request.url().toString(), Long.MAX_VALUE); + final byte[] responseBody = client.GETbytes(request.url().toString(), Long.MAX_VALUE); + final ResponseHeader header = new ResponseHeader(null, client.getHeaderHashMap()); + final int code = client.getHttpResponse().getStatusLine().getStatusCode(); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true - if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { +// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { + if (responseBody != null && (code == 200 || code == 203)) { // the transfer is ok // we write the new cache entry to file system directly - res.setAccountingName("CRAWLER"); - final byte[] responseBody = res.getData(); +// TODO: res.setAccountingName("CRAWLER"); +// final byte[] responseBody = res.getData(); // create a new cache entry response = new Response( request, requestHeader, - res.getResponseHeader(), - res.getStatusLine(), +// res.getResponseHeader(), +// res.getStatusLine(), + header, + Integer.toString(code), null, responseBody ); return response; - } else if (res.getStatusLine().startsWith("30")) { - if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) { +// } else if (res.getStatusLine().startsWith("30")) { +// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) { + } else if (code > 299 && code < 310) { + if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL - 
String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); +// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); + String redirectionUrlString = header.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); if (redirectionUrlString.length() == 0) { @@ -285,14 +312,15 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file - throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString()); - } - } finally { - if(res != null) { - // release connection - res.closeStream(); +// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString()); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } - } +// } finally { +// if(res != null) { +// // release connection +// res.closeStream(); +// } +// } return response; } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 185d87b0b..93c419de1 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1700,7 +1700,7 @@ public final class Switchboard extends serverSwitch { ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) + ", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) + ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) + - ", initiatorHash=" + new String(response.initiator()) + + ", initiatorHash=" + ((response.initiator() == null) ? "null" : new String(response.initiator())) + //", responseHeader=" + ((entry.responseHeader() == null) ? 
"null" : entry.responseHeader().toString()) + ", url=" + response.url()); // DEBUG @@ -2362,7 +2362,7 @@ public final class Switchboard extends serverSwitch { final long start = System.currentTimeMillis(); // header = Client.whead(url.toString(), reqHeader); client.HEADResponse(url.toString()); - header = new ResponseHeader(client.getHeaderHashMap()); + header = new ResponseHeader(null, client.getHeaderHashMap()); final long loadtime = System.currentTimeMillis() - start; // if (header == null) { if (header == null) { diff --git a/source/net/yacy/cora/protocol/Client.java b/source/net/yacy/cora/protocol/Client.java index 4aff45c25..2d66f4dab 100644 --- a/source/net/yacy/cora/protocol/Client.java +++ b/source/net/yacy/cora/protocol/Client.java @@ -79,7 +79,7 @@ public class Client { ConnManagerParams.setMaxTotalConnections(httpParams, maxcon); // for statistics same value should also be set here ConnectionInfo.setMaxcount(maxcon); - // perhaps we need more than 2(default) connections per host? 
+ // connections per host (2 default) final ConnPerRouteBean connPerRoute = new ConnPerRouteBean(2); // Increase max connections for localhost to 100 HttpHost localhost = new HttpHost("locahost"); @@ -118,6 +118,10 @@ public class Client { ClientConnectionManager clientConnectionManager = new ThreadSafeClientConnManager(httpParams, schemeRegistry); httpClient = new DefaultHttpClient(clientConnectionManager, httpParams); + // ask for gzip + ((AbstractHttpClient) httpClient).addRequestInterceptor(new GzipRequestInterceptor()); + // uncompress gzip + ((AbstractHttpClient) httpClient).addResponseInterceptor(new GzipResponseInterceptor()); idledConnectionEvictor = new IdledConnectionEvictor(clientConnectionManager); idledConnectionEvictor.start(); @@ -340,6 +344,13 @@ public class Client { return loc; } + /** + * @return the systemOST + */ + public static String getSystemOST() { + return systemOST; + } + /** * testing * @@ -371,15 +382,19 @@ public class Client { } } // Head some - try { - for (Header header: client.HEADResponse(url).getAllHeaders()) - System.out.println(header.getName() + " : " + header.getValue()); - System.out.println(client.getHttpResponse().getLocale()); - System.out.println(client.getHttpResponse().getProtocolVersion()); - System.out.println(client.getHttpResponse().getStatusLine()); - } catch (IOException e) { - e.printStackTrace(); +// try { +// client.HEADResponse(url); +// } catch (IOException e) { +// e.printStackTrace(); +// } + for (Header header: client.getHttpResponse().getAllHeaders()) { + System.out.println("Header " + header.getName() + " : " + header.getValue()); +// for (HeaderElement element: header.getElements()) +// System.out.println("Element " + element.getName() + " : " + element.getValue()); } + System.out.println(client.getHttpResponse().getLocale()); + System.out.println(client.getHttpResponse().getProtocolVersion()); + System.out.println(client.getHttpResponse().getStatusLine()); // Post some // try { // 
System.out.println(new String(client.POSTbytes(url, newparts)));
diff --git a/source/net/yacy/cora/protocol/GzipDecompressingEntity.java b/source/net/yacy/cora/protocol/GzipDecompressingEntity.java
new file mode 100644
index 000000000..4b2f3d128
--- /dev/null
+++ b/source/net/yacy/cora/protocol/GzipDecompressingEntity.java
@@ -0,0 +1,54 @@
+package net.yacy.cora.protocol;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.http.HttpEntity;
+import org.apache.http.entity.HttpEntityWrapper;
+
+/**
+ * Entity wrapper that hands out the gunzipped content of the wrapped entity.
+ */
+public class GzipDecompressingEntity extends HttpEntityWrapper {
+
+    /**
+     * @param entity the gzip-encoded entity to wrap
+     */
+    public GzipDecompressingEntity(final HttpEntity entity) {
+        super(entity);
+    }
+
+    /**
+     * Opens the wrapped entity's content and wraps it in a {@link GZIPInputStream}.
+     *
+     * @return a stream delivering the decompressed content
+     * @throws IOException if the wrapped content cannot be opened or the gzip header cannot be read
+     */
+    @Override
+    public InputStream getContent() throws IOException, IllegalStateException {
+        // the wrapped entity's getContent() decides about repeatability
+        final InputStream compressed = wrappedEntity.getContent();
+        try {
+            return new GZIPInputStream(compressed);
+        } catch (final IOException e) {
+            // the constructor already reads the gzip header; when that fails the raw
+            // stream would otherwise stay open with nobody left to close it
+            try {
+                compressed.close();
+            } catch (final IOException ignored) {
+                // the header failure is the error worth reporting
+            }
+            throw e;
+        }
+    }
+
+    /**
+     * @return always -1, the length of the ungzipped content is not known in advance
+     */
+    @Override
+    public long getContentLength() {
+        return -1;
+    }
+
+}
diff --git a/source/net/yacy/cora/protocol/GzipRequestInterceptor.java b/source/net/yacy/cora/protocol/GzipRequestInterceptor.java
new file mode 100644
index 000000000..40ae3c772
--- /dev/null
+++ b/source/net/yacy/cora/protocol/GzipRequestInterceptor.java
@@ -0,0 +1,33 @@
+package net.yacy.cora.protocol;
+
+import java.io.IOException;
+
+import org.apache.http.HttpException;
+import org.apache.http.HttpRequest;
+import org.apache.http.HttpRequestInterceptor;
+import org.apache.http.protocol.HttpContext;
+
+/**
+ * Request interceptor that asks for gzip-compressed responses unless the
+ * request already carries an Accept-Encoding header.
+ */
+public class GzipRequestInterceptor implements HttpRequestInterceptor {
+
+    private static final String ACCEPT_ENCODING = "Accept-Encoding";
+    private static final String GZIP_CODEC = "gzip";
+
+    /**
+     * Adds {@code Accept-Encoding: gzip} when no Accept-Encoding header is present.
+     *
+     * @param request the outgoing request
+     * @param context not used
+     */
+    public void process(final HttpRequest request, final HttpContext context) throws HttpException, IOException {
+        if (request.containsHeader(ACCEPT_ENCODING)) {
+            // an explicit Accept-Encoding on the request is left untouched
+            return;
+        }
+        request.addHeader(ACCEPT_ENCODING, GZIP_CODEC);
+    }
+
+}
diff --git a/source/net/yacy/cora/protocol/GzipResponseInterceptor.java b/source/net/yacy/cora/protocol/GzipResponseInterceptor.java
new file mode 100644
index 000000000..432574d44
--- /dev/null
+++ b/source/net/yacy/cora/protocol/GzipResponseInterceptor.java
@@ -0,0 +1,51 @@
+package net.yacy.cora.protocol;
+
+import java.io.IOException;
+
+import org.apache.http.Header;
+import org.apache.http.HeaderElement;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpException;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpResponseInterceptor;
+import org.apache.http.protocol.HttpContext;
+
+/**
+ * Response interceptor that swaps a gzip-encoded entity for a
+ * {@link GzipDecompressingEntity}.
+ */
+public class GzipResponseInterceptor implements HttpResponseInterceptor {
+
+    private static final String GZIP_CODEC = "gzip";
+
+    /**
+     * Wraps the response entity when its Content-Encoding lists gzip.
+     *
+     * @param response the incoming response
+     * @param context must not be null
+     * @throws IllegalArgumentException if context is null
+     */
+    public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException {
+        if (context == null) {
+            throw new IllegalArgumentException("HTTP context may not be null");
+        }
+        final HttpEntity entity = response.getEntity();
+        if (entity == null) {
+            return;
+        }
+        final Header ceheader = entity.getContentEncoding();
+        if (ceheader == null) {
+            return;
+        }
+        for (final HeaderElement codec : ceheader.getElements()) {
+            if (codec.getName().equalsIgnoreCase(GZIP_CODEC)) {
+                // NOTE(review): the response headers are not touched here, so the
+                // Content-Encoding header stays on the response - confirm that the
+                // consumers of the header map expect that
+                response.setEntity(new GzipDecompressingEntity(entity));
+                return;
+            }
+        }
+    }
+
+}