... migrating to HttpComponents-Client-4.x ...

(gzip decompression, httploader, robots, ...)

+ enable proxy-crawling while logging works fine

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7001 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
sixcooler 15 years ago
parent a55af783bf
commit 15e8c13526

@ -53,7 +53,7 @@ public class IndexCreateLoaderQueue_p {
for (int i = 0; i < w.length; i++) {
if (w[i] == null) continue;
initiator = sb.peers.getConnected(new String(w[i].initiator()));
initiator = sb.peers.getConnected((w[i].initiator() == null) ? "" : new String(w[i].initiator()));
prop.put("loader-set_list_"+count+"_dark", dark ? "1" : "0");
prop.putHTML("loader-set_list_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("loader-set_list_"+count+"_depth", w[i].depth());

@ -99,7 +99,7 @@ public class IndexCreateWWWGlobalQueue_p {
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator()));
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");

@ -96,7 +96,7 @@ public class IndexCreateWWWRemoteQueue_p {
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator()));
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = (profileHandle == null) ? null : sb.crawler.profilesActiveCrawls.getEntry(profileHandle);
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");

@ -36,11 +36,12 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.protocol.Client;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.MapTools;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.client.Client;
//import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;

@ -57,7 +57,7 @@ public class queues_p {
for (int i = 0; i < w.length; i++) {
if (w[i] == null) continue;
prop.put("list-loader_"+count+"_profile", w[i].profileHandle());
initiator = sb.peers.getConnected(new String(w[i].initiator()));
initiator = sb.peers.getConnected((w[i].initiator() == null) ? "" : new String(w[i].initiator()));
prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("list-loader_"+count+"_depth", w[i].depth());
prop.putXML("list-loader_"+count+"_url", w[i].url().toString());
@ -101,7 +101,7 @@ public class queues_p {
for (int i = 0; i < crawlerList.size(); i++) {
urle = crawlerList.get(i);
if ((urle != null) && (urle.url() != null)) {
initiator = sb.peers.getConnected(urle.initiator() == null ? "" : new String(urle.initiator()));
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : new String(urle.initiator()));
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());
prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put(tableName + "_" + showNum + "_depth", urle.depth());

@ -263,7 +263,7 @@ public class CrawlQueues {
if (this.log.isFine())
log.logFine(stats + ": URL=" + urlEntry.url()
+ ", initiator=" + new String(urlEntry.initiator())
+ ", initiator=" + ((urlEntry.initiator() == null) ? "" : new String(urlEntry.initiator()))
+ ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
+ ", depth=" + urlEntry.depth()
+ ", crawlDepth=" + profile.depth()

@ -155,7 +155,7 @@ public final class CrawlStacker {
public void enqueueEntry(final Request entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + new String(entry.initiator()) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : new String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
if (prefetchHost(entry.url().getHost())) {
try {

@ -26,8 +26,8 @@
package de.anomic.crawler;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
//import java.io.BufferedInputStream;
//import java.io.BufferedOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
@ -36,18 +36,20 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Client;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
//import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
//import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.client.Client;
//import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseContainer;
//import de.anomic.http.server.ResponseContainer;
import de.anomic.http.server.ResponseHeader;
public class RobotsTxt {
@ -341,25 +343,35 @@ public class RobotsTxt {
// setup http-client
//TODO: adding Traffic statistic for robots download?
final Client client = new Client(10000, reqHeaders);
ResponseContainer res = null;
// final Client client = new Client(10000, reqHeaders);
// ResponseContainer res = null;
final Client client = new Client();
client.setHeader(reqHeaders.entrySet());
try {
// sending the get request
res = client.GET(robotsURL.toString());
// check for interruption
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress.");
// sending the get request
// res = client.GET(robotsURL.toString());
robotsTxt = client.GETbytes(robotsURL.toString());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader header = new ResponseHeader(null, client.getHeaderHashMap());
// check the response status
if (res.getStatusLine().startsWith("2")) {
if (!res.getResponseHeader().mime().startsWith("text/plain")) {
// if (res.getStatusLine().startsWith("2")) {
if (code > 199 && code < 300) {
// if (!res.getResponseHeader().mime().startsWith("text/plain")) {
if (!header.mime().startsWith("text/plain")) {
robotsTxt = null;
if (log.isFinest()) log.logFinest("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + res.getResponseHeader().mime() + "'.");
// if (log.isFinest()) log.logFinest("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + res.getResponseHeader().mime() + "'.");
if (log.isFinest()) log.logFinest("Robots.txt from URL '" + robotsURL + "' has wrong mimetype '" + header.mime() + "'.");
} else {
// getting some metadata
eTag = res.getResponseHeader().containsKey(HeaderFramework.ETAG)?(res.getResponseHeader().get(HeaderFramework.ETAG)).trim():null;
lastMod = res.getResponseHeader().lastModified();
// eTag = res.getResponseHeader().containsKey(HeaderFramework.ETAG)?(res.getResponseHeader().get(HeaderFramework.ETAG)).trim():null;
// lastMod = res.getResponseHeader().lastModified();
eTag = header.containsKey(HeaderFramework.ETAG)?(header.get(HeaderFramework.ETAG)).trim():null;
lastMod = header.lastModified();
// if the robots.txt file was not changed we break here
if ((eTag != null) && (oldEtag != null) && (eTag.equals(oldEtag))) {
@ -367,25 +379,30 @@ public class RobotsTxt {
return null;
}
// downloading the content
final ByteBuffer sbb = new ByteBuffer();
try {
FileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), new BufferedOutputStream(sbb));
} finally {
res.closeStream();
}
robotsTxt = sbb.getBytes();
// // downloading the content
// final ByteBuffer sbb = new ByteBuffer();
// try {
// FileUtils.copyToStream(new BufferedInputStream(res.getDataAsStream()), new BufferedOutputStream(sbb));
// } finally {
// res.closeStream();
// }
// robotsTxt = sbb.getBytes();
downloadEnd = System.currentTimeMillis();
if (log.isFinest()) log.logFinest("Robots.txt successfully loaded from URL '" + robotsURL + "' in " + (downloadEnd-downloadStart) + " ms.");
}
} else if (res.getStatusCode() == 304) {
// } else if (res.getStatusCode() == 304) {
} else if (code == 304) {
return null;
} else if (res.getStatusLine().startsWith("3")) {
// } else if (res.getStatusLine().startsWith("3")) {
} else if (code > 299 && code < 400) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
if (redirectionUrlString==null) {
if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + res.getStatusLine() + "].");
// if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + res.getStatusLine() + "].");
if (log.isFinest())
log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "' because of missing redirecton header. [" + client.getHttpResponse().getStatusLine() + "].");
robotsTxt = null;
} else {
@ -399,20 +416,23 @@ public class RobotsTxt {
"\nRedirecting request to: " + redirectionUrl);
return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
}
} else if (res.getStatusCode() == 401 || res.getStatusCode() == 403) {
// } else if (res.getStatusCode() == 401 || res.getStatusCode() == 403) {
} else if (code == 401 || code == 403) {
accessCompletelyRestricted = true;
if (log.isFinest()) log.logFinest("Access to Robots.txt not allowed on URL '" + robotsURL + "'.");
} else {
if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + res.getStatusLine() + "].");
// if (log.isFinest()) log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + res.getStatusLine() + "].");
if (log.isFinest())
log.logFinest("robots.txt could not be downloaded from URL '" + robotsURL + "'. [" + client.getHttpResponse().getStatusLine() + "].");
robotsTxt = null;
}
} catch (final Exception e) {
throw e;
} finally {
if(res != null) {
// release connection
res.closeStream();
}
// } finally {
// if(res != null) {
// // release connection
// res.closeStream();
// }
}
return new Object[]{Boolean.valueOf(accessCompletelyRestricted),robotsTxt,eTag,lastMod};
}

@ -28,15 +28,17 @@ import java.io.IOException;
import java.util.Date;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Client;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.Latency;
import de.anomic.http.client.Client;
//import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseContainer;
//import de.anomic.http.server.ResponseContainer;
import de.anomic.http.server.ResponseHeader;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
@ -116,21 +118,27 @@ public final class HTTPLoader {
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
// HTTP-Client
final Client client = new Client(socketTimeout, requestHeader);
ResponseContainer res = null;
try {
// final Client client = new Client(socketTimeout, requestHeader);
// ResponseContainer res = null;
final Client client = new Client();
client.setTimout(socketTimeout);
client.setHeader(requestHeader.entrySet());
// try {
// send request
res = client.GET(request.url().toString(), maxFileSize);
// res = client.GET(request.url().toString(), maxFileSize);
final byte[] responseBody = client.GETbytes(request.url().toString(), maxFileSize);
final ResponseHeader header = new ResponseHeader(null, client.getHeaderHashMap());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
if (responseBody != null && (code == 200 || code == 203)) {
// the transfer is ok
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
// TODO: res.setAccountingName("CRAWLER");
// final byte[] responseBody = res.getData();
long contentLength = responseBody.length;
// check length again in case it was not possible to get the length before loading
@ -143,17 +151,22 @@ public final class HTTPLoader {
response = new Response(
request,
requestHeader,
res.getResponseHeader(),
res.getStatusLine(),
// res.getResponseHeader(),
// res.getStatusLine(),
header,
Integer.toString(code),
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
responseBody
);
return response;
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
// } else if (res.getStatusLine().startsWith("30")) {
// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
} else if (code > 299 && code < 310) {
if (header.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
@ -165,7 +178,8 @@ public final class HTTPLoader {
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString());
// this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString());
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// if we are already doing a shutdown we don't need to retry crawling
@ -187,15 +201,17 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
}
} finally {
if(res != null) {
// release connection
res.closeStream();
// sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "wrong http status code " + code + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
}
// } finally {
// if(res != null) {
// // release connection
// res.closeStream();
// }
// }
return response;
}
@ -233,37 +249,48 @@ public final class HTTPLoader {
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING);
// HTTP-Client
final Client client = new Client(20000, requestHeader);
ResponseContainer res = null;
try {
// final Client client = new Client(20000, requestHeader);
// ResponseContainer res = null;
final Client client = new Client();
client.setTimout(20000);
client.setHeader(requestHeader.entrySet());
// try {
// send request
res = client.GET(request.url().toString(), Long.MAX_VALUE);
// res = client.GET(request.url().toString(), Long.MAX_VALUE);
final byte[] responseBody = client.GETbytes(request.url().toString(), Long.MAX_VALUE);
final ResponseHeader header = new ResponseHeader(null, client.getHeaderHashMap());
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
if (responseBody != null && (code == 200 || code == 203)) {
// the transfer is ok
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
// TODO: res.setAccountingName("CRAWLER");
// final byte[] responseBody = res.getData();
// create a new cache entry
response = new Response(
request,
requestHeader,
res.getResponseHeader(),
res.getStatusLine(),
// res.getResponseHeader(),
// res.getStatusLine(),
header,
Integer.toString(code),
null,
responseBody
);
return response;
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
// } else if (res.getStatusLine().startsWith("30")) {
// if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
} else if (code > 299 && code < 310) {
if (header.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
// String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
String redirectionUrlString = header.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
@ -285,14 +312,15 @@ public final class HTTPLoader {
}
} else {
// if the response has not the right response type then reject file
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
}
} finally {
if(res != null) {
// release connection
res.closeStream();
// throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
}
// } finally {
// if(res != null) {
// // release connection
// res.closeStream();
// }
// }
return response;
}

@ -1700,7 +1700,7 @@ public final class Switchboard extends serverSwitch {
", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth())) +
", must-match=" + ((response.profile() == null) ? "null" : response.profile().mustMatchPattern().toString()) +
", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().mustNotMatchPattern().toString()) +
", initiatorHash=" + new String(response.initiator()) +
", initiatorHash=" + ((response.initiator() == null) ? "null" : new String(response.initiator())) +
//", responseHeader=" + ((entry.responseHeader() == null) ? "null" : entry.responseHeader().toString()) +
", url=" + response.url()); // DEBUG
@ -2362,7 +2362,7 @@ public final class Switchboard extends serverSwitch {
final long start = System.currentTimeMillis();
// header = Client.whead(url.toString(), reqHeader);
client.HEADResponse(url.toString());
header = new ResponseHeader(client.getHeaderHashMap());
header = new ResponseHeader(null, client.getHeaderHashMap());
final long loadtime = System.currentTimeMillis() - start;
// if (header == null) {
if (header == null) {

@ -79,7 +79,7 @@ public class Client {
ConnManagerParams.setMaxTotalConnections(httpParams, maxcon);
// for statistics same value should also be set here
ConnectionInfo.setMaxcount(maxcon);
// perhaps we need more than 2(default) connections per host?
// connections per host (2 default)
final ConnPerRouteBean connPerRoute = new ConnPerRouteBean(2);
// Increase max connections for localhost to 100
HttpHost localhost = new HttpHost("locahost");
@ -118,6 +118,10 @@ public class Client {
ClientConnectionManager clientConnectionManager = new ThreadSafeClientConnManager(httpParams, schemeRegistry);
httpClient = new DefaultHttpClient(clientConnectionManager, httpParams);
// ask for gzip
((AbstractHttpClient) httpClient).addRequestInterceptor(new GzipRequestInterceptor());
// uncompress gzip
((AbstractHttpClient) httpClient).addResponseInterceptor(new GzipResponseInterceptor());
idledConnectionEvictor = new IdledConnectionEvictor(clientConnectionManager);
idledConnectionEvictor.start();
@ -340,6 +344,13 @@ public class Client {
return loc;
}
/**
* @return the systemOST
*/
public static String getSystemOST() {
return systemOST;
}
/**
* testing
*
@ -371,15 +382,19 @@ public class Client {
}
}
// Head some
try {
for (Header header: client.HEADResponse(url).getAllHeaders())
System.out.println(header.getName() + " : " + header.getValue());
System.out.println(client.getHttpResponse().getLocale());
System.out.println(client.getHttpResponse().getProtocolVersion());
System.out.println(client.getHttpResponse().getStatusLine());
} catch (IOException e) {
e.printStackTrace();
// try {
// client.HEADResponse(url);
// } catch (IOException e) {
// e.printStackTrace();
// }
for (Header header: client.getHttpResponse().getAllHeaders()) {
System.out.println("Header " + header.getName() + " : " + header.getValue());
// for (HeaderElement element: header.getElements())
// System.out.println("Element " + element.getName() + " : " + element.getValue());
}
System.out.println(client.getHttpResponse().getLocale());
System.out.println(client.getHttpResponse().getProtocolVersion());
System.out.println(client.getHttpResponse().getStatusLine());
// Post some
// try {
// System.out.println(new String(client.POSTbytes(url, newparts)));

@ -0,0 +1,29 @@
package net.yacy.cora.protocol;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import org.apache.http.HttpEntity;
import org.apache.http.entity.HttpEntityWrapper;
public class GzipDecompressingEntity extends HttpEntityWrapper {
public GzipDecompressingEntity(final HttpEntity entity) {
super(entity);
}
public InputStream getContent() throws IOException, IllegalStateException {
// the wrapped entity's getContent() decides about repeatability
InputStream wrappedin = wrappedEntity.getContent();
return new GZIPInputStream(wrappedin);
}
public long getContentLength() {
// length of ungzipped content not known in advance
return -1;
}
}

@ -0,0 +1,21 @@
package net.yacy.cora.protocol;
import java.io.IOException;
import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.protocol.HttpContext;
public class GzipRequestInterceptor implements HttpRequestInterceptor {

    private static final String ACCEPT_ENCODING = "Accept-Encoding";
    private static final String GZIP_CODEC = "gzip";

    /**
     * Advertises gzip support to the server by adding an Accept-Encoding
     * header, unless the request already carries one.
     */
    public void process(final HttpRequest request, final HttpContext context) throws HttpException, IOException {
        final boolean alreadyNegotiated = request.containsHeader(ACCEPT_ENCODING);
        if (alreadyNegotiated) {
            return;
        }
        request.addHeader(ACCEPT_ENCODING, GZIP_CODEC);
    }
}

@ -0,0 +1,37 @@
package net.yacy.cora.protocol;
import java.io.IOException;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.protocol.HttpContext;
public class GzipResponseInterceptor implements HttpResponseInterceptor {

    private static final String GZIP_CODEC = "gzip";

    /**
     * Replaces the response entity with a {@link GzipDecompressingEntity}
     * when the server declared a gzip Content-Encoding, so that downstream
     * consumers read plain (decompressed) content transparently.
     *
     * @throws IllegalArgumentException if the HTTP context is null
     */
    public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException {
        if (context == null) {
            throw new IllegalArgumentException("HTTP context may not be null");
        }
        final HttpEntity entity = response.getEntity();
        if (entity == null) {
            return;
        }
        final Header ceheader = entity.getContentEncoding();
        if (ceheader == null) {
            return;
        }
        // inspect every declared codec; wrap and stop at the first gzip match
        for (final HeaderElement codec : ceheader.getElements()) {
            // constant-first comparison is null-safe should getName() return null
            if (GZIP_CODEC.equalsIgnoreCase(codec.getName())) {
                response.setEntity(new GzipDecompressingEntity(response.getEntity()));
                return;
            }
        }
    }
}
Loading…
Cancel
Save