You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/de/anomic/http/JakartaCommonsHttpClient.java

619 lines
23 KiB

// JakartaCommonsHttpClient.java
// (C) 2008 by Daniel Raap; danielr@users.berlios.de
// first published 2.4.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $
// $LastChangedRevision: 4558 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.http;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.httpclient.ConnectMethod;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.ByteArrayRequestEntity;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.HeadMethod;
import org.apache.commons.httpclient.methods.InputStreamRequestEntity;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.RequestEntity;
import org.apache.commons.httpclient.methods.multipart.ByteArrayPartSource;
import org.apache.commons.httpclient.methods.multipart.FilePart;
import org.apache.commons.httpclient.methods.multipart.MultipartRequestEntity;
import org.apache.commons.httpclient.methods.multipart.Part;
import org.apache.commons.httpclient.params.DefaultHttpParams;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;
import org.apache.commons.httpclient.util.DateUtil;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.logging.serverLog;
/**
* HttpClient implementation which uses Jakarta Commons HttpClient 3.x
* (see <a href="http://hc.apache.org/httpclient-3.x/">http://hc.apache.org/httpclient-3.x/</a>).
*
* @author danielr
*
*/
public class JakartaCommonsHttpClient {

    /**
     * "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency."
     * (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html)
     */
    private final static MultiThreadedHttpConnectionManager conManager = new MultiThreadedHttpConnectionManager();

    /** the one client (backed by {@link #conManager}) that is used by every instance of this class */
    private final static HttpClient apacheHttpClient = new HttpClient(conManager);

    /**
     * suffix which is appended to every user agent: the library's default user agent with ';' replaced by ':'
     *
     * last ; must be before location (this is parsed)
     */
    private final static String jakartaUserAgent = " " +
            ((String) DefaultHttpParams.getDefaultParams().getParameter(HttpMethodParams.USER_AGENT)).replace(';', ':');

    static {
        /**
         * set options for client
         */
        // simple user agent
        setUserAgent("yacy (www.yacy.net; " + de.anomic.http.HttpClient.getSystemOST() + ")");
        // only one retry
        apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler(1, false));

        /**
         * set options for connection manager
         */
        conManager.getParams().setMaxTotalConnections(200); // Proxy may need many connections
        conManager.getParams().setConnectionTimeout(60000); // set a default timeout
        conManager.getParams().setDefaultMaxConnectionsPerHost(20); // prevent DoS by mistake (library default: 2)

        // TODO should this be configurable?
        // accept self-signed or untrusted certificates
        Protocol.registerProtocol("https", new Protocol("https",
                (ProtocolSocketFactory) new AcceptEverythingSSLProtcolSocketFactory(), 443));

        /**
         * set network timeout properties. see: http://java.sun.com/j2se/1.5.0/docs/guide/net/properties.html These
         * properties specify the default connect and read timeout (resp.) for the protocol handler used by
         * java.net.URLConnection. see also
         * http://hc.apache.org/httpclient-3.x/apidocs/org/apache/commons/httpclient/util/HttpURLConnection.html
         *
         * NOTE: these are JVM-wide system properties, they are not limited to this class.
         */
        // specify the timeout, in milliseconds, to establish the connection to the host.
        // For HTTP connections, it is the timeout when establishing the connection to the HTTP server.
        System.setProperty("sun.net.client.defaultConnectTimeout", "10000");
        // specify the response timeout, in milliseconds, when reading from an input stream
        // after a connection is established with a resource
        System.setProperty("sun.net.client.defaultReadTimeout", "60000");
    }

    /**
     * every x milliseconds do a cleanup (close old connections)
     *
     * minimal interval between two cleanups (within this time after a cleanup no second one is done)
     *
     * this is the time the method is callable, not the time it is called
     */
    private static final int cleanupIntervall = 60000;

    /**
     * close connections when they are not used for this time
     *
     * or otherwise: hold connections this time open to reuse them
     */
    private static final long closeConnectionsAfterMillis = 120000;

    /**
     * time the last cleanup was started
     */
    private static long lastCleanup = 0;

    /** headers which are set on every request of this instance */
    private Header[] headers = new Header[0];

    /** client specific proxy configuration, null if the default settings should be used */
    private httpRemoteProxyConfig proxyConfig = null;

    /** should GET and HEAD requests follow redirects automatically? */
    private boolean followRedirects = true;

    /** if true, cookies are neither accepted nor sent */
    private boolean ignoreCookies = false;

    /**
     * socket timeout in milliseconds for the requests of this instance
     *
     * It is kept per instance and applied to each request, because the underlying client is shared by all
     * instances: writing it into the shared client would change the timeout of every other instance, too.
     */
    private int timeout = 0;

    /**
     * constructs a new Client with given parameters
     *
     * @param timeout socket timeout in milliseconds
     * @param header headers which are sent with every request, may be null
     * @param proxyConfig proxy to use, may be null (then the default settings are used)
     */
    public JakartaCommonsHttpClient(final int timeout, final httpHeader header, final httpRemoteProxyConfig proxyConfig) {
        super();
        setTimeout(timeout);
        setHeader(header);
        setProxy(proxyConfig);
    }

    /**
     * sets a client specific proxy
     *
     * The configuration is only taken over if it is not null and its useProxy() returns true. Otherwise the
     * call has no effect: a previously set proxy configuration is kept.
     *
     * @param proxyConfig the proxy configuration to use
     * @see de.anomic.http.HttpClient#setProxy(de.anomic.http.httpRemoteProxyConfig)
     */
    public void setProxy(final httpRemoteProxyConfig proxyConfig) {
        if (proxyConfig != null && proxyConfig.useProxy()) {
            this.proxyConfig = proxyConfig;
        }
    }

    /**
     * sets the headers which are sent with every request of this instance
     *
     * @param header yacy-header, null removes all instance headers
     * @see de.anomic.http.HttpClient#setHeader(de.anomic.http.httpHeader)
     */
    public void setHeader(final httpHeader header) {
        headers = convertHeaders(header);
    }

    /**
     * sets the socket timeout for the requests of this instance
     *
     * The value is applied to every request when it is executed. Other instances are not affected.
     *
     * @param timeout in milliseconds
     * @see de.anomic.http.HttpClient#setTimeout(int)
     */
    public void setTimeout(final int timeout) {
        this.timeout = timeout;
    }

    /**
     * should redirects automatically be followed?
     *
     * This only applies to GET and HEAD, the other request types never follow redirects.
     *
     * @param follow
     */
    public void setFollowRedirects(final boolean follow) {
        followRedirects = follow;
    }

    /**
     * <p>by default Cookies are accepted and used automatically</p>
     *
     * <q cite="http://hc.apache.org/httpclient-3.x/cookies.html">HttpClient supports automatic management of cookies, including allowing the server to set cookies and
     * automatically return them to the server when required.</q>
     * <cite>HttpClient Cookie Guide</cite>
     *
     * @param ignoreCookies true to switch the cookie handling off for this instance
     */
    public void setIgnoreCookies(final boolean ignoreCookies) {
        this.ignoreCookies = ignoreCookies;
    }

    /**
     * @return the user agent which is currently set on the shared client
     * @see de.anomic.http.HttpClient#getUserAgent()
     */
    public String getUserAgent() {
        return getCurrentUserAgent();
    }

    /**
     * This method GETs a page from the server.
     *
     * @param uri The URI to the page which should be GET.
     * @return Instance of response with the content.
     * @throws IOException
     */
    public JakartaCommonsHttpResponse GET(final String uri) throws IOException {
        assert uri != null : "precondition violated: uri != null";
        final HttpMethod get = new GetMethod(uri);
        get.setFollowRedirects(followRedirects);
        return execute(get);
    }

    /**
     * This method gets only the header of a page.
     *
     * @param uri The URI to the page whose header should be get.
     * @return Instance of response with the content.
     * @throws IOException
     */
    public JakartaCommonsHttpResponse HEAD(final String uri) throws IOException {
        assert uri != null : "precondition violated: uri != null";
        final HttpMethod head = new HeadMethod(uri);
        head.setFollowRedirects(followRedirects);
        return execute(head);
    }

    /**
     * This method POSTs some data from an InputStream to a page.
     *
     * This is for compatibility (an InputStream does not need to contain correct HTTP!)
     *
     * @param uri The URI to the page which the post is sent to.
     * @param ins InputStream with the data to be posted to the server.
     * @return Instance of response with the content.
     * @throws IOException
     */
    public JakartaCommonsHttpResponse POST(final String uri, final InputStream ins) throws IOException {
        assert uri != null : "precondition violated: uri != null";
        assert ins != null : "precondition violated: ins != null";
        final PostMethod post = new PostMethod(uri);
        post.setRequestEntity(new InputStreamRequestEntity(ins));
        // redirects in POST cause a "Entity enclosing requests cannot be redirected without user intervention" -
        // exception
        post.setFollowRedirects(false);
        return execute(post);
    }

    /**
     * This method sends several data at once via a POST request (multipart-message), uncompressed
     *
     * @param uri The URI to the page which the post is sent to.
     * @param multiparts {@link java.util.List} with the {@link Part}s of data
     * @return Instance of response with the content.
     * @throws IOException
     */
    public JakartaCommonsHttpResponse POST(final String uri, final List<Part> multiparts) throws IOException {
        return POST(uri, multiparts, false);
    }

    /**
     * This method sends several data at once via a POST request (multipart-message), maybe compressed
     *
     * @param uri The URI to the page which the post is sent to.
     * @param multiparts {@link java.util.List} with the {@link Part}s of data, null is treated like an empty list
     * @param gzipBody should the body be compressed
     * @return Instance of response with the content.
     * @throws IOException
     */
    public JakartaCommonsHttpResponse POST(final String uri, final List<Part> multiparts, final boolean gzipBody)
            throws IOException {
        assert uri != null : "precondition violated: uri != null";
        final PostMethod post = new PostMethod(uri);

        final Part[] parts;
        if (multiparts != null) {
            parts = multiparts.toArray(new Part[0]);
        } else {
            // nothing to POST
            parts = new Part[0];
        }

        RequestEntity data = new MultipartRequestEntity(parts, post.getParams());
        if (gzipBody) {
            // cache data and gzip it
            final ByteArrayOutputStream zippedBytes = new ByteArrayOutputStream();
            final GZIPOutputStream toZip = new GZIPOutputStream(zippedBytes);
            try {
                data.writeRequest(toZip);
                toZip.finish();
            } finally {
                // always close the stream, so the compressor does not have to wait for the finalizer
                toZip.close();
            }
            // use compressed data as body, the content type stays the one of the multipart entity
            data = new ByteArrayRequestEntity(zippedBytes.toByteArray(), data.getContentType());
            post.setRequestHeader(httpHeader.CONTENT_ENCODING, httpHeader.CONTENT_ENCODING_GZIP);
        }
        post.setRequestEntity(data);

        // redirects in POST cause a "Entity enclosing requests cannot be redirected without user intervention" -
        // exception
        post.setFollowRedirects(false);
        return execute(post);
    }

    /**
     * This method sends a CONNECT request for the given host and port.
     *
     * @param host the host to connect to
     * @param port the port to connect to
     * @return Instance of response with the content.
     * @throws IOException
     * @see de.anomic.http.HttpClient#CONNECT(java.lang.String, int, de.anomic.http.httpHeader)
     */
    public JakartaCommonsHttpResponse CONNECT(final String host, final int port) throws IOException {
        assert host != null : "precondition violated: host != null";
        final HostConfiguration hostConfig = new HostConfiguration();
        hostConfig.setHost(host, port);
        final HttpMethod connect = new ConnectMethod(hostConfig);
        connect.setFollowRedirects(false); // there are no redirects possible for CONNECT commands as far as I know.
        return execute(connect);
    }

    /**
     * adds the yacy-header to the method
     *
     * @param requestHeader headers to add, null adds nothing
     * @param method must not be null
     */
    public void addHeader(final httpHeader requestHeader, final HttpMethod method) {
        assert method != null : "precondition violated: method != null";
        if (requestHeader != null) {
            addHeaders(convertHeaders(requestHeader), method);
        }
    }

    /**
     * adds every Header in the array to the method
     *
     * @param requestHeaders headers to add, null adds nothing
     * @param method must not be null
     * @throws NullPointerException if method is null
     */
    private static void addHeaders(final Header[] requestHeaders, final HttpMethod method) {
        if (method == null) {
            throw new NullPointerException("method not set");
        }
        if (requestHeaders != null) {
            for (final Header header : requestHeaders) {
                method.addRequestHeader(header);
            }
        }
    }

    /**
     * convert from yacy-header to apache.commons.httpclient.Header
     *
     * @param requestHeader yacy-header, may be null
     * @return one Header for each key of requestHeader, an empty array if requestHeader is null
     */
    private static Header[] convertHeaders(final httpHeader requestHeader) {
        if (requestHeader == null) {
            return new Header[0];
        }
        final Header[] converted = new Header[requestHeader.size()];
        int i = 0;
        for (final String name : requestHeader.keySet()) {
            converted[i] = new Header(name, requestHeader.get(name));
            i++;
        }
        return converted;
    }

    /**
     * executes a method
     *
     * Applies the settings of this instance (timeout, cookie policy, headers, proxy) to the method, registers
     * the connection for the statistics and sends the request with the shared client.
     *
     * If sending fails, the statistics entry is removed and the connection of the method is released before
     * the exception is passed on.
     *
     * @param method must not be null
     * @return Instance of response which wraps the executed method
     * @throws IOException
     * @throws HttpException
     */
    private JakartaCommonsHttpResponse execute(final HttpMethod method) throws IOException, HttpException {
        assert method != null : "precondition violated: method != null";

        // timeout of this instance (set on the method, the shared client is left untouched)
        method.getParams().setSoTimeout(timeout);

        // ignore cookies
        if (ignoreCookies) {
            method.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
        }

        // set header
        for (final Header header : headers) {
            method.setRequestHeader(header);
        }

        // set proxy
        final httpRemoteProxyConfig hostProxyConfig = getProxyConfig(method.getURI().getHost());
        addProxyAuth(method, hostProxyConfig);
        final HostConfiguration hostConfig = getProxyHostConfig(hostProxyConfig);

        // statistics
        HttpConnectionInfo.addConnection(generateConInfo(method));

        // execute (send request)
        serverLog.logFine("HTTPC", "executing " + method.hashCode() + " " + method.getName() + " " + method.getURI());
        serverLog.logFinest("HTTPC", "->" + method.hashCode() + " request headers " +
                Arrays.toString(method.getRequestHeaders()));
        boolean executed = false;
        try {
            if (hostConfig == null) {
                apacheHttpClient.executeMethod(method);
            } else {
                apacheHttpClient.executeMethod(hostConfig, method);
            }
            executed = true;
        } finally {
            if (!executed) {
                // failed (checked or unchecked exception): cleanUp statistics and give the connection back
                HttpConnectionInfo.removeConnection(generateConInfo(method));
                method.releaseConnection();
            }
        }
        serverLog.logFinest("HTTPC", "<-" + method.hashCode() + " response headers " +
                Arrays.toString(method.getResponseHeaders()));

        // return response
        return new JakartaCommonsHttpResponse(method);
    }

    /**
     * creates the statistics entry for a method
     *
     * The entry consists of the protocol, the host (with port, if it is set and not 80), the request line
     * (name, path and query), the hashCode of the method as id and the current time.
     *
     * @param method
     * @return a new info object for the method
     */
    private HttpConnectionInfo generateConInfo(final HttpMethod method) {
        int port = 80;
        String host = null;
        String protocol = null;
        try {
            port = method.getURI().getPort();
            host = method.getURI().getHost();
            protocol = method.getURI().getScheme();
        } catch (final URIException ignored) {
            // should not happen, because method is already executed
            // (if it does, the defaults from above are used for the values which are not read yet)
        }
        final String query = method.getQueryString() != null ? "?" + method.getQueryString() : "";
        final String target = (port == -1 || port == 80) ? host : host + ":" + port;
        final String command = method.getName() + " " + method.getPath() + query;
        return new HttpConnectionInfo(protocol, target, command, method.hashCode(), System.currentTimeMillis());
    }

    /**
     * if necessary adds a header for proxy-authentication
     *
     * The header is only set if the proxy should be used and a non-empty user name is configured. Colons are
     * removed from the user name (it is the separator of user and password), a missing password is sent as
     * empty password.
     *
     * @param method
     * @param hostProxyConfig may be null
     */
    private void addProxyAuth(final HttpMethod method, final httpRemoteProxyConfig hostProxyConfig) {
        if (hostProxyConfig == null || !hostProxyConfig.useProxy()) {
            return;
        }
        final String remoteProxyUser = hostProxyConfig.getProxyUser();
        if (remoteProxyUser == null || remoteProxyUser.length() == 0) {
            return;
        }
        if (remoteProxyUser.contains(":")) {
            serverLog.logWarning("HTTPC", "Proxy authentication contains invalid characters, trying anyway");
        }
        // do not send the literal "null" as password
        final String remoteProxyPwd = hostProxyConfig.getProxyPwd() == null ? "" : hostProxyConfig.getProxyPwd();
        final String credentials = kelondroBase64Order.standardCoder.encodeString(
                remoteProxyUser.replace(":", "") + ":" + remoteProxyPwd);
        method.setRequestHeader(httpHeader.PROXY_AUTHORIZATION, "Basic " + credentials);
    }

    /**
     * asks the proxy handler for the proxy configuration for a host
     *
     * @param hostname
     * @return result of httpdProxyHandler.getProxyConfig, called with the client specific configuration if one
     *         is set, with 0 (default settings) otherwise
     */
    private httpRemoteProxyConfig getProxyConfig(final String hostname) {
        if (proxyConfig != null) {
            // client specific
            return httpdProxyHandler.getProxyConfig(hostname, proxyConfig);
        }
        // default settings
        return httpdProxyHandler.getProxyConfig(hostname, 0);
    }

    /**
     * @param hostProxyConfig may be null
     * @return current host-config with additional proxy set or null if no proxy should be used
     */
    private HostConfiguration getProxyHostConfig(final httpRemoteProxyConfig hostProxyConfig) {
        if (hostProxyConfig == null || !hostProxyConfig.useProxy()) {
            return null;
        }
        // new config based on client (default)
        final HostConfiguration hostConfig = new HostConfiguration(apacheHttpClient.getHostConfiguration());
        // add proxy
        hostConfig.setProxy(hostProxyConfig.getProxyHost(), hostProxyConfig.getProxyPort());
        return hostConfig;
    }

    /**
     * Returns the given date in an HTTP-usable format. (according to RFC1123/RFC822)
     *
     * @param date The Date-Object to be converted.
     * @return String with the date, an empty String if date is null.
     */
    public static String date2String(final Date date) {
        if (date == null) {
            return "";
        }
        return DateUtil.formatDate(date);
    }

    /**
     * close all connections
     *
     * The connection manager is shut down, too.
     */
    public static void closeAllConnections() {
        conManager.closeIdleConnections(1);
        conManager.shutdown();
    }

    /**
     * gets the maximum number of connections allowed
     *
     * @return the maximum of total connections of the connection manager
     */
    public static int maxConnections() {
        return conManager.getParams().getMaxTotalConnections();
    }

    /**
     * test
     *
     * usage: &lt;url&gt; [post|head]
     *
     * @param args the URL (http:// is prepended if no http(s) scheme is given) and optionally the request type
     */
    public static void main(final String[] args) {
        if (args.length == 0) {
            System.err.println("usage: JakartaCommonsHttpClient <url> [post|head]");
            return;
        }
        JakartaCommonsHttpResponse resp = null;
        String url = args[0];
        if (!hasHttpScheme(url)) {
            url = "http://" + url;
        }
        try {
            if (args.length > 1 && "post".equals(args[1])) {
                // POST
                final List<Part> files = new ArrayList<Part>();
                files.add(new FilePart("myfile.txt", new ByteArrayPartSource("myfile.txt", "this is not a file ;)"
                        .getBytes())));
                files.add(new FilePart("anotherfile.raw", new ByteArrayPartSource("anotherfile.raw",
                        "this is not a binary file ;)".getBytes())));
                System.out.println("POST " + files.size() + " elements to " + url);
                final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(1000, null, null);
                resp = client.POST(url, files);
                System.out.println("----- Header: -----");
                System.out.println(resp.getResponseHeader().toString());
                System.out.println("----- Body: -----");
                System.out.println(new String(resp.getData()));
            } else if (args.length > 1 && "head".equals(args[1])) {
                // whead
                System.out.println("whead " + url);
                System.out.println("--------------------------------------");
                System.out.println(de.anomic.http.HttpClient.whead(url).toString());
            } else {
                // wget
                System.out.println("wget " + url);
                System.out.println("--------------------------------------");
                System.out.println(new String(de.anomic.http.HttpClient.wget(url, null, 10000)));
            }
        } catch (final IOException e) {
            e.printStackTrace();
        } finally {
            if (resp != null) {
                // release connection
                resp.closeStream();
            }
        }
    }

    /**
     * checks (ignoring case) if a String starts with "http://" or "https://"
     *
     * @param url must not be null
     * @return true if url starts with one of the two schemes
     */
    private static boolean hasHttpScheme(final String url) {
        return url.regionMatches(true, 0, "http://", 0, 7) || url.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * @return the user agent which is currently set on the shared client
     */
    public static String getCurrentUserAgent() {
        return (String) apacheHttpClient.getParams().getParameter(HttpMethodParams.USER_AGENT);
    }

    /**
     * sets the user agent of the shared client (so it is used by all instances)
     *
     * @param userAgent the user agent, the identification of the HTTP library is appended to it
     */
    public static void setUserAgent(final String userAgent) {
        apacheHttpClient.getParams().setParameter(HttpMethodParams.USER_AGENT, userAgent + jakartaUserAgent);
    }

    /**
     * remove unused connections
     *
     * Does nothing if the last cleanup was started less than {@link #cleanupIntervall} milliseconds ago.
     */
    public static void cleanup() {
        // do it only once a while
        final long now = System.currentTimeMillis();
        if (now - lastCleanup > cleanupIntervall) {
            lastCleanup = now;
            conManager.closeIdleConnections(closeConnectionsAfterMillis);
            conManager.deleteClosedConnections();
            HttpConnectionInfo.cleanUp();
        }
    }

    /**
     * number of active connections
     *
     * @return the number of connections in the pool of the connection manager
     */
    public static int connectionCount() {
        return conManager.getConnectionsInPool();
    }
}