// JakartaCommonsHttpClient.java // (C) 2008 by Daniel Raap; danielr@users.berlios.de // first published 2.4.2008 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $ // $LastChangedRevision: 4558 $ // $LastChangedBy: orbiter $ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package de.anomic.http; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map.Entry; import java.util.zip.GZIPOutputStream; import org.apache.commons.httpclient.ConnectMethod; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.methods.ByteArrayRequestEntity; import 
org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.HeadMethod; import org.apache.commons.httpclient.methods.InputStreamRequestEntity; import org.apache.commons.httpclient.methods.PostMethod; import org.apache.commons.httpclient.methods.RequestEntity; import org.apache.commons.httpclient.methods.multipart.ByteArrayPartSource; import org.apache.commons.httpclient.methods.multipart.FilePart; import org.apache.commons.httpclient.methods.multipart.MultipartRequestEntity; import org.apache.commons.httpclient.methods.multipart.Part; import org.apache.commons.httpclient.params.DefaultHttpParams; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.httpclient.protocol.Protocol; import org.apache.commons.httpclient.protocol.ProtocolSocketFactory; import de.anomic.crawler.Latency; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.util.Log; import de.anomic.yacy.yacyURL; /** * HttpClient implementation which uses Jakarta Commons HttpClient 3.x {@link http://hc.apache.org/httpclient-3.x/} * * @author danielr * */ public class httpClient { /** * "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency." * (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html) */ private static MultiThreadedHttpConnectionManager conManager = null; private static HttpClient apacheHttpClient = null; // last ; must be before location (this is parsed) private final static String jakartaUserAgent = " " + ((String) DefaultHttpParams.getDefaultParams().getParameter(HttpMethodParams.USER_AGENT)).replace(';', ':'); static { /** * set options for client */ initConnectionManager(); // accept self-signed or untrusted certificates Protocol.registerProtocol("https", new Protocol("https", (ProtocolSocketFactory) new AcceptEverythingSSLProtcolSocketFactory(), 443)); /** * set network timeout properties. 
see: http://java.sun.com/j2se/1.5.0/docs/guide/net/properties.html These * properties specify the default connect and read timeout (resp.) for the protocol handler used by * java.net.URLConnection. the java.net.URLConnection is also used by JakartaCommons HttpClient, see * http://hc.apache.org/httpclient-3.x/apidocs/org/apache/commons/httpclient/util/HttpURLConnection.html */ // specify the timeout, in milliseconds, to establish the connection to the host. // For HTTP connections, it is the timeout when establishing the connection to the HTTP server. System.setProperty("sun.net.client.defaultConnectTimeout", "10000"); // specify the response timeout, in milliseconds, when reading from an input stream // after a connection is established with a resource System.setProperty("sun.net.client.defaultReadTimeout", "60000"); } public static void initConnectionManager() { MultiThreadedHttpConnectionManager.shutdownAll(); conManager = new MultiThreadedHttpConnectionManager(); apacheHttpClient = new HttpClient(conManager); /** * set options for connection manager */ // conManager.getParams().setDefaultMaxConnectionsPerHost(4); // default 2 HostConfiguration localHostConfiguration = new HostConfiguration(); conManager.getParams().setMaxTotalConnections(200); // Proxy may need many connections conManager.getParams().setConnectionTimeout(60000); // set a default timeout conManager.getParams().setDefaultMaxConnectionsPerHost(10); localHostConfiguration.setHost("localhost"); conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); localHostConfiguration.setHost("127.0.0.1"); conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); // only one retry apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(1, false)); // simple user agent setUserAgent("yacy (www.yacy.net; " + getSystemOST() + ")"); } /** * every x milliseconds do a cleanup (close old connections) * * minimal intervall the 
cleanUp is done (in this time after a cleanup no second one is done) * * this is the time the method is callable, not the time it is called */ private static final int cleanupIntervall = 60000; /** * close connections when they are not used for this time * * or otherwise: hold connections this time open to reuse them */ private static final long closeConnectionsAfterMillis = 120000; /** * time the last cleanup was started */ private static long lastCleanup = 0; private Header[] headers = new Header[0]; private httpRemoteProxyConfig proxyConfig = null; private boolean useGlobalProxyConfig = true; private boolean followRedirects = true; private boolean ignoreCookies = false; /** * creates a new JakartaCommonsHttpClient with given timeout using global remoteProxyConfig * * @param timeout in milliseconds */ public httpClient(final int timeout) { this(timeout, null); } /** * creates a new JakartaCommonsHttpClient with given timeout and requestHeader using global remoteProxyConfig * * @param timeout in milliseconds * @param header header options to send */ public httpClient(final int timeout, final httpRequestHeader header) { super(); setTimeout(timeout); setHeader(header); } /** * creates a new JakartaCommonsHttpClient with given timeout and requestHeader using given remoteProxyConfig * * if proxyConfig is null, then no proxy is used * * @param timeout in milliseconds * @param header header options to send * @param proxyConfig */ public httpClient(final int timeout, final httpRequestHeader header, final httpRemoteProxyConfig proxyConfig) { super(); setTimeout(timeout); setHeader(header); setProxy(proxyConfig); } /* * (non-Javadoc) * @see de.anomic.http.HttpClient#setProxy(de.anomic.http.httpRemoteProxyConfig) */ public void setProxy(final httpRemoteProxyConfig proxyConfig) { this.useGlobalProxyConfig = false; this.proxyConfig = proxyConfig; } /* * (non-Javadoc) * @see de.anomic.http.HttpClient#setHeader(de.anomic.http.httpHeader) */ public void setHeader(final 
httpRequestHeader header) { headers = convertHeaders(header); } /* * (non-Javadoc) * @see de.anomic.http.HttpClient#setTimeout(int) */ @SuppressWarnings("deprecation") public void setTimeout(final int timeout) { apacheHttpClient.getParams().setIntParameter(HttpMethodParams.SO_TIMEOUT, timeout); apacheHttpClient.getParams().setIntParameter(HttpMethodParams.HEAD_BODY_CHECK_TIMEOUT, timeout); apacheHttpClient.setConnectionTimeout(timeout); } /** * should redirects automatically be followed? * * @param follow */ public void setFollowRedirects(final boolean follow) { followRedirects = follow; } /** *
by default Cookies are accepted and used automatically
* *HttpClient supports automatic management of cookies, including allowing the server to set cookies and * automatically return them to the server when required.* HttpClient Cookie Guide * * @param ignoreCookies */ public void setIgnoreCookies(final boolean ignoreCookies) { this.ignoreCookies = ignoreCookies; } /* * (non-Javadoc) * @see de.anomic.http.HttpClient#getUserAgent() */ public String getUserAgent() { return getCurrentUserAgent(); } /** * This method GETs a page from the server. * * @param uri The URI to the page which should be GET. * @return InputStream of content (body) * @throws IOException */ public httpResponse GET(final String uri) throws IOException { final HttpMethod get = new GetMethod(uri); get.setFollowRedirects(followRedirects); return execute(get); } /** * This method gets only the header of a page. * * @param uri The URI to the page whose header should be get. * @return Instance of response with the content. * @throws IOException */ public httpResponse HEAD(final String uri) throws IOException { assert uri != null : "precondition violated: uri != null"; final HttpMethod head = new HeadMethod(uri); head.setFollowRedirects(followRedirects); return execute(head); } /** * This method POSTs some data from an InputStream to a page. * * This is for compatibility (an InputStream does not need to contain correct HTTP!) * * @param uri The URI to the page which the post is sent to. * @param ins InputStream with the data to be posted to the server. * @return Instance of response with the content. 
* @throws IOException
     */
    public httpResponse POST(final String uri, final InputStream ins) throws IOException {
        assert uri != null : "precondition violated: uri != null";
        assert ins != null : "precondition violated: ins != null";

        final PostMethod request = new PostMethod(uri);
        final RequestEntity body = new InputStreamRequestEntity(ins);
        request.setRequestEntity(body);

        // a redirected POST fails with the exception
        // "Entity enclosing requests cannot be redirected without user intervention",
        // so redirects are never followed here
        request.setFollowRedirects(false);
        return execute(request);
    }

    /**
     * This method sends several data at once via a POST request (multipart-message), maybe compressed
     *
     * @param uri The URI to the page which the post is sent to.
     * @param multiparts {@link java.util.List} with the {@link Part}s of data
     * @param gzipBody should the body be compressed
     * @return Instance of response with the content.
     * @throws IOException
     */
    public httpResponse POST(final String uri, final List
stores the data of the request in a new ByteArrayRequestEntity
* *when the request is sent, make sure to set the content-encoding header to gzip!
* * @param data * @return a ByteArrayRequestEntitiy with gzipped data * @throws IOException */ private RequestEntity zipRequest(final RequestEntity data) throws IOException { // cache data and gzip it final ByteArrayOutputStream zippedBytes = new ByteArrayOutputStream(512); final GZIPOutputStream toZip = new GZIPOutputStream(zippedBytes); data.writeRequest(toZip); toZip.finish(); toZip.flush(); // use compressed data as body (not setting content length according to RFC 2616 HTTP/1.1, section 4.4) return new ByteArrayRequestEntity(zippedBytes.toByteArray(), data.getContentType()); } /* * (non-Javadoc) * @see de.anomic.http.HttpClient#CONNECT(java.lang.String, int, de.anomic.http.httpHeader) */ public httpResponse CONNECT(final String host, final int port) throws IOException { final HostConfiguration hostConfig = new HostConfiguration(); hostConfig.setHost(host, port); final HttpMethod connect = new ConnectMethod(hostConfig); connect.setFollowRedirects(false); // there are no redirects possible for CONNECT commands as far as I know. 
return execute(connect); } /** * adds the yacy-header to the method * * @param requestHeader * @param method */ public void addHeader(final httpRequestHeader requestHeader, final HttpMethod method) { assert method != null : "precondition violated: method != null"; if (requestHeader != null) { addHeaders(convertHeaders(requestHeader), method); } } /** * adds every Header in the array to the method * * @param requestHeaders * @param method must not be null */ private static void addHeaders(final Header[] requestHeaders, final HttpMethod method) { if (method == null) { throw new NullPointerException("method not set"); } if (requestHeaders != null) { for (final Header header : requestHeaders) { method.addRequestHeader(header); } } } /** * convert from yacy-header to apache.commons.httpclient.Header * * @param requestHeader * @return */ private static Header[] convertHeaders(final httpRequestHeader requestHeader) { final Header[] headers; if (requestHeader == null) { headers = new Header[0]; } else { headers = new Header[requestHeader.size()]; int i = 0; for (final Entry