diff --git a/defaults/yacy.init b/defaults/yacy.init index c29827ef1..f1ff306a2 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -67,6 +67,13 @@ pkcs12ImportPwd = # value is in milliseconds, default is one hour server.maxTrackingTime = 3600000 +# maximum number of tracks per host +server.maxTrackingCount = 1000 + +# maximum number of hosts that are tracked +server.maxTrackingHostCount = 100 + + # Network Definition # There can be separate YaCy networks, and managed sub-groups of the general network. # The essentials of the network definition are attached in separate property files. diff --git a/htroot/ConfigAppearance_p.java b/htroot/ConfigAppearance_p.java index 56be66f25..797ca1a42 100644 --- a/htroot/ConfigAppearance_p.java +++ b/htroot/ConfigAppearance_p.java @@ -36,7 +36,7 @@ import java.util.List; import de.anomic.crawler.HTTPLoader; import de.anomic.data.listManager; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaSwitchboard; @@ -92,7 +92,7 @@ public class ConfigAppearance_p { final yacyURL u = new yacyURL(url, null); final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); - skinVector = FileUtils.strings(HttpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); + skinVector = FileUtils.strings(httpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); } catch (final IOException e) { prop.put("status", "1");// unable to get URL prop.put("status_url", url); diff --git a/htroot/ConfigLanguage_p.java b/htroot/ConfigLanguage_p.java index 1ecb89235..734035014 100644 --- a/htroot/ConfigLanguage_p.java +++ b/htroot/ConfigLanguage_p.java @@ -39,7 +39,7 @@ import java.util.List; import de.anomic.crawler.HTTPLoader; import de.anomic.data.listManager; import de.anomic.data.translator; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.FileUtils; import de.anomic.server.serverObjects; @@ -81,7 +81,7 @@ public class ConfigLanguage_p { final yacyURL u = new yacyURL(url, null); final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); - langVector = FileUtils.strings(HttpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); + langVector = FileUtils.strings(httpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); }catch(final IOException e){ prop.put("status", "1");//unable to get url prop.put("status_url", url); diff --git a/htroot/Connections_p.java b/htroot/Connections_p.java index 6d650c345..d31cb329c 100644 --- a/htroot/Connections_p.java +++ b/htroot/Connections_p.java @@ -34,7 +34,7 @@ import java.util.Properties; import java.util.Set; import de.anomic.http.HttpConnectionInfo; -import de.anomic.http.JakartaCommonsHttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpd; import de.anomic.kelondro.order.DateFormatter; @@ -226,7 +226,7 @@ public final class Connections_p { } } prop.put("clientList", c); - prop.put("clientActive", JakartaCommonsHttpClient.connectionCount()); + prop.put("clientActive", httpClient.connectionCount()); // return rewrite values for templates return prop; diff --git a/htroot/Network.java b/htroot/Network.java index c9712d41f..80b2838c6 100644 --- a/htroot/Network.java +++ b/htroot/Network.java @@ -36,7 +36,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.order.DateFormatter; import de.anomic.plasma.plasmaSwitchboard; @@ -145,7 +145,7 @@ public class Network { prop.put("table_my-url", seed.get(yacySeed.SEEDLIST, "")); // generating the location string - prop.putHTML("table_my-location", HttpClient.generateLocation()); + prop.putHTML("table_my-location", httpClient.generateLocation()); } // overall results: Network statistics @@ -348,7 +348,7 @@ public class Network { userAgent = null; if (seed.hash != null && seed.hash.equals(sb.webIndex.seedDB.mySeed().hash)) { userAgent = HTTPLoader.yacyUserAgent; - location = HttpClient.generateLocation(); + location = httpClient.generateLocation(); } else { userAgent = sb.webIndex.seedDB.peerActions.getUserAgent(seed.getIP()); location = parseLocationInUserAgent(userAgent); diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 344f41b9d..493eaf8a4 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -36,7 +36,7 @@ import java.util.Map; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterCharacterCoding; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; import de.anomic.index.indexDocumentMetadata; @@ -199,7 +199,7 @@ public class ViewFile { return prop; } - responseHeader = HttpClient.whead(url.toString()); + responseHeader = httpClient.whead(url.toString()); if (responseHeader == null) { prop.put("error", "4"); prop.put("error_errorText", "Unable to load resource metadata."); diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index e22c39344..0e3a98ca5 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -7,7 +7,7 @@ import java.util.Set; import de.anomic.crawler.HTTPLoader; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaSwitchboard; @@ -48,7 +48,7 @@ public class getpageinfo_p { final yacyURL u = new yacyURL(url, null); final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); // do not set the crawler user agent, because this page was loaded by manual entering of the url - final byte[] r = HttpClient.wget(u.toString(), reqHeader, 5000); + final byte[] r = httpClient.wget(u.toString(), reqHeader, 5000); if (r == null) return prop; final String contentString=new String(r); diff --git a/htroot/sharedBlacklist_p.java b/htroot/sharedBlacklist_p.java index 0b1f33171..7c8cf63b2 100644 --- a/htroot/sharedBlacklist_p.java +++ b/htroot/sharedBlacklist_p.java @@ -40,7 +40,7 @@ import java.util.List; import de.anomic.crawler.HTTPLoader; import de.anomic.data.listManager; import de.anomic.htmlFilter.htmlFilterCharacterCoding; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.index.indexAbstractReferenceBlacklist; import de.anomic.kelondro.util.FileUtils; @@ -139,7 +139,7 @@ public class sharedBlacklist_p { // get List yacyURL u = new yacyURL(downloadURLOld, null); - otherBlacklist = FileUtils.strings(HttpClient.wget(u.toString(), reqHeader, 1000), "UTF-8"); + otherBlacklist = FileUtils.strings(httpClient.wget(u.toString(), reqHeader, 1000), "UTF-8"); } catch (final Exception e) { prop.put("status", STATUS_PEER_UNKNOWN); prop.putHTML("status_name", Hash); @@ -158,7 +158,7 @@ public class sharedBlacklist_p { final yacyURL u = new yacyURL(downloadURL, null); final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); - otherBlacklist = FileUtils.strings(HttpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); //get List + otherBlacklist = FileUtils.strings(httpClient.wget(u.toString(), reqHeader, 10000), "UTF-8"); //get List } catch (final Exception e) { prop.put("status", STATUS_URL_PROBLEM); prop.putHTML("status_address",downloadURL); diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index a910b9188..375003076 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -28,9 +28,8 @@ package de.anomic.crawler; import java.io.IOException; import java.util.Date; -import de.anomic.http.HttpClient; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpdProxyCacheEntry; @@ -49,8 +48,8 @@ public final class HTTPLoader { private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7"; private static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; - public static final String crawlerUserAgent = "yacybot (" + HttpClient.getSystemOST() +") http://yacy.net/bot.html"; - public static final String yacyUserAgent = "yacy (" + HttpClient.getSystemOST() +") yacy.net"; + public static final String crawlerUserAgent = "yacybot (" + httpClient.getSystemOST() +") http://yacy.net/bot.html"; + public static final String yacyUserAgent = "yacy (" + httpClient.getSystemOST() +") yacy.net"; /** * The socket timeout that should be used @@ -139,9 +138,9 @@ public final class HTTPLoader { requestHeader.put(httpRequestHeader.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); // HTTP-Client - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(socketTimeout, requestHeader); + final httpClient client = new httpClient(socketTimeout, requestHeader); - JakartaCommonsHttpResponse res = null; + httpResponse res = null; //try { // send request res = client.GET(entry.url().toString()); diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index b8fac7771..772c88ba4 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -40,8 +40,8 @@ import java.util.LinkedList; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.blob.BLOB; import de.anomic.kelondro.blob.BLOBHeap; @@ -528,8 +528,8 @@ public class RobotsTxt { // setup http-client //TODO: adding Traffic statistic for robots download? - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(10000, reqHeaders); - JakartaCommonsHttpResponse res = null; + final httpClient client = new httpClient(10000, reqHeaders); + httpResponse res = null; try { // sending the get request res = client.GET(robotsURL.toString()); diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 1224cca0e..91ad060f6 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -41,8 +41,8 @@ import org.xml.sax.helpers.DefaultHandler; import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpdByteCountInputStream; import de.anomic.index.indexURLReference; @@ -153,8 +153,8 @@ public class SitemapParser extends DefaultHandler { // download document final httpRequestHeader requestHeader = new httpRequestHeader(); requestHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(5000, requestHeader); - JakartaCommonsHttpResponse res = null; + final httpClient client = new httpClient(5000, requestHeader); + httpResponse res = null; try { res = client.GET(siteMapURL.toString()); if (res.getStatusCode() != 200) { diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 751c3ef3f..fab9809d1 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -44,7 +44,7 @@ import java.util.Properties; import javax.swing.event.EventListenerList; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaParser; @@ -503,7 +503,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen // load page final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); - final byte[] page = HttpClient.wget(location.toString(), reqHeader, 10000); + final byte[] page = httpClient.wget(location.toString(), reqHeader, 10000); if (page == null) throw new IOException("no response from url " + location.toString()); // scrape content diff --git a/source/de/anomic/http/HttpClient.java b/source/de/anomic/http/HttpClient.java deleted file mode 100644 index d274957b8..000000000 --- a/source/de/anomic/http/HttpClient.java +++ /dev/null @@ -1,148 +0,0 @@ -// HttpClient.java -// (C) 2008 by Daniel Raap; danielr@users.berlios.de -// first published 2.4.2008 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2008-03-14 01:16:04 +0100 (Fr, 14 Mrz 2008) $ -// $LastChangedRevision: 4558 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.http; - -import java.io.IOException; - -import de.anomic.kelondro.util.Log; - -/** - * Client who does http requests - * - * some methods must be implemented (the "socket-layer") - */ -public abstract class HttpClient { - - /** - * provide system information for client identification - */ - private static final String systemOST = System.getProperty("os.arch", "no-os-arch") + " " + - System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") + - "; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation(); - - /** - * generating the location string - * - * @return - */ - public static String generateLocation() { - String loc = System.getProperty("user.timezone", "nowhere"); - final int p = loc.indexOf("/"); - if (p > 0) { - loc = loc.substring(0, p); - } - loc = loc + "/" + System.getProperty("user.language", "dumb"); - return loc; - } - - /** - * @return the systemOST - */ - public static String getSystemOST() { - return systemOST; - } - - /** - * Gets a page (as raw bytes) addressing vhost at host in uri with specified header and timeout - * - * @param uri - * @param header - * @param vhost - * @param timeout in milliseconds - * @return - */ - public static byte[] wget(final String uri) { - return wget(uri, new httpRequestHeader(), 10000, null); - } - public static byte[] wget(final String uri, final httpRequestHeader header, final int timeout) { - return wget(uri, header, timeout, null); - } - public static byte[] wget(final String uri, final httpRequestHeader header, final int timeout, final String vhost) { - assert uri != null : "precondition violated: uri != null"; - addHostHeader(header, vhost); - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(timeout, header); - - // do the request - try { - final JakartaCommonsHttpResponse response = client.GET(uri); - return response.getData(); - } catch (final IOException e) { - Log.logWarning("HTTPC", "wget(" + uri + ") failed: " + e.getMessage()); - } - return null; - } - - /** - * adds a Host-header to the header if vhost is not null - * - * @param header - * @param vhost - * @return - */ - private static void addHostHeader(httpRequestHeader header, final String vhost) { - if (vhost != null) { - if (header != null) { - header = new httpRequestHeader(); - } - // set host-header - header.add(httpRequestHeader.HOST, vhost); - } - } - - /** - * Gets a page-header - * - * @param uri - * @return - */ - public static httpResponseHeader whead(final String uri) { - return whead(uri, null); - } - - /** - * Gets a page-header - * - * @param uri - * @param header request header - * @return null on error - */ - public static httpResponseHeader whead(final String uri, final httpRequestHeader header) { - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(10000, header); - JakartaCommonsHttpResponse response = null; - try { - response = client.HEAD(uri); - return response.getResponseHeader(); - } catch (final IOException e) { - Log.logWarning("HTTPC", "whead(" + uri + ") failed: " + e.getMessage()); - return null; - } finally { - if (response != null) { - response.closeStream(); - } - } - } -} diff --git a/source/de/anomic/http/JakartaCommonsHttpClient.java b/source/de/anomic/http/httpClient.java similarity index 84% rename from source/de/anomic/http/JakartaCommonsHttpClient.java rename to source/de/anomic/http/httpClient.java index 7d7651b98..bc3c351f2 100644 --- a/source/de/anomic/http/JakartaCommonsHttpClient.java +++ b/source/de/anomic/http/httpClient.java @@ -23,6 +23,7 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + package de.anomic.http; import java.io.ByteArrayOutputStream; @@ -67,7 +68,7 @@ import de.anomic.kelondro.util.Log; * @author danielr * */ -public class JakartaCommonsHttpClient { +public class httpClient { /** * "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency." @@ -85,7 +86,7 @@ public class JakartaCommonsHttpClient { * set options for client */ // simple user agent - setUserAgent("yacy (www.yacy.net; " + de.anomic.http.HttpClient.getSystemOST() + ")"); + setUserAgent("yacy (www.yacy.net; " + getSystemOST() + ")"); // only one retry apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler(1, false)); @@ -152,7 +153,7 @@ public class JakartaCommonsHttpClient { * * @param timeout in milliseconds */ - public JakartaCommonsHttpClient(final int timeout) { + public httpClient(final int timeout) { this(timeout, null); } @@ -162,7 +163,7 @@ public class JakartaCommonsHttpClient { * @param timeout in milliseconds * @param header header options to send */ - public JakartaCommonsHttpClient(final int timeout, final httpRequestHeader header) { + public httpClient(final int timeout, final httpRequestHeader header) { super(); setTimeout(timeout); setHeader(header); @@ -177,7 +178,7 @@ public class JakartaCommonsHttpClient { * @param header header options to send * @param proxyConfig */ - public JakartaCommonsHttpClient(final int timeout, final httpRequestHeader header, final httpRemoteProxyConfig proxyConfig) { + public httpClient(final int timeout, final httpRequestHeader header, final httpRemoteProxyConfig proxyConfig) { super(); setTimeout(timeout); setHeader(header); @@ -246,7 +247,7 @@ public class JakartaCommonsHttpClient { * @return InputStream of content (body) * @throws IOException */ - public JakartaCommonsHttpResponse GET(final String uri) throws IOException { + public httpResponse GET(final String uri) throws IOException { final HttpMethod get = new GetMethod(uri); get.setFollowRedirects(followRedirects); return execute(get); @@ -259,7 +260,7 @@ public class JakartaCommonsHttpClient { * @return Instance of response with the content. * @throws IOException */ - public JakartaCommonsHttpResponse HEAD(final String uri) throws IOException { + public httpResponse HEAD(final String uri) throws IOException { assert uri != null : "precondition violated: uri != null"; final HttpMethod head = new HeadMethod(uri); head.setFollowRedirects(followRedirects); @@ -276,7 +277,7 @@ public class JakartaCommonsHttpClient { * @return Instance of response with the content. * @throws IOException */ - public JakartaCommonsHttpResponse POST(final String uri, final InputStream ins) throws IOException { + public httpResponse POST(final String uri, final InputStream ins) throws IOException { assert uri != null : "precondition violated: uri != null"; assert ins != null : "precondition violated: ins != null"; final PostMethod post = new PostMethod(uri); @@ -295,7 +296,7 @@ public class JakartaCommonsHttpClient { * @return * @throws IOException */ - public JakartaCommonsHttpResponse POST(final String uri, final List multiparts) throws IOException { + public httpResponse POST(final String uri, final List multiparts) throws IOException { return POST(uri, multiparts, false); } @@ -308,7 +309,7 @@ public class JakartaCommonsHttpClient { * @return Instance of response with the content. * @throws IOException */ - public JakartaCommonsHttpResponse POST(final String uri, final List multiparts, final boolean gzipBody) + public httpResponse POST(final String uri, final List multiparts, final boolean gzipBody) throws IOException { assert uri != null : "precondition violated: uri != null"; final PostMethod post = new PostMethod(uri); @@ -358,7 +359,7 @@ public class JakartaCommonsHttpClient { * (non-Javadoc) * @see de.anomic.http.HttpClient#CONNECT(java.lang.String, int, de.anomic.http.httpHeader) */ - public JakartaCommonsHttpResponse CONNECT(final String host, final int port) throws IOException { + public httpResponse CONNECT(final String host, final int port) throws IOException { final HostConfiguration hostConfig = new HostConfiguration(); hostConfig.setHost(host, port); final HttpMethod connect = new ConnectMethod(hostConfig); @@ -424,7 +425,7 @@ public class JakartaCommonsHttpClient { * @return * @throws IOException */ - private JakartaCommonsHttpResponse execute(final HttpMethod method) throws IOException { + private httpResponse execute(final HttpMethod method) throws IOException { assert method != null : "precondition violated: method != null"; checkIgnoreCookies(method); setHeader(method); @@ -458,7 +459,7 @@ public class JakartaCommonsHttpClient { Arrays.toString(method.getResponseHeaders())); // return response - return new JakartaCommonsHttpResponse(method); + return new httpResponse(method); } /** @@ -611,7 +612,7 @@ public class JakartaCommonsHttpClient { * @param args */ public static void main(final String[] args) { - JakartaCommonsHttpResponse resp = null; + httpResponse resp = null; String url = args[0]; if (!url.toUpperCase().startsWith("HTTP://")) { url = "http://" + url; @@ -625,7 +626,7 @@ public class JakartaCommonsHttpClient { files.add(new FilePart("anotherfile.raw", new ByteArrayPartSource("anotherfile.raw", "this is not a binary file ;)".getBytes()))); System.out.println("POST " + files.size() + " elements to " + url); - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(1000); + final httpClient client = new httpClient(1000); resp = client.POST(url, files); System.out.println("----- Header: -----"); System.out.println(resp.getResponseHeader().toString()); @@ -635,12 +636,12 @@ public class JakartaCommonsHttpClient { // whead System.out.println("whead " + url); System.out.println("--------------------------------------"); - System.out.println(de.anomic.http.HttpClient.whead(url).toString()); + System.out.println(whead(url).toString()); } else { // wget System.out.println("wget " + url); System.out.println("--------------------------------------"); - System.out.println(new String(de.anomic.http.HttpClient.wget(url, null, 10000))); + System.out.println(new String(wget(url, null, 10000))); } } catch (final IOException e) { e.printStackTrace(); @@ -688,4 +689,115 @@ public class JakartaCommonsHttpClient { public static int connectionCount() { return conManager.getConnectionsInPool(); } + + + + /** + * provide system information for client identification + */ + private static final String systemOST = System.getProperty("os.arch", "no-os-arch") + " " + + System.getProperty("os.name", "no-os-name") + " " + System.getProperty("os.version", "no-os-version") + + "; " + "java " + System.getProperty("java.version", "no-java-version") + "; " + generateLocation(); + + /** + * generating the location string + * + * @return + */ + public static String generateLocation() { + String loc = System.getProperty("user.timezone", "nowhere"); + final int p = loc.indexOf("/"); + if (p > 0) { + loc = loc.substring(0, p); + } + loc = loc + "/" + System.getProperty("user.language", "dumb"); + return loc; + } + + /** + * @return the systemOST + */ + public static String getSystemOST() { + return systemOST; + } + + /** + * Gets a page (as raw bytes) addressing vhost at host in uri with specified header and timeout + * + * @param uri + * @param header + * @param vhost + * @param timeout in milliseconds + * @return + */ + public static byte[] wget(final String uri) { + return wget(uri, new httpRequestHeader(), 10000, null); + } + public static byte[] wget(final String uri, final httpRequestHeader header, final int timeout) { + return wget(uri, header, timeout, null); + } + public static byte[] wget(final String uri, final httpRequestHeader header, final int timeout, final String vhost) { + assert uri != null : "precondition violated: uri != null"; + addHostHeader(header, vhost); + final httpClient client = new httpClient(timeout, header); + + // do the request + try { + final httpResponse response = client.GET(uri); + return response.getData(); + } catch (final IOException e) { + Log.logWarning("HTTPC", "wget(" + uri + ") failed: " + e.getMessage()); + } + return null; + } + + /** + * adds a Host-header to the header if vhost is not null + * + * @param header + * @param vhost + * @return + */ + private static void addHostHeader(httpRequestHeader header, final String vhost) { + if (vhost != null) { + if (header != null) { + header = new httpRequestHeader(); + } + // set host-header + header.add(httpRequestHeader.HOST, vhost); + } + } + + /** + * Gets a page-header + * + * @param uri + * @return + */ + public static httpResponseHeader whead(final String uri) { + return whead(uri, null); + } + + /** + * Gets a page-header + * + * @param uri + * @param header request header + * @return null on error + */ + public static httpResponseHeader whead(final String uri, final httpRequestHeader header) { + final httpClient client = new httpClient(10000, header); + httpResponse response = null; + try { + response = client.HEAD(uri); + return response.getResponseHeader(); + } catch (final IOException e) { + Log.logWarning("HTTPC", "whead(" + uri + ") failed: " + e.getMessage()); + return null; + } finally { + if (response != null) { + response.closeStream(); + } + } + } } \ No newline at end of file diff --git a/source/de/anomic/http/JakartaCommonsHttpResponse.java b/source/de/anomic/http/httpResponse.java similarity index 97% rename from source/de/anomic/http/JakartaCommonsHttpResponse.java rename to source/de/anomic/http/httpResponse.java index e2c607e8b..2cc594fde 100644 --- a/source/de/anomic/http/JakartaCommonsHttpResponse.java +++ b/source/de/anomic/http/httpResponse.java @@ -41,7 +41,7 @@ import de.anomic.kelondro.util.FileUtils; * @author daniel * @since 21.03.2008 */ -public class JakartaCommonsHttpResponse { +public class httpResponse { private final HttpMethod method; private String incomingAccountingName = null; @@ -56,7 +56,7 @@ public class JakartaCommonsHttpResponse { * @param method * @throws IOException */ - public JakartaCommonsHttpResponse(final HttpMethod method) { + public httpResponse(final HttpMethod method) { super(); this.method = method; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 7ff317514..6c67c2c2b 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -438,7 +438,7 @@ public final class httpdProxyHandler { final GZIPOutputStream gzippedOut = null; - JakartaCommonsHttpResponse res = null; + httpResponse res = null; try { final int reqID = requestHeader.hashCode(); @@ -473,7 +473,7 @@ public final class httpdProxyHandler { final String connectHost = hostPart(host, port, yAddress); final String getUrl = "http://"+ connectHost + remotePath; - final JakartaCommonsHttpClient client = setupHttpClient(requestHeader, connectHost); + final httpClient client = setupHttpClient(requestHeader, connectHost); // send request try { @@ -724,7 +724,7 @@ public final class httpdProxyHandler { public static void doHead(final Properties conProp, final httpRequestHeader requestHeader, OutputStream respond) { - JakartaCommonsHttpResponse res = null; + httpResponse res = null; yacyURL url = null; try { final int reqID = requestHeader.hashCode(); @@ -793,7 +793,7 @@ public final class httpdProxyHandler { final String getUrl = "http://"+ connectHost + remotePath; if (theLogger.isFinest()) theLogger.logFinest(reqID +" using url: "+ getUrl); - final JakartaCommonsHttpClient client = setupHttpClient(requestHeader, connectHost); + final httpClient client = setupHttpClient(requestHeader, connectHost); // send request try { @@ -884,7 +884,7 @@ public final class httpdProxyHandler { final String getUrl = "http://"+ connectHost + remotePath; if (theLogger.isFinest()) theLogger.logFinest(reqID +" using url: "+ getUrl); - final JakartaCommonsHttpClient client = setupHttpClient(requestHeader, connectHost); + final httpClient client = setupHttpClient(requestHeader, connectHost); // check input if(body == null) { @@ -910,7 +910,7 @@ public final class httpdProxyHandler { } body = new ByteArrayInputStream(bodyData); } - JakartaCommonsHttpResponse res = null; + httpResponse res = null; try { // sending the request res = client.POST(getUrl, body); @@ -1050,9 +1050,9 @@ public final class httpdProxyHandler { * @param connectHost may be 'host:port' or 'host:port/path' * @return */ - private static JakartaCommonsHttpClient setupHttpClient(final httpRequestHeader requestHeader, final String connectHost) { + private static httpClient setupHttpClient(final httpRequestHeader requestHeader, final String connectHost) { // setup HTTP-client - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(timeout, requestHeader); + final httpClient client = new httpClient(timeout, requestHeader); client.setFollowRedirects(false); // cookies are handled by the user's browser client.setIgnoreCookies(true); @@ -1232,10 +1232,10 @@ public final class httpdProxyHandler { (proxyConfig.useProxy()) && (proxyConfig.useProxy4SSL()) ) { - final JakartaCommonsHttpClient remoteProxy = new JakartaCommonsHttpClient(timeout, requestHeader, proxyConfig); + final httpClient remoteProxy = new httpClient(timeout, requestHeader, proxyConfig); remoteProxy.setFollowRedirects(false); // should not be needed, but safe is safe - JakartaCommonsHttpResponse response = null; + httpResponse response = null; try { response = remoteProxy.CONNECT(host, port); // outputs a logline to the serverlog with the current status diff --git a/source/de/anomic/index/indexRepositoryReference.java b/source/de/anomic/index/indexRepositoryReference.java index 7a50c5c1e..2e72bd7d5 100644 --- a/source/de/anomic/index/indexRepositoryReference.java +++ b/source/de/anomic/index/indexRepositoryReference.java @@ -40,8 +40,8 @@ import java.util.Map; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterCharacterCoding; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.kelondro.blob.Cache; import de.anomic.kelondro.index.Row; @@ -243,9 +243,9 @@ public final class indexRepositoryReference { final yacyURL newUrl = new yacyURL(newUrlStr, null); // doing a http head request to test if the url is correct - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(10000); + final httpClient client = new httpClient(10000); client.setProxy(proxyConfig); - JakartaCommonsHttpResponse res = null; + httpResponse res = null; try { res = client.HEAD(newUrl.toString()); } finally { diff --git a/source/de/anomic/net/natLib.java b/source/de/anomic/net/natLib.java index cb688bd43..60ff1b19b 100644 --- a/source/de/anomic/net/natLib.java +++ b/source/de/anomic/net/natLib.java @@ -29,7 +29,7 @@ import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverDomains; @@ -46,7 +46,7 @@ public class natLib { rm status.htm */ try { - ArrayList x = FileUtils.strings(HttpClient.wget("http://admin:"+password+"@192.168.0.1:80/status.htm", null, 10000), "UTF-8"); + ArrayList x = FileUtils.strings(httpClient.wget("http://admin:"+password+"@192.168.0.1:80/status.htm", null, 10000), "UTF-8"); x = nxTools.grep(x, 1, "IP Address"); if ((x == null) || (x.size() == 0)) return null; final String line = nxTools.tail1(x); @@ -59,7 +59,7 @@ public class natLib { private static String getWhatIsMyIP() { try { ArrayList x = FileUtils.strings( - HttpClient.wget("http://www.whatismyip.com/", null, 10000), "UTF-8"); + httpClient.wget("http://www.whatismyip.com/", null, 10000), "UTF-8"); x = nxTools.grep(x, 0, "Your IP is"); final String line = nxTools.tail1(x); return nxTools.awk(line, " ", 4); @@ -71,7 +71,7 @@ public class natLib { private static String getStanford() { try { ArrayList x = FileUtils.strings( - HttpClient.wget("http://www.slac.stanford.edu/cgi-bin/nph-traceroute.pl", null, 10000), + httpClient.wget("http://www.slac.stanford.edu/cgi-bin/nph-traceroute.pl", null, 10000), "UTF-8"); x = nxTools.grep(x, 0, "firewall protecting your browser"); final String line = nxTools.tail1(x); @@ -83,7 +83,7 @@ public class natLib { private static String getIPID() { try { - ArrayList x = FileUtils.strings(HttpClient.wget("http://ipid.shat.net/", null, 10000), "UTF-8"); + ArrayList x = FileUtils.strings(httpClient.wget("http://ipid.shat.net/", null, 10000), "UTF-8"); x = nxTools.grep(x, 2, "Your IP address"); final String line = nxTools.tail1(x); return nxTools.awk(nxTools.awk(nxTools.awk(line, " ", 5), ">", 2), "<", 1); diff --git a/source/de/anomic/plasma/parser/odt/odtParser.java b/source/de/anomic/plasma/parser/odt/odtParser.java index 53921cbaf..cf8df5006 100644 --- a/source/de/anomic/plasma/parser/odt/odtParser.java +++ b/source/de/anomic/plasma/parser/odt/odtParser.java @@ -43,7 +43,7 @@ import com.catcode.odf.OpenDocumentMetadata; import com.catcode.odf.OpenDocumentTextInputStream; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.FileUtils; @@ -246,7 +246,7 @@ public class odtParser extends AbstractParser implements Parser { // downloading the document content final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); - final byte[] content = HttpClient.wget(contentUrl.toString(), reqHeader, 10000); + final byte[] content = httpClient.wget(contentUrl.toString(), reqHeader, 10000); final ByteArrayInputStream input = new ByteArrayInputStream(content); // parsing the document diff --git a/source/de/anomic/plasma/parser/rpm/rpmParser.java b/source/de/anomic/plasma/parser/rpm/rpmParser.java index 51b265bdc..542a76cf8 100644 --- a/source/de/anomic/plasma/parser/rpm/rpmParser.java +++ b/source/de/anomic/plasma/parser/rpm/rpmParser.java @@ -34,7 +34,7 @@ import com.jguild.jrpm.io.RPMFile; import com.jguild.jrpm.io.datatype.DataTypeIf; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.plasmaParserDocument; @@ -166,7 +166,7 @@ public class rpmParser extends AbstractParser implements Parser { final rpmParser testParser = new rpmParser(); final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); - final byte[] content = HttpClient.wget(contentUrl.toString(), reqHeader, 10000); + final byte[] content = httpClient.wget(contentUrl.toString(), reqHeader, 10000); final ByteArrayInputStream input = new ByteArrayInputStream(content); testParser.parse(contentUrl, "application/x-rpm", null, input); } catch (final Exception e) { diff --git a/source/de/anomic/plasma/parser/vcf/vcfParser.java b/source/de/anomic/plasma/parser/vcf/vcfParser.java index 8b0da3106..55bb43739 100644 --- a/source/de/anomic/plasma/parser/vcf/vcfParser.java +++ b/source/de/anomic/plasma/parser/vcf/vcfParser.java @@ -35,7 +35,7 @@ import java.util.Iterator; import java.util.LinkedList; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.order.Base64Order; import de.anomic.plasma.plasmaParserDocument; @@ -277,7 +277,7 @@ public class vcfParser extends AbstractParser implements Parser { final vcfParser testParser = new vcfParser(); final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); - final byte[] content = HttpClient.wget(contentUrl.toString(), reqHeader, 10000); + final byte[] content = httpClient.wget(contentUrl.toString(), reqHeader, 10000); final ByteArrayInputStream input = new ByteArrayInputStream(content); testParser.parse(contentUrl, "text/x-vcard", "UTF-8",input); } catch (final Exception e) { diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 1220bc536..d73318942 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -56,8 +56,8 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.htmlFilter.htmlFilterInputStream; import de.anomic.htmlFilter.htmlFilterWriter; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.parser.Parser; @@ -880,7 +880,7 @@ public final class plasmaParser { } final String mode = args[0]; - JakartaCommonsHttpResponse res = null; + httpResponse res = null; plasmaParserDocument document = null; try { // close InputStream when done if (mode.equalsIgnoreCase("-f")) { @@ -890,7 +890,7 @@ public final class plasmaParser { contentURL = new yacyURL(args[1], null); // downloading the document content - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(5000); + final httpClient client = new httpClient(5000); res = client.GET(args[1]); if (res.getStatusCode() != 200) { diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index 38cd9ddfa..b5b40afb1 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -41,7 +41,7 @@ import java.util.regex.Pattern; import de.anomic.htmlFilter.htmlFilterCharacterCoding; import de.anomic.htmlFilter.htmlFilterImageEntry; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpResponseHeader; import de.anomic.index.indexDocumentMetadata; import de.anomic.index.indexURLReference; @@ -852,7 +852,7 @@ public class plasmaSnippetCache { // getting URL mimeType try { - responseHeader = HttpClient.whead(url.toString()); + responseHeader = httpClient.whead(url.toString()); } catch (final Exception e) { // ingore this. http header download failed } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d6bd92518..ec3f41757 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -131,8 +131,7 @@ import de.anomic.data.messageBoard; import de.anomic.data.userDB; import de.anomic.data.wikiBoard; import de.anomic.data.wiki.wikiParser; -import de.anomic.http.HttpClient; -import de.anomic.http.JakartaCommonsHttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpResponseHeader; @@ -1089,7 +1088,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch getConfigLong("bootstrapLoadTimeout", 6000)) { @@ -2060,7 +2056,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch result = FileUtils.table(HttpClient.wget(url.toString(), reqHeader, 10000), "UTF-8"); + final HashMap result = FileUtils.table(httpClient.wget(url.toString(), reqHeader, 10000), "UTF-8"); if (result == null) return new HashMap(); return result; } catch (final Exception e) { diff --git a/source/de/anomic/server/serverAbstractSwitch.java b/source/de/anomic/server/serverAbstractSwitch.java index 7008c05de..3eaafe69a 100644 --- a/source/de/anomic/server/serverAbstractSwitch.java +++ b/source/de/anomic/server/serverAbstractSwitch.java @@ -29,7 +29,6 @@ import java.util.Iterator; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; -import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.LinkedBlockingQueue; import de.anomic.kelondro.util.Log; @@ -37,8 +36,6 @@ import de.anomic.kelondro.util.FileUtils; public abstract class serverAbstractSwitch implements serverSwitch { - private static final long maxTrackingTimeDefault = 1000 * 60 * 60; // store only access data from the last hour to save ram space - // configuration management private final File configFile; private final String configComment; @@ -46,14 +43,13 @@ public abstract class serverAbstractSwitch implements serverSwitch { protected boolean firstInit; protected Log log; protected int serverJobs; - private long maxTrackingTime; private Map configProps; private final Map configRemoved; private final HashMap authorization; private final TreeMap workerThreads; private final TreeMap switchActions; private final LinkedBlockingQueue cacheStack; - private final ConcurrentHashMap> accessTracker; // mappings from requesting host to an ArrayList of serverTrack-entries + private final serverAccessTracker accessTracker; public serverAbstractSwitch(final File rootPath, final String initPath, final String configPath, final boolean applyPro) { // we initialize the switchboard with a property file, @@ -133,7 +129,6 @@ public abstract class serverAbstractSwitch implements serverSwitch { // other settings authorization = new HashMap(); - accessTracker = new ConcurrentHashMap>(); // init thread control workerThreads = new TreeMap(); @@ -145,7 +140,11 @@ public abstract class serverAbstractSwitch implements serverSwitch { serverJobs = 0; // init server tracking - maxTrackingTime = getConfigLong("maxTrackingTime", maxTrackingTimeDefault); + this.accessTracker = new serverAccessTracker( + getConfigLong("server.maxTrackingTime", 60 * 60 * 1000), + (int) getConfigLong("server.maxTrackingCount", 1000), + (int) getConfigLong("server.maxTrackingHostCount", 100) + ); } // a logger for this switchboard @@ -156,59 +155,6 @@ public abstract class serverAbstractSwitch implements serverSwitch { public Log getLog() { return log; } - - /* - * remove all entries from the access tracker where the age of the last access is greater than the given timeout - */ - public void cleanupAccessTracker(final long timeout) { - final Iterator>> i = accessTracker.entrySet().iterator(); - while (i.hasNext()) { - if (i.next().getValue().tailMap(Long.valueOf(System.currentTimeMillis() - timeout)).size() == 0) i.remove(); - } - } - - public void track(final String host, String accessPath) { - // learn that a specific host has accessed a specific path - if (accessPath == null) accessPath="NULL"; - SortedMap access = accessTracker.get(host); - if (access == null) access = new TreeMap(); - - synchronized (access) { - access.put(Long.valueOf(System.currentTimeMillis()), accessPath); - // write back to tracker - accessTracker.put(host, clearTooOldAccess(access)); - } - } - - public SortedMap accessTrack(final String host) { - // returns mapping from Long(accesstime) to path - - SortedMap access = accessTracker.get(host); - if (access == null) return null; - // clear too old entries - synchronized (access) { - if ((access = clearTooOldAccess(access)).size() != access.size()) { - // write back to tracker - if (access.size() == 0) { - accessTracker.remove(host); - } else { - accessTracker.put(host, access); - } - } - } - return access; - } - - private SortedMap clearTooOldAccess(final SortedMap access) { - return access.tailMap(Long.valueOf(System.currentTimeMillis() - maxTrackingTime)); - } - - public Iterator accessHosts() { - // returns an iterator of hosts in tracker (String) - final HashMap> accessTrackerClone = new HashMap>(); - accessTrackerClone.putAll(accessTracker); - return accessTrackerClone.keySet().iterator(); - } public void setConfig(final Map otherConfigs) { final Iterator> i = otherConfigs.entrySet().iterator(); @@ -547,4 +493,17 @@ public abstract class serverAbstractSwitch implements serverSwitch { public void handleBusyState(final int jobs) { serverJobs = jobs; } + + public void track(String host, String accessPath) { + this.accessTracker.track(host, accessPath); + } + + public SortedMap accessTrack(String host) { + return this.accessTracker.accessTrack(host); + } + + public Iterator accessHosts() { + return this.accessTracker.accessHosts(); + } + } diff --git a/source/de/anomic/server/serverAccessTracker.java b/source/de/anomic/server/serverAccessTracker.java new file mode 100644 index 000000000..153d61dd3 --- /dev/null +++ b/source/de/anomic/server/serverAccessTracker.java @@ -0,0 +1,128 @@ +// serverAccessTracker.java +// ------------------------------------- +// (C) 2009 by Michael Peter Christen; mc@yacy.net +// first published on http://yacy.net +// Frankfurt, Germany, 20.02.2009 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.server; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; + +public class serverAccessTracker { + + private long cleanupCycle = 60000; // 1 minute + + private long maxTrackingTime; + private int maxTrackingCount; + private int maxHostCount; + private final ConcurrentHashMap> accessTracker; // mappings from requesting host to an ArrayList of serverTrack-entries + private long lastCleanup; + + public serverAccessTracker(long maxTrackingTime, int maxTrackingCount, int maxTrackingHostCount) { + this.maxTrackingTime = maxTrackingTime; + this.maxTrackingCount = maxTrackingCount; + this.maxHostCount = maxTrackingHostCount; + this.accessTracker = new ConcurrentHashMap>(); + } + + /* + * remove all entries from the access tracker where the age of the last access is greater than the given timeout + */ + private synchronized void cleanupAccessTracker() { + + if (System.currentTimeMillis() - this.lastCleanup < cleanupCycle) return; + + // clear entries which had no entry for the maxTrackingTime time + final Iterator>> i = accessTracker.entrySet().iterator(); + SortedMap track; + while (i.hasNext()) { + track = i.next().getValue(); + if (track.tailMap(Long.valueOf(System.currentTimeMillis() - maxTrackingTime)).size() == 0) { + // all entries are too old. delete the whole track + i.remove(); + } else { + // check if the maxTrackingCount is exceeded + while (track.size() > this.maxTrackingCount) { + // delete the oldest entries + track.remove(track.firstKey()); + } + } + } + + // if there are more entries left than maxTrackingCount, delete some. + while (accessTracker.size() > this.maxHostCount) { + // delete just any + accessTracker.remove(accessTracker.keys().nextElement()); + } + + this.lastCleanup = System.currentTimeMillis(); + } + + private SortedMap clearTooOldAccess(final SortedMap access) { + return access.tailMap(Long.valueOf(System.currentTimeMillis() - maxTrackingTime)); + } + + public void track(final String host, String accessPath) { + // check storage size + if (System.currentTimeMillis() - this.lastCleanup > cleanupCycle) { + cleanupAccessTracker(); + this.lastCleanup = System.currentTimeMillis(); + } + + // learn that a specific host has accessed a specific path + if (accessPath == null) accessPath="NULL"; + SortedMap track = accessTracker.get(host); + if (track == null) track = new TreeMap(); + + synchronized (track) { + track.put(Long.valueOf(System.currentTimeMillis()), accessPath); + // write back to tracker + accessTracker.put(host, clearTooOldAccess(track)); + } + } + + public SortedMap accessTrack(final String host) { + // returns mapping from Long(accesstime) to path + + SortedMap access = accessTracker.get(host); + if (access == null) return null; + // clear too old entries + synchronized (access) { + if ((access = clearTooOldAccess(access)).size() != access.size()) { + // write back to tracker + if (access.size() == 0) { + accessTracker.remove(host); + } else { + accessTracker.put(host, access); + } + } + } + return access; + } + + public Iterator accessHosts() { + // returns an iterator of hosts in tracker (String) + final HashMap> accessTrackerClone = new HashMap>(); + accessTrackerClone.putAll(accessTracker); + return accessTrackerClone.keySet().iterator(); + } +} diff --git a/source/de/anomic/tools/loaderThreads.java b/source/de/anomic/tools/loaderThreads.java index 7a5250f35..982fc9211 100644 --- a/source/de/anomic/tools/loaderThreads.java +++ b/source/de/anomic/tools/loaderThreads.java @@ -25,7 +25,7 @@ import java.util.ArrayList; import java.util.Hashtable; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.FileUtils; @@ -125,7 +125,7 @@ public class loaderThreads { try { final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); - page = HttpClient.wget(url.toString(), reqHeader, timeout); + page = httpClient.wget(url.toString(), reqHeader, timeout); loaded = true; process.feed(page); if (process.status() == loaderCore.STATUS_FAILED) { diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java index 97f270700..a2d1974fe 100644 --- a/source/de/anomic/urlRedirector/urlRedirectord.java +++ b/source/de/anomic/urlRedirector/urlRedirectord.java @@ -10,7 +10,7 @@ import java.util.Date; import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.data.userDB; -import de.anomic.http.HttpClient; +import de.anomic.http.httpClient; import de.anomic.http.httpResponseHeader; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaParser; @@ -182,7 +182,7 @@ public class urlRedirectord implements serverHandler, Cloneable { final yacyURL reqURL = new yacyURL(this.nextURL, null); // getting URL mimeType - final httpResponseHeader header = HttpClient.whead(reqURL.toString()); + final httpResponseHeader header = httpClient.whead(reqURL.toString()); if (plasmaParser.supportedContent( plasmaParser.PARSER_MODE_URLREDIRECTOR, diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index e35a06e84..23cf55ee3 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -62,9 +62,8 @@ import de.anomic.crawler.HTTPLoader; import de.anomic.crawler.ResultURLs; import de.anomic.http.DefaultCharsetFilePart; import de.anomic.http.DefaultCharsetStringPart; -import de.anomic.http.HttpClient; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRequestHeader; import de.anomic.index.indexContainer; @@ -272,10 +271,10 @@ public final class yacyClient { final httpRequestHeader header = new httpRequestHeader(); header.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); header.put(httpRequestHeader.HOST, vhost); - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(timeout, header); + final httpClient client = new httpClient(timeout, header); client.setProxy(proxyConfig()); - JakartaCommonsHttpResponse res = null; + httpResponse res = null; byte[] content = null; try { // send request/data @@ -1080,7 +1079,7 @@ public final class yacyClient { final httpRequestHeader reqHeader = new httpRequestHeader(); reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent); - final byte[] content = HttpClient.wget( + final byte[] content = httpClient.wget( "http://" + target.getPublicAddress() + "/yacy/search.html" + "?myseed=" + sb.webIndex.seedDB.mySeed().genSeedStr(null) + "&youare=" + target.hash + "&key=" + diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index 92da4b0c5..b5fc0ebde 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -41,8 +41,8 @@ import java.util.Map; import java.util.TreeMap; import de.anomic.crawler.HTTPLoader; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.http.httpRequestHeader; import de.anomic.http.httpd; import de.anomic.http.httpdAlternativeDomainNames; @@ -823,9 +823,9 @@ public final class yacySeedDB implements httpdAlternativeDomainNames { reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); // init http-client - final JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(10000, reqHeader); + final httpClient client = new httpClient(10000, reqHeader); byte[] content = null; - JakartaCommonsHttpResponse res = null; + httpResponse res = null; try { // send request res = client.GET(seedURL.toString()); diff --git a/source/de/anomic/yacy/yacyVersion.java b/source/de/anomic/yacy/yacyVersion.java index 78fdd73b3..455652538 100644 --- a/source/de/anomic/yacy/yacyVersion.java +++ b/source/de/anomic/yacy/yacyVersion.java @@ -44,8 +44,8 @@ import java.util.regex.Pattern; import de.anomic.crawler.HTTPLoader; import de.anomic.htmlFilter.htmlFilterContentScraper; -import de.anomic.http.JakartaCommonsHttpClient; -import de.anomic.http.JakartaCommonsHttpResponse; +import de.anomic.http.httpClient; +import de.anomic.http.httpResponse; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.Log; @@ -346,8 +346,8 @@ public final class yacyVersion implements Comparator, Comparable