From b6c21671436c05aa890046ada2e4f4734388042e Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 1 Apr 2009 13:21:47 +0000 Subject: [PATCH] - patch for bad web structure dumps - added automatic slow down of accesses to specific domains when access to a web page fails git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5765 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/Latency.java | 16 ++++++++++++++++ source/de/anomic/http/httpClient.java | 8 ++++++++ source/de/anomic/plasma/plasmaWebStructure.java | 8 +++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java index 06fda7735..2e2bc57fc 100644 --- a/source/de/anomic/crawler/Latency.java +++ b/source/de/anomic/crawler/Latency.java @@ -45,6 +45,17 @@ public class Latency { } } + public static void slowdown(String hosthash, String host) { + assert hosthash.length() == 6; + Host h = map.get(hosthash); + if (h == null) { + h = new Host(host, 3000); + map.put(hosthash, h); + } else { + h.slowdown(); + } + } + public static Host host(String hosthash) { assert hosthash.length() == 6; return map.get(hosthash); @@ -172,6 +183,11 @@ public class Latency { this.timeacc += time; this.count++; } + public void slowdown() { + this.lastacc = System.currentTimeMillis(); + this.timeacc = Math.min(60000, average() * 5); + this.count = 1; + } public int count() { return this.count; } diff --git a/source/de/anomic/http/httpClient.java b/source/de/anomic/http/httpClient.java index 6afaa8034..91ab16a4c 100644 --- a/source/de/anomic/http/httpClient.java +++ b/source/de/anomic/http/httpClient.java @@ -59,8 +59,10 @@ import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.commons.httpclient.protocol.Protocol; import org.apache.commons.httpclient.protocol.ProtocolSocketFactory; +import de.anomic.crawler.Latency; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.util.Log; +import 
de.anomic.yacy.yacyURL; /** * HttpClient implementation which uses Jakarta Commons HttpClient 3.x {@link http://hc.apache.org/httpclient-3.x/} @@ -448,14 +450,20 @@ public class httpClient { } } catch (final IllegalThreadStateException e) { // cleanUp statistics + yacyURL url = new yacyURL(method.getURI().toString(), null); + Latency.slowdown(url.hash().substring(6), url.getHost()); HttpConnectionInfo.removeConnection(generateConInfo(method)); throw e; } catch (final IOException e) { // cleanUp statistics + yacyURL url = new yacyURL(method.getURI().toString(), null); + Latency.slowdown(url.hash().substring(6), url.getHost()); HttpConnectionInfo.removeConnection(generateConInfo(method)); throw e; } catch (final IllegalStateException e) { // cleanUp statistics + yacyURL url = new yacyURL(method.getURI().toString(), null); + Latency.slowdown(url.hash().substring(6), url.getHost()); HttpConnectionInfo.removeConnection(generateConInfo(method)); throw new IOException(e.getMessage()); } diff --git a/source/de/anomic/plasma/plasmaWebStructure.java b/source/de/anomic/plasma/plasmaWebStructure.java index d67608cb1..a50f9eecf 100644 --- a/source/de/anomic/plasma/plasmaWebStructure.java +++ b/source/de/anomic/plasma/plasmaWebStructure.java @@ -184,9 +184,15 @@ public class plasmaWebStructure { final Map map = new HashMap(); String c; final int refsc = refstr2count(refs); + int d; for (int i = 0; i < refsc; i++) { c = refs.substring(8 + i * 10, 8 + (i + 1) * 10); - map.put(c.substring(0, 6), Integer.valueOf(c.substring(6), 16)); + try { + d = Integer.valueOf(c.substring(6), 16); + } catch (NumberFormatException e) { + d = 1; + } + map.put(c.substring(0, 6), d); } return map; }