diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 57e5fc32c..0bcde88ad 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -392,7 +392,7 @@ public class Crawler_p { for (DigestURI u: rootURLs) { hosthashes.add(ASCII.getBytes(u.hosthash())); } - sb.crawlQueues.errorURL.removeHost(hosthashes, true); + sb.crawlQueues.errorURL.removeHosts(hosthashes, false); for (byte[] hosthash: hosthashes) { try { String deletequery = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"; diff --git a/source/net/yacy/cora/document/ASCII.java b/source/net/yacy/cora/document/ASCII.java index 5b0cd36b6..578616fa6 100644 --- a/source/net/yacy/cora/document/ASCII.java +++ b/source/net/yacy/cora/document/ASCII.java @@ -133,4 +133,15 @@ public class ASCII implements Comparator { } return b; } + + public final static byte[] getBytes(final String s, final int beginIndex, final int endIndex) { + assert s != null; + //assert s.length() < 3 || s.charAt(2) != '@'; + int count = endIndex - beginIndex; + final byte[] b = new byte[count]; + for (int i = 0; i < count; i++) { + b[i] = (byte) s.charAt(i + beginIndex); + } + return b; + } } \ No newline at end of file diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 02c487f33..df0f49d34 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -27,7 +27,9 @@ package net.yacy.crawler; import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Date; +import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; @@ -184,6 +186,9 @@ public final class CrawlStacker { final byte[] urlhash = url.hash(); if (replace) { this.indexSegment.fulltext().remove(urlhash); + byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); + List hosthashes = new ArrayList(); hosthashes.add(hosthash); + this.nextQueue.errorURL.removeHosts(hosthashes, false); this.nextQueue.removeURL(urlhash); String u = url.toNormalform(true); if (u.endsWith("/")) { diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 80316f187..bff9bce48 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -161,6 +161,7 @@ public class CrawlQueues { } public void removeURL(final byte[] hash) { + assert hash != null && hash.length == 12; this.noticeURL.removeByURLHash(hash); this.delegatedURL.remove(hash); this.errorURL.remove(hash); diff --git a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java index 6970819fd..226907325 100644 --- a/source/net/yacy/crawler/data/ZURL.java +++ b/source/net/yacy/crawler/data/ZURL.java @@ -136,13 +136,18 @@ public class ZURL implements Iterable { if (hash == null) return false; //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " remove " + hash); try { + Iterator i = ZURL.this.stack.iterator(); + while (i.hasNext()) { + byte[] b = i.next(); + if (NaturalOrder.naturalOrder.equal(hash, b)) i.remove(); + } return this.urlIndex.delete(hash); } catch (final IOException e) { return false; } } - public void removeHost(final Iterable hosthashes, final boolean concurrent) { + public void removeHosts(final Iterable hosthashes, final boolean concurrent) { if (hosthashes == null) return; Thread t = new Thread() { public void run() { diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 09ac43511..248a9e8d5 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -34,7 +34,6 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.crawler.data.ZURL.FailCategory; diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 84394ca7f..566ba979b 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2903,6 +2903,9 @@ public final class Switchboard extends serverSwitch { } // remove the document from the error-db + byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); + List hosthashes = new ArrayList(); hosthashes.add(hosthash); + this.crawlQueues.errorURL.removeHosts(hosthashes, false); this.crawlQueues.removeURL(urlhash); // get a scraper to get the title