From 791e1dcfdf6cbfb6df3d31db8337eef88b0e5274 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 5 Nov 2012 22:14:27 +0100 Subject: [PATCH] when a new crawl is started, delete all entries about error-urls for crawl-start domains --- htroot/Crawler_p.java | 11 ++++++ source/net/yacy/crawler/data/ZURL.java | 47 ++++++++++++++----------- source/net/yacy/search/Switchboard.java | 1 - 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 5301cbeae..86c43357a 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -35,6 +35,7 @@ import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.SpaceExceededException; @@ -371,6 +372,16 @@ public class Crawler_p { sb.crawler.removePassive(handle); try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (SpaceExceededException e1) {} + // delete all error urls for that domain + for (DigestURI u: rootURLs) { + String hosthash = u.hosthash(); + try { + sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash)); + sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.name() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.name() + ":[* TO *]"); + sb.index.fulltext().commit(); + } catch (IOException e) {Log.logException(e);} + } + // start the crawl if ("url".equals(crawlingMode)) { if (rootURLs.size() == 0) { diff --git a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java index 214f1aad3..7ec8740c8 100644 --- a/source/net/yacy/crawler/data/ZURL.java +++ b/source/net/yacy/crawler/data/ZURL.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; +import java.util.List; import java.util.Queue; import java.util.concurrent.LinkedBlockingQueue; @@ -38,16 +39,15 @@ import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; -import net.yacy.cora.federate.solr.connector.ShardSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.order.Base64Order; +import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Index; import net.yacy.kelondro.index.Row; -import net.yacy.kelondro.index.RowSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.table.SplitTable; import net.yacy.kelondro.table.Table; @@ -56,7 +56,7 @@ import net.yacy.search.index.SolrConfiguration; public class ZURL implements Iterable { - public static Log log = new Log("REJECTED"); + private static Log log = new Log("REJECTED"); private static final int EcoFSBufferSize = 2000; private static final int maxStackSize = 1000; @@ -93,7 +93,7 @@ public class ZURL implements Iterable { private final SolrConnector solrConnector; private final SolrConfiguration solrConfiguration; - public ZURL( + protected ZURL( final SolrConnector solrConnector, final SolrConfiguration solrConfiguration, final File cachePath, @@ -124,21 +124,12 @@ public class ZURL implements Iterable { this.stack = new LinkedBlockingQueue(); } - public ZURL(final ShardSolrConnector solrConnector, - final SolrConfiguration solrConfiguration) { - this.solrConnector = solrConnector; - this.solrConfiguration = solrConfiguration; - // creates a new ZUR in RAM - this.urlIndex = new RowSet(rowdef); - this.stack = new LinkedBlockingQueue(); - } - - public void clear() throws IOException { + protected void clear() throws IOException { if (this.urlIndex != null) this.urlIndex.clear(); if (this.stack != null) this.stack.clear(); } - public void close() { + protected void close() { try {clear();} catch (final IOException e) {} if (this.urlIndex != null) this.urlIndex.close(); } @@ -153,6 +144,22 @@ public class ZURL implements Iterable { return false; } } + + public void removeHost(final byte[] hosthash) throws IOException { + if (hosthash == null) return; + Iterator i = this.urlIndex.keys(true, null); + List r = new ArrayList(); + while (i.hasNext()) { + byte[] b = i.next(); + if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b); + } + for (byte[] b: r) this.urlIndex.remove(b); + i = this.stack.iterator(); + while (i.hasNext()) { + byte[] b = i.next(); + if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove(); + } + } public void push( final Request bentry, @@ -259,7 +266,7 @@ public class ZURL implements Iterable { } } - public boolean exists(final byte[] urlHash) { + boolean exists(final byte[] urlHash) { return this.urlIndex.has(urlHash); } @@ -273,14 +280,14 @@ public class ZURL implements Iterable { public class Entry { - Request bentry; // the balancer entry + private Request bentry; // the balancer entry private final byte[] executor; // the crawling executor private final Date workdate; // the time when the url was last time tried to load private final int workcount; // number of tryings private final String anycause; // string describing reason for load fail private boolean stored; - protected Entry( + private Entry( final Request bentry, final byte[] executor, final Date workdate, @@ -297,7 +304,7 @@ public class ZURL implements Iterable { this.stored = false; } - protected Entry(final Row.Entry entry) throws IOException { + private Entry(final Row.Entry entry) throws IOException { assert (entry != null); this.executor = entry.getColBytes(1, true); this.workdate = new Date(entry.getColLong(2)); @@ -317,7 +324,7 @@ public class ZURL implements Iterable { return this.bentry.initiator(); } - public byte[] hash() { + private byte[] hash() { // return a url-hash, based on the md5 algorithm // the result is a String of 12 bytes within a 72-bit space // (each byte has an 6-bit range) diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 8c6ab6dcf..3927045a1 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -183,7 +183,6 @@ import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Segment; import net.yacy.search.index.SolrConfiguration; import net.yacy.search.query.AccessTracker; -import net.yacy.search.query.QueryParams; import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; import net.yacy.search.ranking.BlockRank;