diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index b63d9b9bd..81c106a42 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -109,15 +109,13 @@ : - - - - - - + + +
on URLs for Crawling:
+
on URLs for Crawling:
Restrict to start domain(s)
Restrict to sub-path(s)
Use filter
on IPs for Crawling:
on URLs for Indexing
on IPs for Crawling:
on URLs for Indexing
@@ -131,9 +129,9 @@ : - - - + + +
on URLs for Crawling:
on IPs for Crawling:
on URLs for Indexing:
on URLs for Crawling:
on IPs for Crawling:
on URLs for Indexing:
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 58a18a10d..eaedb7e71 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -24,9 +24,11 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; +import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; @@ -294,8 +296,7 @@ public class Crawler_p { siteFilter = CrawlProfile.siteFilter(rootURLs); if (deleteold) { for (DigestURI u: rootURLs) { - int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1); - if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost()); + sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1); } } } else if (subPath) { @@ -366,14 +367,17 @@ public class Crawler_p { try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (SpaceExceededException e1) {} // delete all error urls for that domain + List hosthashes = new ArrayList(); for (DigestURI u: rootURLs) { - String hosthash = u.hosthash(); + hosthashes.add(ASCII.getBytes(u.hosthash())); + } + sb.crawlQueues.errorURL.removeHost(hosthashes, true); + for (byte[] hosthash: hosthashes) { try { - sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash)); - sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]"); - sb.index.fulltext().commit(true); + sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]"); } catch (IOException e) {Log.logException(e);} } + sb.index.fulltext().commit(true); // start the crawl if ("url".equals(crawlingMode)) { diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index c5b95db64..7a6278480 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -255,12 +255,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public int deleteByQuery(final String querystring) throws IOException { - int count = 0; - if (this.solr0 != null) count += this.solr0.deleteByQuery(querystring); - if (this.solr1 != null) count += this.solr1.deleteByQuery(querystring); + public void deleteByQuery(final String querystring) throws IOException { + if (this.solr0 != null) this.solr0.deleteByQuery(querystring); + if (this.solr1 != null) this.solr1.deleteByQuery(querystring); this.clearCache(); - return count; } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java index d34518901..8e37f00f0 100644 --- a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java @@ -146,8 +146,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr } @Override - public int deleteByQuery(final String querystring) throws IOException { - return this.solr.deleteByQuery(querystring); + public void deleteByQuery(final String querystring) throws IOException { + this.solr.deleteByQuery(querystring); } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java index f9de62b97..491bca8a7 100644 --- a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java @@ -122,18 +122,18 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public int deleteByQuery(final String querystring) throws IOException { + public void deleteByQuery(final String querystring) throws IOException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - return this.solrConnector.deleteByQuery(querystring); + this.solrConnector.deleteByQuery(querystring); + return; } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} continue; } if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage()); - return 0; } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java index 386a1c34b..a3dcbf9fb 100644 --- a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java @@ -120,10 +120,8 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public int deleteByQuery(final String querystring) throws IOException { - int count = 0; - for (final SolrConnector connector: this.connectors) count += connector.deleteByQuery(querystring); - return count; + public void deleteByQuery(final String querystring) throws IOException { + for (final SolrConnector connector: this.connectors) connector.deleteByQuery(querystring); } /** diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index b05094939..5329abb4f 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -90,7 +90,7 @@ public interface SolrConnector extends Iterable /* Iterable of document * @return the number of deletions * @throws IOException */ - public int deleteByQuery(final String querystring) throws IOException; + public void deleteByQuery(final String querystring) throws IOException; /** * check if a given key exists in solr at the field fieldName diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index a0f65a5df..88ef095a5 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -242,14 +242,10 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen * @throws IOException */ @Override - public int deleteByQuery(final String querystring) throws IOException { + public void deleteByQuery(final String querystring) throws IOException { try { synchronized (this.server) { - long c0 = this.getQueryCount(querystring); this.server.deleteByQuery(querystring, this.commitWithinMs); - this.commit(true); - long c1 = this.getQueryCount(querystring); - return (int) (c1 - c0); } } catch (final Throwable e) { throw new IOException(e); diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 91c2b3d74..8006a9835 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -605,7 +605,7 @@ public class HTTPClient { } catch (final IOException e) { ConnectionInfo.removeConnection(httpUriRequest.hashCode()); httpUriRequest.abort(); - throw new IOException("Client can't execute: " + e.getMessage()); + throw new IOException("Client can't execute: " + e.getCause().getMessage()); } } diff --git a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java index 134c71c0d..7dc9b0526 100644 --- a/source/net/yacy/crawler/data/ZURL.java +++ b/source/net/yacy/crawler/data/ZURL.java @@ -147,20 +147,31 @@ public class ZURL implements Iterable { } } - public void removeHost(final byte[] hosthash) throws IOException { - if (hosthash == null) return; - Iterator i = this.urlIndex.keys(true, null); - List r = new ArrayList(); - while (i.hasNext()) { - byte[] b = i.next(); - if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b); - } - for (byte[] b: r) this.urlIndex.remove(b); - i = this.stack.iterator(); - while (i.hasNext()) { - byte[] b = i.next(); - if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove(); - } + public void removeHost(final Iterable hosthashes, final boolean concurrent) { + if (hosthashes == null) return; + Thread t = new Thread() { + public void run() { + try { + Iterator i = ZURL.this.urlIndex.keys(true, null); + List r = new ArrayList(); + while (i.hasNext()) { + byte[] b = i.next(); + for (byte[] hosthash: hosthashes) { + if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b); + } + } + for (byte[] b: r) ZURL.this.urlIndex.remove(b); + i = ZURL.this.stack.iterator(); + while (i.hasNext()) { + byte[] b = i.next(); + for (byte[] hosthash: hosthashes) { + if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove(); + } + } + } catch (IOException e) {} + } + }; + if (concurrent) t.start(); else t.run(); } public void push( diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 27138a7f1..8b50d2d4b 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -30,7 +30,6 @@ import java.io.PrintWriter; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Date; -import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -48,7 +47,6 @@ import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.connector.MirrorSolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.order.CloneableIterator; -import net.yacy.cora.sorting.ConcurrentScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.storage.ZIPReader; @@ -206,6 +204,7 @@ public final class Fulltext { this.urlIndexFile.clear(); } this.statsDump = null; + this.getSolr().commit(true); } public void clearLocalSolr() throws IOException { @@ -356,22 +355,19 @@ public final class Fulltext { * here such a fragment can be used to delete all these domains at once * @param hosthash the hash of the host to be deleted * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted - * @return number of deleted domains * @throws IOException */ - public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) { + public void deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) { // first collect all url hashes that belong to the domain assert hosthash.length() == 6; final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); - final AtomicInteger count = new AtomicInteger(0); Thread t = new Thread() { public void run() { // delete in solr synchronized (Fulltext.this.solr) { try { - count.addAndGet(Fulltext.this.solr.deleteByQuery(q)); - if (count.get() > 0) Fulltext.this.solr.commit(true); + Fulltext.this.solr.deleteByQuery(q); } catch (IOException e) {} } @@ -408,22 +404,22 @@ public final class Fulltext { } } }; - if (concurrent) t.start(); else t.run(); - return count.get(); + if (concurrent) t.start(); else { + t.run(); + Fulltext.this.getSolr().commit(true); + } } - public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) { + public void deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) { // first collect all url hashes that belong to the domain final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" + ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); - final AtomicInteger count = new AtomicInteger(0); Thread t = new Thread() { public void run() { // delete in solr synchronized (Fulltext.this.solr) { try { - count.addAndGet(Fulltext.this.solr.deleteByQuery(q)); - if (count.get() > 0) Fulltext.this.solr.commit(true); + Fulltext.this.solr.deleteByQuery(q); } catch (IOException e) {} } // finally remove the line with statistics @@ -440,8 +436,10 @@ public final class Fulltext { } } }; - if (concurrent) t.start(); else t.run(); - return count.get(); + if (concurrent) t.start(); else { + t.run(); + Fulltext.this.getSolr().commit(true); + } } /** @@ -748,42 +746,7 @@ public final class Fulltext { } } - - /** - * calculate a score map for url hash samples: each sample is a single url hash - * that stands for all entries for the corresponding domain. The map counts the number - * of occurrences of the domain - * @param domainSamples a map from domain hashes to hash statistics - * @return a map from url hash samples to counters - */ - public ScoreMap urlSampleScores(final Map domainSamples) { - final ScoreMap urlSampleScore = new ConcurrentScoreMap(); - for (final Map.Entry e: domainSamples.entrySet()) { - urlSampleScore.inc(ASCII.String(e.getValue().urlhashb), e.getValue().count); - } - return urlSampleScore; - } - - /** - * calculate all domain names for all domain hashes - * @param domainSamples a map from domain hashes to hash statistics - * @return a map from domain hashes to host stats including domain names - */ - public Map domainHashResolver(final Map domainSamples) { - final HashMap hostMap = new HashMap(); - - final ScoreMap hosthashScore = new ConcurrentScoreMap(); - for (final Map.Entry e: domainSamples.entrySet()) { - hosthashScore.inc(ASCII.String(e.getValue().urlhashb, 6, 6), e.getValue().count); - } - DigestURI url; - for (final Map.Entry e: domainSamples.entrySet()) { - url = this.getURL(e.getValue().urlhashb); - hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey()))); - } - return hostMap; - } - + public Iterator statistics(int count, final ScoreMap domainScore) { // prevent too heavy IO. if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator(); @@ -809,15 +772,6 @@ public final class Fulltext { return (this.statsDump == null) ? new ArrayList().iterator() : this.statsDump.iterator(); } - private static class URLHashCounter { - public byte[] urlhashb; - public int count; - public URLHashCounter(final byte[] urlhashb) { - this.urlhashb = urlhashb; - this.count = 1; - } - } - public static class HostStat { public String hostname, hosthash; public int port;