diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index b63d9b9bd..81c106a42 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -109,15 +109,13 @@
: |
|
@@ -131,9 +129,9 @@
| : |
|
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 58a18a10d..eaedb7e71 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -24,9 +24,11 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
+import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
@@ -294,8 +296,7 @@ public class Crawler_p {
siteFilter = CrawlProfile.siteFilter(rootURLs);
if (deleteold) {
for (DigestURI u: rootURLs) {
- int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
- if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
+ sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
}
}
} else if (subPath) {
@@ -366,14 +367,17 @@ public class Crawler_p {
try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (SpaceExceededException e1) {}
// delete all error urls for that domain
+ List<byte[]> hosthashes = new ArrayList<byte[]>();
for (DigestURI u: rootURLs) {
- String hosthash = u.hosthash();
+ hosthashes.add(ASCII.getBytes(u.hosthash()));
+ }
+ sb.crawlQueues.errorURL.removeHost(hosthashes, true);
+ for (byte[] hosthash: hosthashes) {
try {
- sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash));
- sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]");
- sb.index.fulltext().commit(true);
+ sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]");
} catch (IOException e) {Log.logException(e);}
}
+ sb.index.fulltext().commit(true);
// start the crawl
if ("url".equals(crawlingMode)) {
diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java
index c5b95db64..7a6278480 100644
--- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java
@@ -255,12 +255,10 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
- public int deleteByQuery(final String querystring) throws IOException {
- int count = 0;
- if (this.solr0 != null) count += this.solr0.deleteByQuery(querystring);
- if (this.solr1 != null) count += this.solr1.deleteByQuery(querystring);
+ public void deleteByQuery(final String querystring) throws IOException {
+ if (this.solr0 != null) this.solr0.deleteByQuery(querystring);
+ if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
this.clearCache();
- return count;
}
@Override
diff --git a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java
index d34518901..8e37f00f0 100644
--- a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java
@@ -146,8 +146,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr
}
@Override
- public int deleteByQuery(final String querystring) throws IOException {
- return this.solr.deleteByQuery(querystring);
+ public void deleteByQuery(final String querystring) throws IOException {
+ this.solr.deleteByQuery(querystring);
}
@Override
diff --git a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java
index f9de62b97..491bca8a7 100644
--- a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java
@@ -122,18 +122,18 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
- public int deleteByQuery(final String querystring) throws IOException {
+ public void deleteByQuery(final String querystring) throws IOException {
final long t = System.currentTimeMillis() + this.retryMaxTime;
Throwable ee = null;
while (System.currentTimeMillis() < t) try {
- return this.solrConnector.deleteByQuery(querystring);
+ this.solrConnector.deleteByQuery(querystring);
+ return;
} catch (final Throwable e) {
ee = e;
try {Thread.sleep(10);} catch (final InterruptedException e1) {}
continue;
}
if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage());
- return 0;
}
@Override
diff --git a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java
index 386a1c34b..a3dcbf9fb 100644
--- a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java
@@ -120,10 +120,8 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon
}
@Override
- public int deleteByQuery(final String querystring) throws IOException {
- int count = 0;
- for (final SolrConnector connector: this.connectors) count += connector.deleteByQuery(querystring);
- return count;
+ public void deleteByQuery(final String querystring) throws IOException {
+ for (final SolrConnector connector: this.connectors) connector.deleteByQuery(querystring);
}
/**
diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
index b05094939..5329abb4f 100644
--- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
@@ -90,7 +90,6 @@ public interface SolrConnector extends Iterable /* Iterable of document
- * @return the number of deletions
* @throws IOException
*/
- public int deleteByQuery(final String querystring) throws IOException;
+ public void deleteByQuery(final String querystring) throws IOException;
/**
* check if a given key exists in solr at the field fieldName
diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java
index a0f65a5df..88ef095a5 100644
--- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java
@@ -242,14 +242,10 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
* @throws IOException
*/
@Override
- public int deleteByQuery(final String querystring) throws IOException {
+ public void deleteByQuery(final String querystring) throws IOException {
try {
synchronized (this.server) {
- long c0 = this.getQueryCount(querystring);
this.server.deleteByQuery(querystring, this.commitWithinMs);
- this.commit(true);
- long c1 = this.getQueryCount(querystring);
- return (int) (c1 - c0);
}
} catch (final Throwable e) {
throw new IOException(e);
diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java
index 91c2b3d74..8006a9835 100644
--- a/source/net/yacy/cora/protocol/http/HTTPClient.java
+++ b/source/net/yacy/cora/protocol/http/HTTPClient.java
@@ -605,7 +605,7 @@ public class HTTPClient {
} catch (final IOException e) {
ConnectionInfo.removeConnection(httpUriRequest.hashCode());
httpUriRequest.abort();
- throw new IOException("Client can't execute: " + e.getMessage());
+ throw new IOException("Client can't execute: " + (e.getCause() == null ? e.getMessage() : e.getCause().getMessage()), e); // getCause() may be null
}
}
diff --git a/source/net/yacy/crawler/data/ZURL.java b/source/net/yacy/crawler/data/ZURL.java
index 134c71c0d..7dc9b0526 100644
--- a/source/net/yacy/crawler/data/ZURL.java
+++ b/source/net/yacy/crawler/data/ZURL.java
@@ -147,20 +147,31 @@ public class ZURL implements Iterable {
}
}
- public void removeHost(final byte[] hosthash) throws IOException {
- if (hosthash == null) return;
- Iterator i = this.urlIndex.keys(true, null);
- List r = new ArrayList();
- while (i.hasNext()) {
- byte[] b = i.next();
- if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) r.add(b);
- }
- for (byte[] b: r) this.urlIndex.remove(b);
- i = this.stack.iterator();
- while (i.hasNext()) {
- byte[] b = i.next();
- if (NaturalOrder.naturalOrder.equal(hosthash, 0, b, 6, 6)) i.remove();
- }
+    /**
+     * Removes every entry whose url hash belongs to one of the given hosts from the url index and from the stack.
+     * @param hosthashes host hashes, each compared against the six bytes at offset 6 of a url hash; null is a no-op
+     * @param concurrent if true the removal runs in a newly started thread, otherwise in the calling thread
+     */
+    public void removeHost(final Iterable<byte[]> hosthashes, final boolean concurrent) {
+        if (hosthashes == null) return;
+        final Thread t = new Thread() {
+            @Override
+            public void run() {
+                try {
+                    final List<byte[]> r = new ArrayList<byte[]>();
+                    final Iterator<byte[]> k = ZURL.this.urlIndex.keys(true, null);
+                    while (k.hasNext()) {final byte[] b = k.next(); if (matches(b)) r.add(b);}
+                    for (final byte[] b: r) ZURL.this.urlIndex.remove(b);
+                    // matches() answers once per entry, so Iterator.remove() is never called twice for one next()
+                    for (final Iterator<byte[]> s = ZURL.this.stack.iterator(); s.hasNext();) if (matches(s.next())) s.remove();
+                } catch (final IOException e) {java.util.logging.Logger.getLogger(ZURL.class.getName()).log(java.util.logging.Level.WARNING, "removeHost failed", e);}
+            }
+            private boolean matches(final byte[] urlhash) {
+                for (final byte[] h: hosthashes) if (NaturalOrder.naturalOrder.equal(h, 0, urlhash, 6, 6)) return true;
+                return false;
+            }
+        };
+        if (concurrent) t.start(); else t.run();
+    }
public void push(
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 27138a7f1..8b50d2d4b 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -30,7 +30,6 @@ import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -48,7 +47,6 @@ import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.order.CloneableIterator;
-import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.storage.ZIPReader;
@@ -206,6 +204,7 @@ public final class Fulltext {
this.urlIndexFile.clear();
}
this.statsDump = null;
+ this.getSolr().commit(true);
}
public void clearLocalSolr() throws IOException {
@@ -356,22 +355,19 @@ public final class Fulltext {
* here such a fragment can be used to delete all these domains at once
* @param hosthash the hash of the host to be deleted
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
- * @return number of deleted domains
* @throws IOException
*/
- public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
+ public void deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
assert hosthash.length() == 6;
final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
- final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
- count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
- if (count.get() > 0) Fulltext.this.solr.commit(true);
+ Fulltext.this.solr.deleteByQuery(q);
} catch (IOException e) {}
}
@@ -408,22 +404,22 @@ public final class Fulltext {
}
}
};
- if (concurrent) t.start(); else t.run();
- return count.get();
+ if (concurrent) t.start(); else {
+ t.run();
+ Fulltext.this.getSolr().commit(true);
+ }
}
- public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
+ public void deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
// first collect all url hashes that belong to the domain
final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
- final AtomicInteger count = new AtomicInteger(0);
Thread t = new Thread() {
public void run() {
// delete in solr
synchronized (Fulltext.this.solr) {
try {
- count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
- if (count.get() > 0) Fulltext.this.solr.commit(true);
+ Fulltext.this.solr.deleteByQuery(q);
} catch (IOException e) {}
}
// finally remove the line with statistics
@@ -440,8 +436,10 @@ public final class Fulltext {
}
}
};
- if (concurrent) t.start(); else t.run();
- return count.get();
+ if (concurrent) t.start(); else {
+ t.run();
+ Fulltext.this.getSolr().commit(true);
+ }
}
/**
@@ -748,42 +746,7 @@ public final class Fulltext {
}
}
-
- /**
- * calculate a score map for url hash samples: each sample is a single url hash
- * that stands for all entries for the corresponding domain. The map counts the number
- * of occurrences of the domain
- * @param domainSamples a map from domain hashes to hash statistics
- * @return a map from url hash samples to counters
- */
- public ScoreMap urlSampleScores(final Map domainSamples) {
- final ScoreMap urlSampleScore = new ConcurrentScoreMap();
- for (final Map.Entry e: domainSamples.entrySet()) {
- urlSampleScore.inc(ASCII.String(e.getValue().urlhashb), e.getValue().count);
- }
- return urlSampleScore;
- }
-
- /**
- * calculate all domain names for all domain hashes
- * @param domainSamples a map from domain hashes to hash statistics
- * @return a map from domain hashes to host stats including domain names
- */
- public Map domainHashResolver(final Map domainSamples) {
- final HashMap hostMap = new HashMap();
-
- final ScoreMap hosthashScore = new ConcurrentScoreMap();
- for (final Map.Entry e: domainSamples.entrySet()) {
- hosthashScore.inc(ASCII.String(e.getValue().urlhashb, 6, 6), e.getValue().count);
- }
- DigestURI url;
- for (final Map.Entry e: domainSamples.entrySet()) {
- url = this.getURL(e.getValue().urlhashb);
- hostMap.put(e.getKey(), new HostStat(url.getHost(), url.getPort(), e.getKey(), hosthashScore.get(e.getKey())));
- }
- return hostMap;
- }
-
+
public Iterator statistics(int count, final ScoreMap domainScore) {
// prevent too heavy IO.
if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator();
@@ -809,15 +772,6 @@ public final class Fulltext {
return (this.statsDump == null) ? new ArrayList().iterator() : this.statsDump.iterator();
}
- private static class URLHashCounter {
- public byte[] urlhashb;
- public int count;
- public URLHashCounter(final byte[] urlhashb) {
- this.urlhashb = urlhashb;
- this.count = 1;
- }
- }
-
public static class HostStat {
public String hostname, hosthash;
public int port;
|