Enhanced crawl start for very large crawl lists (e.g. > 5000 URLs)

The previous implementation failed on such lists because of badly used
concurrency; fixing this also required a redesign of the whole host
deletion process.
This should fix bug http://bugs.yacy.net/view.php?id=250
Michael Peter Christen 12 years ago
parent 6aabc4e5c8
commit 030d0776ff
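
The heart of the change is visible in every hunk below: instead of issuing one Solr delete per host while iterating over the crawl list, the callers now collect all host hashes into a Set first and hand the whole batch to a single delete call. A minimal standalone sketch of that pattern (BatchedErrorStore is a hypothetical stand-in for YaCy's ErrorCache; only the removeHosts(Set) shape mirrors the new API):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class BatchDeleteSketch {

        // hypothetical stand-in for ErrorCache; mirrors only the new batched method
        interface BatchedErrorStore {
            void removeHosts(Set<String> hosthashes);
        }

        static void deleteHosts(List<String> hosthashesOfUrls, BatchedErrorStore store) {
            Set<String> hosthashes = new HashSet<String>();
            hosthashes.addAll(hosthashesOfUrls); // the Set also de-duplicates hosts shared by many URLs
            store.removeHosts(hosthashes);       // one batched call instead of one call per URL
        }

        public static void main(String[] args) {
            deleteHosts(Arrays.asList("AAAAAA", "BBBBBB", "AAAAAA"),
                    hosts -> System.out.println("deleting error entries for hosts: " + hosts));
        }
    }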

CrawlResults.java
@@ -27,9 +27,11 @@
 import java.text.SimpleDateFormat;
 import java.util.Arrays;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;

 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
@@ -124,7 +126,9 @@ public class CrawlResults {
         if (post.containsKey("deletedomain")) {
             final String domain = post.get("domain", null);
             if (domain != null) {
-                sb.index.fulltext().deleteDomainHostname(domain, null);
+                Set<String> hostnames = new HashSet<String>();
+                hostnames.add(domain);
+                sb.index.fulltext().deleteStaleDomainNames(hostnames, null);
                 ResultURLs.deleteDomain(tabletype, domain);
             }
         }

Crawler_p.java
@@ -312,9 +312,9 @@ public class Crawler_p {
             if (fullDomain) {
                 siteFilter = CrawlProfile.siteFilter(rootURLs);
                 if (deleteold) {
-                    for (DigestURL u: rootURLs) {
-                        sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate);
-                    }
+                    Set<String> hosthashes = new HashSet<String>();
+                    for (DigestURL u: rootURLs) hosthashes.add(u.hosthash());
+                    sb.index.fulltext().deleteStaleDomainHashes(hosthashes, deleteageDate);
                 }
             } else if (subPath) {
                 siteFilter = CrawlProfile.subpathFilter(rootURLs);
@@ -387,10 +387,12 @@ public class Crawler_p {
                 try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {}

                 // delete all error urls for that domain
+                Set<String> hosthashes = new HashSet<String>();
                 for (DigestURL u: rootURLs) {
                     sb.index.fulltext().remove(u.hash());
-                    sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(u.hosthash()));
+                    hosthashes.add(u.hosthash());
                 }
+                sb.crawlQueues.errorURL.removeHosts(hosthashes);
                 sb.index.fulltext().commit(true);

                 // start the crawl

IndexControlURLs_p.java
@@ -28,9 +28,11 @@
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;

 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.encoding.ASCII;
@@ -294,7 +296,9 @@ public class IndexControlURLs_p {
         if (post.containsKey("deletedomain")) {
             final String domain = post.get("domain");
-            segment.fulltext().deleteDomainHostname(domain, null);
+            Set<String> hostnames = new HashSet<String>();
+            hostnames.add(domain);
+            segment.fulltext().deleteStaleDomainNames(hostnames, null);
             // trigger the loading of the table
             post.put("statistics", "");
         }


CrawlStacker.java
@@ -28,8 +28,10 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Set;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -175,6 +177,17 @@ public final class CrawlStacker {
     }

     private void enqueueEntries(final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final boolean replace) {
+        if (replace) {
+            // delete old entries, if they exist, to force a re-load of the url (that is wanted here)
+            Set<String> hosthashes = new HashSet<String>();
+            for (final AnchorURL url: hyperlinks) {
+                if (url == null) continue;
+                final byte[] urlhash = url.hash();
+                byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
+                hosthashes.add(ASCII.String(hosthash));
+            }
+            this.nextQueue.errorURL.removeHosts(hosthashes);
+        }
         for (final AnchorURL url: hyperlinks) {
             if (url == null) continue;
@@ -182,8 +195,6 @@ public final class CrawlStacker {
             final byte[] urlhash = url.hash();
             if (replace) {
                 this.indexSegment.fulltext().remove(urlhash);
-                byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
-                this.nextQueue.errorURL.removeHost(hosthash);
                 String u = url.toNormalform(true);
                 if (u.endsWith("/")) {
                     u = u + "index.html";
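
Note how the CrawlStacker hunks obtain the host hash: it is not recomputed from the URL but sliced out of the URL hash, since in YaCy's 12-byte URL hash the bytes 6..11 are the host-hash part (that is exactly what System.arraycopy(urlhash, 6, hosthash, 0, 6) does above). A standalone sketch with made-up hash values:

    import java.nio.charset.StandardCharsets;
    import java.util.HashSet;
    import java.util.Set;

    public class HostHashSketch {

        // slice the 6-byte host part out of a 12-byte URL hash, as the diff above does
        static String hosthash(byte[] urlhash) {
            byte[] hosthash = new byte[6];
            System.arraycopy(urlhash, 6, hosthash, 0, 6);
            return new String(hosthash, StandardCharsets.US_ASCII);
        }

        public static void main(String[] args) {
            // two made-up URL hashes pointing to the same host: identical last six bytes
            byte[] a = "0123ABhostXY".getBytes(StandardCharsets.US_ASCII);
            byte[] b = "9876CDhostXY".getBytes(StandardCharsets.US_ASCII);
            Set<String> hosthashes = new HashSet<String>();
            hosthashes.add(hosthash(a));
            hosthashes.add(hosthash(b));
            System.out.println(hosthashes); // [hostXY] -- both URLs collapse to one host entry
        }
    }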

Switchboard.java
@@ -59,6 +59,7 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
 import java.util.List;
@@ -2925,7 +2926,9 @@ public final class Switchboard extends serverSwitch {
             // remove the document from the error-db
             byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6);
-            this.crawlQueues.errorURL.removeHost(hosthash);
+            Set<String> hosthashes = new HashSet<String>();
+            hosthashes.add(ASCII.String(hosthash));
+            this.crawlQueues.errorURL.removeHosts(hosthashes);
             this.index.fulltext().remove(urlhash);
             // get a scraper to get the title

ErrorCache.java
@@ -26,6 +26,7 @@ import java.util.Collection;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.Map;
+import java.util.Set;

 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrQuery.SortClause;
@@ -37,7 +38,6 @@ import org.apache.solr.common.SolrInputDocument;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.FailCategory;
-import net.yacy.cora.order.NaturalOrder;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.search.index.Fulltext;
@@ -81,18 +81,15 @@ public class ErrorCache {
         this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
     }

-    public void removeHost(final byte[] hosthash) {
-        if (hosthash == null) return;
-        try {
-            this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+    public void removeHosts(final Set<String> hosthashes) {
+        if (hosthashes == null || hosthashes.size() == 0) return;
+        this.fulltext.deleteDomainErrors(hosthashes);
         synchronized (this.stack) {
             Iterator<String> i = ErrorCache.this.stack.keySet().iterator();
             while (i.hasNext()) {
                 String b = i.next();
-                if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) i.remove();
+                if (hosthashes.contains(b)) i.remove();
             }
         }
-        } catch (final IOException e) {
-        }
     }

Fulltext.java
@@ -440,23 +440,15 @@ public final class Fulltext {
      * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
      * @throws IOException
      */
-    public void deleteDomainHashpart(final String hosthash, Date freshdate) {
-        // first collect all url hashes that belong to the domain
-        assert hosthash.length() == 6;
-        final String collection1Query = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
-            ((freshdate != null && freshdate.before(new Date())) ?
-                (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
-                ""
-            );
-        final String webgraphQuery = WebgraphSchema.source_host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
-            ((freshdate != null && freshdate.before(new Date())) ?
-                (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
-                ""
-            );
+    public void deleteStaleDomainHashes(final Set<String> hosthashes, Date freshdate) {
         // delete in solr
-        try {Fulltext.this.getDefaultConnector().deleteByQuery(collection1Query);} catch (final IOException e) {}
-        if (this.writeWebgraph) try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (final IOException e) {}
+        Date now = new Date();
+        deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes,
+                (freshdate == null || freshdate.after(now)) ? null :
+                (CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
+        if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_id_s.getSolrFieldName(), hosthashes,
+                (freshdate == null || freshdate.after(now)) ? null :
+                (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));

         // delete in old metadata structure
         if (Fulltext.this.urlIndexFile != null) {
@@ -467,7 +459,7 @@ public final class Fulltext {
                 String hash;
                 while (i != null && i.hasNext()) {
                     hash = ASCII.String(i.next());
-                    if (hosthash.equals(hash.substring(6))) l.add(hash);
+                    if (hosthashes.contains(hash.substring(6))) l.add(hash);
                 }

                 // then delete the urls using this list
@@ -481,32 +473,20 @@ public final class Fulltext {
             HostStat hs;
             while (hsi.hasNext()) {
                 hs = hsi.next();
-                if (hs.hosthash.equals(hosthash)) {
-                    hsi.remove();
-                    break;
-                }
+                if (hosthashes.contains(hs.hosthash)) hsi.remove();
             }
         }
     }

-    public void deleteDomainHostname(final String hostname, Date freshdate) {
-        // first collect all url hashes that belong to the domain
-        final String collectionQuery =
-                CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
-                ((freshdate != null && freshdate.before(new Date())) ?
-                    (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
-                    ""
-                );
-        final String webgraphQuery =
-                WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
-                ((freshdate != null && freshdate.before(new Date())) ?
-                    (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") :
-                    ""
-                );
+    public void deleteStaleDomainNames(final Set<String> hostnames, Date freshdate) {
         // delete in solr
-        try {Fulltext.this.getDefaultConnector().deleteByQuery(collectionQuery);} catch (final IOException e) {}
-        if (this.writeWebgraph) try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (final IOException e) {}
+        Date now = new Date();
+        deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames,
+                (freshdate == null || freshdate.after(now)) ? null :
+                (CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
+        if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_s.getSolrFieldName(), hostnames,
+                (freshdate == null || freshdate.after(now)) ? null :
+                (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));

         // finally remove the line with statistics
         if (Fulltext.this.statsDump != null) {
@@ -514,12 +494,39 @@ public final class Fulltext {
             HostStat hs;
             while (hsi.hasNext()) {
                 hs = hsi.next();
-                if (hs.hostname.equals(hostname)) {
-                    hsi.remove();
-                    break;
-                }
+                if (hostnames.contains(hs.hostname)) hsi.remove();
             }
         }
     }

+    /**
+     * delete all documents within a domain that are registered as error documents
+     * @param hosthashes
+     */
+    public void deleteDomainErrors(final Set<String> hosthashes) {
+        deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]");
+    }
+
+    private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set<String> hosthashes, String constraintQuery) {
+        if (hosthashes == null || hosthashes.size() == 0) return;
+        int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception
+        int c = 0;
+        @SuppressWarnings("unchecked")
+        List<String>[] subsets = new ArrayList[subsetscount];
+        for (int i = 0; i < subsetscount; i++) subsets[i] = new ArrayList<String>();
+        for (String hosthash: hosthashes) subsets[c++ % subsetscount].add(hosthash);
+        for (List<String> subset: subsets) {
+            try {
+                StringBuilder query = new StringBuilder();
+                for (String hosthash: subset) {
+                    if (query.length() > 0) query.append(" OR ");
+                    //query.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append('"');
+                    query.append("({!raw f=").append(fieldname).append('}').append(hosthash).append(")");
+                }
+                if (constraintQuery == null) connector.deleteByQuery(query.toString()); else connector.deleteByQuery("(" + query.toString() + ") AND " + constraintQuery);
+            } catch (final IOException e) {
+            }
+        }
+    }
+
     /**
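
The new deleteDomainWithConstraint above is where lists of many thousands of hosts are actually made workable: Solr rejects boolean queries with too many clauses (the maxBooleanClauses limit, 1024 by default), so the host set is split round-robin into 1 + n/255 subsets and one delete-by-query is issued per subset, each clause using Solr's {!raw} term parser for an exact, analysis-free match on the hash field. A standalone sketch of just the query construction (the field and constraint strings come from the diff; the name buildDeleteQueries is mine):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.LinkedHashSet;
    import java.util.List;
    import java.util.Set;

    public class PartitionedDeleteSketch {

        // build one OR-query per subset of at most ~255 host hashes, mirroring deleteDomainWithConstraint
        static List<String> buildDeleteQueries(Set<String> hosthashes, String fieldname, String constraintQuery) {
            int subsetscount = 1 + (hosthashes.size() / 255); // stay far below Solr's maxBooleanClauses limit
            List<List<String>> subsets = new ArrayList<List<String>>();
            for (int i = 0; i < subsetscount; i++) subsets.add(new ArrayList<String>());
            int c = 0;
            for (String hosthash : hosthashes) subsets.get(c++ % subsetscount).add(hosthash); // round-robin split
            List<String> queries = new ArrayList<String>();
            for (List<String> subset : subsets) {
                StringBuilder query = new StringBuilder();
                for (String hosthash : subset) {
                    if (query.length() > 0) query.append(" OR ");
                    query.append("({!raw f=").append(fieldname).append('}').append(hosthash).append(')');
                }
                queries.add(constraintQuery == null ? query.toString() : "(" + query + ") AND " + constraintQuery);
            }
            return queries;
        }

        public static void main(String[] args) {
            Set<String> hosts = new LinkedHashSet<String>(Arrays.asList("AAAAAA", "BBBBBB"));
            for (String q : buildDeleteQueries(hosts, "host_id_s", "failreason_s:[* TO *]")) {
                System.out.println(q);
                // (({!raw f=host_id_s}AAAAAA) OR ({!raw f=host_id_s}BBBBBB)) AND failreason_s:[* TO *]
            }
        }
    }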
