From 030d0776ff4dc6cbf23697188866ee5feb1107f8 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 24 Oct 2013 16:20:20 +0200 Subject: [PATCH] Enhanced crawl start for very, very large crawl lists (i.e. > 5000) which had a problem because of badly used concurrency. This fix also caused a redesign of the whole host deletion process. This should fix bug http://bugs.yacy.net/view.php?id=250 --- htroot/CrawlResults.java | 6 +- htroot/Crawler_p.java | 10 ++- htroot/IndexControlURLs_p.java | 6 +- source/net/yacy/crawler/CrawlStacker.java | 15 +++- source/net/yacy/search/Switchboard.java | 5 +- source/net/yacy/search/index/ErrorCache.java | 19 ++-- source/net/yacy/search/index/Fulltext.java | 91 +++++++++++--------- 7 files changed, 90 insertions(+), 62 deletions(-) diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index 0ee465dbf..9014570b4 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -27,9 +27,11 @@ import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; +import java.util.HashSet; import java.util.Iterator; import java.util.Locale; import java.util.Map; +import java.util.Set; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; @@ -124,7 +126,9 @@ public class CrawlResults { if (post.containsKey("deletedomain")) { final String domain = post.get("domain", null); if (domain != null) { - sb.index.fulltext().deleteDomainHostname(domain, null); + Set hostnames = new HashSet(); + hostnames.add(domain); + sb.index.fulltext().deleteStaleDomainNames(hostnames, null); ResultURLs.deleteDomain(tabletype, domain); } } diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 609ab5fb6..8e0d0107b 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -312,9 +312,9 @@ public class Crawler_p { if (fullDomain) { siteFilter = CrawlProfile.siteFilter(rootURLs); if (deleteold) { - for (DigestURL u: rootURLs) { - sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate); - } + Set hosthashes = new HashSet(); + for (DigestURL u: rootURLs) hosthashes.add(u.hosthash()); + sb.index.fulltext().deleteStaleDomainHashes(hosthashes, deleteageDate); } } else if (subPath) { siteFilter = CrawlProfile.subpathFilter(rootURLs); @@ -387,10 +387,12 @@ public class Crawler_p { try {sb.crawlQueues.noticeURL.removeByProfileHandle(profile.handle(), 10000);} catch (final SpaceExceededException e1) {} // delete all error urls for that domain + Set hosthashes = new HashSet(); for (DigestURL u: rootURLs) { sb.index.fulltext().remove(u.hash()); - sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(u.hosthash())); + hosthashes.add(u.hosthash()); } + sb.crawlQueues.errorURL.removeHosts(hosthashes); sb.index.fulltext().commit(true); // start the crawl diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 1fc8c91d8..94d46ba0e 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -28,9 +28,11 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.encoding.ASCII; @@ -294,7 +296,9 @@ public class IndexControlURLs_p { if (post.containsKey("deletedomain")) { final String domain = post.get("domain"); - segment.fulltext().deleteDomainHostname(domain, null); + Set hostnames = new HashSet(); + hostnames.add(domain); + segment.fulltext().deleteStaleDomainNames(hostnames, null); // trigger the loading of the table post.put("statistics", ""); } diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index c9a5a0a50..a3b5da25f 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -28,8 +28,10 @@ import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; import java.util.Date; +import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -175,6 +177,17 @@ public final class CrawlStacker { } private void enqueueEntries(final byte[] initiator, final String profileHandle, final List hyperlinks, final boolean replace) { + if (replace) { + // delete old entries, if exists to force a re-load of the url (thats wanted here) + Set hosthashes = new HashSet(); + for (final AnchorURL url: hyperlinks) { + if (url == null) continue; + final byte[] urlhash = url.hash(); + byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); + hosthashes.add(ASCII.String(hosthash)); + } + this.nextQueue.errorURL.removeHosts(hosthashes); + } for (final AnchorURL url: hyperlinks) { if (url == null) continue; @@ -182,8 +195,6 @@ public final class CrawlStacker { final byte[] urlhash = url.hash(); if (replace) { this.indexSegment.fulltext().remove(urlhash); - byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); - this.nextQueue.errorURL.removeHost(hosthash); String u = url.toNormalform(true); if (u.endsWith("/")) { u = u + "index.html"; diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index e5a36a242..c7495be85 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -59,6 +59,7 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; @@ -2925,7 +2926,9 @@ public final class Switchboard extends serverSwitch { // remove the document from the error-db byte[] hosthash = new byte[6]; System.arraycopy(urlhash, 6, hosthash, 0, 6); - this.crawlQueues.errorURL.removeHost(hosthash); + Set hosthashes = new HashSet(); + hosthashes.add(ASCII.String(hosthash)); + this.crawlQueues.errorURL.removeHosts(hosthashes); this.index.fulltext().remove(urlhash); // get a scraper to get the title diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 7bf4e3a05..d460ab680 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -26,6 +26,7 @@ import java.util.Collection; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; +import java.util.Set; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery.SortClause; @@ -37,7 +38,6 @@ import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.FailCategory; -import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.search.index.Fulltext; @@ -81,18 +81,15 @@ public class ErrorCache { this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"); } - public void removeHost(final byte[] hosthash) { - if (hosthash == null) return; - try { - this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + ASCII.String(hosthash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"); - synchronized (this.stack) { + public void removeHosts(final Set hosthashes) { + if (hosthashes == null || hosthashes.size() == 0) return; + this.fulltext.deleteDomainErrors(hosthashes); + synchronized (this.stack) { Iterator i = ErrorCache.this.stack.keySet().iterator(); - while (i.hasNext()) { - String b = i.next(); - if (NaturalOrder.naturalOrder.equal(hosthash, 0, ASCII.getBytes(b), 6, 6)) i.remove(); - } + while (i.hasNext()) { + String b = i.next(); + if (hosthashes.contains(b)) i.remove(); } - } catch (final IOException e) { } } diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index d9854fd65..40f6db2c4 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -440,23 +440,15 @@ public final class Fulltext { * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted * @throws IOException */ - public void deleteDomainHashpart(final String hosthash, Date freshdate) { - // first collect all url hashes that belong to the domain - assert hosthash.length() == 6; - final String collection1Query = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + - ((freshdate != null && freshdate.before(new Date())) ? - (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : - "" - ); - final String webgraphQuery = WebgraphSchema.source_host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" + - ((freshdate != null && freshdate.before(new Date())) ? - (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : - "" - ); - + public void deleteStaleDomainHashes(final Set hosthashes, Date freshdate) { // delete in solr - try {Fulltext.this.getDefaultConnector().deleteByQuery(collection1Query);} catch (final IOException e) {} - if (this.writeWebgraph) try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (final IOException e) {} + Date now = new Date(); + deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, + (freshdate == null || freshdate.after(now)) ? null : + (CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); + if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_id_s.getSolrFieldName(), hosthashes, + (freshdate == null || freshdate.after(now)) ? null : + (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); // delete in old metadata structure if (Fulltext.this.urlIndexFile != null) { @@ -467,7 +459,7 @@ public final class Fulltext { String hash; while (i != null && i.hasNext()) { hash = ASCII.String(i.next()); - if (hosthash.equals(hash.substring(6))) l.add(hash); + if (hosthashes.contains(hash.substring(6))) l.add(hash); } // then delete the urls using this list @@ -481,32 +473,20 @@ public final class Fulltext { HostStat hs; while (hsi.hasNext()) { hs = hsi.next(); - if (hs.hosthash.equals(hosthash)) { - hsi.remove(); - break; - } + if (hosthashes.contains(hs.hosthash)) hsi.remove(); } } } - public void deleteDomainHostname(final String hostname, Date freshdate) { - // first collect all url hashes that belong to the domain - final String collectionQuery = - CollectionSchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" + - ((freshdate != null && freshdate.before(new Date())) ? - (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : - "" - ); - final String webgraphQuery = - WebgraphSchema.source_host_s.getSolrFieldName() + ":\"" + hostname + "\"" + - ((freshdate != null && freshdate.before(new Date())) ? - (" AND " + WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : - "" - ); - - // delete in solr - try {Fulltext.this.getDefaultConnector().deleteByQuery(collectionQuery);} catch (final IOException e) {} - if (this.writeWebgraph) try {Fulltext.this.getWebgraphConnector().deleteByQuery(webgraphQuery);} catch (final IOException e) {} + public void deleteStaleDomainNames(final Set hostnames, Date freshdate) { + + Date now = new Date(); + deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames, + (freshdate == null || freshdate.after(now)) ? null : + (CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); + if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_s.getSolrFieldName(), hostnames, + (freshdate == null || freshdate.after(now)) ? null : + (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); // finally remove the line with statistics if (Fulltext.this.statsDump != null) { @@ -514,10 +494,37 @@ public final class Fulltext { HostStat hs; while (hsi.hasNext()) { hs = hsi.next(); - if (hs.hostname.equals(hostname)) { - hsi.remove(); - break; + if (hostnames.contains(hs.hostname)) hsi.remove(); + } + } + } + + /** + * delete all documents within a domain that are registered as error document + * @param hosthashes + */ + public void deleteDomainErrors(final Set hosthashes) { + deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"); + } + + private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set hosthashes, String constraintQuery) { + if (hosthashes == null || hosthashes.size() == 0) return; + int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception + int c = 0; + @SuppressWarnings("unchecked") + List[] subsets = new ArrayList[subsetscount]; + for (int i = 0; i < subsetscount; i++) subsets[i] = new ArrayList(); + for (String hosthash: hosthashes) subsets[c++ % subsetscount].add(hosthash); + for (List subset: subsets) { + try { + StringBuilder query = new StringBuilder(); + for (String hosthash: subset) { + if (query.length() > 0) query.append(" OR "); + //query.append(CollectionSchema.host_id_s.getSolrFieldName()).append(":\"").append(hosthash).append(":\""); + query.append("({!raw f=").append(fieldname).append('}').append(hosthash).append(")"); } + if (constraintQuery == null) connector.deleteByQuery(query.toString()); else connector.deleteByQuery("(" + query.toString() + ") AND " + constraintQuery); + } catch (final IOException e) { } } }