From 3ee4f56c39f447b29ebe19c8563d18239148ea2d Mon Sep 17 00:00:00 2001 From: luccioman Date: Thu, 22 Sep 2016 09:07:07 +0200 Subject: [PATCH] Improved ErrorCache behavior when switching networks Even after network switch, ErrorCache was still holding a reference to the previous Solr cores, thus becoming useless until next YaCy restart. Initial error cache filling with recent errors from the index was also missing after the switch. --- source/net/yacy/crawler/data/CrawlQueues.java | 6 +- source/net/yacy/search/index/ErrorCache.java | 65 +++++--------- .../yacy/search/index/ErrorCacheFiller.java | 88 +++++++++++++++++++ 3 files changed, 116 insertions(+), 43 deletions(-) create mode 100644 source/net/yacy/search/index/ErrorCacheFiller.java diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 4e272204a..025b5b112 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -68,6 +68,7 @@ import net.yacy.search.IndexingQueueEntry; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.ErrorCache; +import net.yacy.search.index.ErrorCacheFiller; public class CrawlQueues { @@ -95,7 +96,7 @@ public class CrawlQueues { log.config("Opening noticeURL.."); this.noticeURL = new NoticedURL(queuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), sb.exceed134217727); log.config("Opening errorURL.."); - this.errorURL = new ErrorCache(sb.index.fulltext()); + this.errorURL = new ErrorCache(sb); log.config("Opening delegatedURL.."); this.delegatedURL = null; } @@ -117,6 +118,9 @@ public class CrawlQueues { // removed pending requests this.workerQueue.clear(); this.errorURL.clearCache(); + /* Concurrently refill the error cache with recent errors from the index */ + new ErrorCacheFiller(this.sb, this.errorURL).start(); + if (this.remoteCrawlProviderHashes != null) this.remoteCrawlProviderHashes.clear(); 
this.noticeURL.close(); this.noticeURL = new NoticedURL(newQueuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), this.sb.exceed134217727); diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 0cf287396..fedd72f3e 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -28,12 +28,8 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.SolrQuery.SortClause; import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.params.CommonParams; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; @@ -42,6 +38,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.robots.RobotsTxt; +import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; @@ -52,51 +49,27 @@ public class ErrorCache { // the class object private final Map cache; - private final Fulltext fulltext; + private final Switchboard sb; - public ErrorCache(final Fulltext fulltext) { - this.fulltext = fulltext; + public ErrorCache(final Switchboard sb) { + this.sb = sb; this.cache = new LinkedHashMap(); // concurrently fill stack with latest values - new Thread() { - @Override - public void run() { - final SolrQuery params = new SolrQuery(); - params.setParam("defType", "edismax"); - params.setStart(0); - params.setRows(1000); - params.setFacet(false); - params.setSort(new SortClause(CollectionSchema.load_date_dt.getSolrFieldName(), SolrQuery.ORDER.desc)); // load_date_dt = faildate - params.setFields(CollectionSchema.id.getSolrFieldName()); - 
params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); - params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given - SolrDocumentList docList; - try { - docList = fulltext.getDefaultConnector().getDocumentListByParams(params); - if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) { - SolrDocument doc = docList.get(i); - String hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - cache.put(hash, null); - } - } catch (IOException e) { - ConcurrentLog.logException(e); - } - } - }.start(); + new ErrorCacheFiller(sb, this).start(); } - + public void clearCache() { if (this.cache != null) synchronized (this.cache) {this.cache.clear();} } public void clear() throws IOException { clearCache(); - this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); + this.sb.index.fulltext().getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); } public void removeHosts(final Set hosthashes) { if (hosthashes == null || hosthashes.size() == 0) return; - this.fulltext.deleteDomainErrors(hosthashes); + this.sb.index.fulltext().deleteDomainErrors(hosthashes); synchronized (this.cache) { Iterator i = ErrorCache.this.cache.keySet().iterator(); while (i.hasNext()) { @@ -105,6 +78,14 @@ public class ErrorCache { } } } + + /** + * Put a document hash to the internal cache. + * @param hash document hash. + */ + public void putHashOnly(String hash) { + this.cache.put(hash, null); + } /** * Adds a error document to the Solr index (marked as failed by httpstatus_i <> 200) @@ -129,16 +110,16 @@ public class ErrorCache { url, profile == null ? 
null : profile.collections(), failCategory.name() + " " + reason, failCategory.failType, httpcode, crawldepth); - if (this.fulltext.getDefaultConnector() != null && failCategory.store && !RobotsTxt.isRobotsURL(url)) { + if (this.sb.index.fulltext().getDefaultConnector() != null && failCategory.store && !RobotsTxt.isRobotsURL(url)) { // send the error to solr try { // do not overwrite error reports with error reports - SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName()); + SolrDocument olddoc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName()); if (olddoc == null || olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null || ((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) { - SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration()); - this.fulltext.getDefaultConnector().add(errorDoc); + SolrInputDocument errorDoc = failDoc.toSolr(this.sb.index.fulltext().getDefaultConfiguration()); + this.sb.index.fulltext().getDefaultConnector().add(errorDoc); } } catch (final IOException e) { ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage()); @@ -174,7 +155,7 @@ public class ErrorCache { String hash = entry.getKey(); CollectionConfiguration.FailDoc failDoc = entry.getValue(); if (failDoc == null) { - SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(hash); + SolrDocument doc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(hash); if (doc != null) failDoc = new CollectionConfiguration.FailDoc(doc); } if (failDoc != null) l.add(failDoc); @@ -193,7 +174,7 @@ public class ErrorCache { } if (failDoc != null) return failDoc; try { - final SolrDocument doc = 
this.fulltext.getDefaultConnector().getDocumentById(urlhash); + final SolrDocument doc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(urlhash); if (doc == null) return null; Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName()); if (failreason == null || failreason.toString().length() == 0) return null; @@ -207,7 +188,7 @@ public class ErrorCache { String urlHashString = ASCII.String(urlHash); try { // load the fail reason, if exists - final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName()); + final SolrDocument doc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName()); if (doc == null) return false; // check if the document contains a value in the field CollectionSchema.failreason_s diff --git a/source/net/yacy/search/index/ErrorCacheFiller.java b/source/net/yacy/search/index/ErrorCacheFiller.java new file mode 100644 index 000000000..a4ba0a262 --- /dev/null +++ b/source/net/yacy/search/index/ErrorCacheFiller.java @@ -0,0 +1,88 @@ +/** + * ErrorCache + * Copyright 2016 by luccioman + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.search.index; + +import java.io.IOException; + +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrQuery.SortClause; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.CommonParams; + +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.search.Switchboard; +import net.yacy.search.schema.CollectionSchema; + +/** + * A task to concurrently fill the ErrorCache from the index + * @author luccioman + * + */ +public class ErrorCacheFiller extends Thread { + + /** Switchboard instance */ + private Switchboard sb; + + /** The cache to fill */ + private ErrorCache cache; + + /** + * Constructor : this prepares the concurrent task + * @param sb switchboard instance. Must not be null. + * @param cache error cache to fill. Must not be null. + */ + public ErrorCacheFiller(Switchboard sb, ErrorCache cache) { + if(sb == null || cache == null) { + throw new IllegalArgumentException("Unexpected null parameters"); + } + this.sb = sb; + this.cache = cache; + } + + /** + * Fills the error cache with recently failed document hashes found in the index + */ + @Override + public void run() { + final SolrQuery params = new SolrQuery(); + params.setParam("defType", "edismax"); + params.setStart(0); + params.setRows(1000); + params.setFacet(false); + params.setSort(new SortClause(CollectionSchema.load_date_dt.getSolrFieldName(), SolrQuery.ORDER.desc)); // load_date_dt = faildate + params.setFields(CollectionSchema.id.getSolrFieldName()); + params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); + params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given + SolrDocumentList docList; + try { + docList = 
this.sb.index.fulltext().getDefaultConnector().getDocumentListByParams(params); + if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) { + SolrDocument doc = docList.get(i); + String hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); + cache.putHashOnly(hash); + } + } catch (IOException e) { + ConcurrentLog.logException(e); + } + } + +}