Improved ErrorCache behavior when switching networks

Even after a network switch, ErrorCache was still holding a reference to
the previous Solr cores, thus becoming useless until the next YaCy restart.

The initial filling of the error cache with recent errors from the index was
also missing after the switch.
pull/77/head
luccioman 9 years ago
parent 7d5ba2afa4
commit 3ee4f56c39

@ -68,6 +68,7 @@ import net.yacy.search.IndexingQueueEntry;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.ErrorCache; import net.yacy.search.index.ErrorCache;
import net.yacy.search.index.ErrorCacheFiller;
public class CrawlQueues { public class CrawlQueues {
@ -95,7 +96,7 @@ public class CrawlQueues {
log.config("Opening noticeURL.."); log.config("Opening noticeURL..");
this.noticeURL = new NoticedURL(queuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), sb.exceed134217727); this.noticeURL = new NoticedURL(queuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), sb.exceed134217727);
log.config("Opening errorURL.."); log.config("Opening errorURL..");
this.errorURL = new ErrorCache(sb.index.fulltext()); this.errorURL = new ErrorCache(sb);
log.config("Opening delegatedURL.."); log.config("Opening delegatedURL..");
this.delegatedURL = null; this.delegatedURL = null;
} }
@ -117,6 +118,9 @@ public class CrawlQueues {
// removed pending requests // removed pending requests
this.workerQueue.clear(); this.workerQueue.clear();
this.errorURL.clearCache(); this.errorURL.clearCache();
/* Concurrently refill the error cache with recent errors from the index */
new ErrorCacheFiller(this.sb, this.errorURL).start();
if (this.remoteCrawlProviderHashes != null) this.remoteCrawlProviderHashes.clear(); if (this.remoteCrawlProviderHashes != null) this.remoteCrawlProviderHashes.clear();
this.noticeURL.close(); this.noticeURL.close();
this.noticeURL = new NoticedURL(newQueuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), this.sb.exceed134217727); this.noticeURL = new NoticedURL(newQueuePath, sb.getConfigInt("crawler.onDemandLimit", 1000), this.sb.exceed134217727);

@ -28,12 +28,8 @@ import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
@ -42,6 +38,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
@ -52,51 +49,27 @@ public class ErrorCache {
// the class object // the class object
private final Map<String, CollectionConfiguration.FailDoc> cache; private final Map<String, CollectionConfiguration.FailDoc> cache;
private final Fulltext fulltext; private final Switchboard sb;
public ErrorCache(final Fulltext fulltext) { public ErrorCache(final Switchboard sb) {
this.fulltext = fulltext; this.sb = sb;
this.cache = new LinkedHashMap<String, CollectionConfiguration.FailDoc>(); this.cache = new LinkedHashMap<String, CollectionConfiguration.FailDoc>();
// concurrently fill stack with latest values // concurrently fill stack with latest values
new Thread() { new ErrorCacheFiller(sb, this).start();
@Override
public void run() {
final SolrQuery params = new SolrQuery();
params.setParam("defType", "edismax");
params.setStart(0);
params.setRows(1000);
params.setFacet(false);
params.setSort(new SortClause(CollectionSchema.load_date_dt.getSolrFieldName(), SolrQuery.ORDER.desc)); // load_date_dt = faildate
params.setFields(CollectionSchema.id.getSolrFieldName());
params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
params.set(CommonParams.DF, CollectionSchema.id.getSolrFieldName()); // DisMaxParams.QF or CommonParams.DF must be given
SolrDocumentList docList;
try {
docList = fulltext.getDefaultConnector().getDocumentListByParams(params);
if (docList != null) for (int i = docList.size() - 1; i >= 0; i--) {
SolrDocument doc = docList.get(i);
String hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
cache.put(hash, null);
}
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}
}.start();
} }
public void clearCache() { public void clearCache() {
if (this.cache != null) synchronized (this.cache) {this.cache.clear();} if (this.cache != null) synchronized (this.cache) {this.cache.clear();}
} }
public void clear() throws IOException { public void clear() throws IOException {
clearCache(); clearCache();
this.fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); this.sb.index.fulltext().getDefaultConnector().deleteByQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
} }
public void removeHosts(final Set<String> hosthashes) { public void removeHosts(final Set<String> hosthashes) {
if (hosthashes == null || hosthashes.size() == 0) return; if (hosthashes == null || hosthashes.size() == 0) return;
this.fulltext.deleteDomainErrors(hosthashes); this.sb.index.fulltext().deleteDomainErrors(hosthashes);
synchronized (this.cache) { synchronized (this.cache) {
Iterator<String> i = ErrorCache.this.cache.keySet().iterator(); Iterator<String> i = ErrorCache.this.cache.keySet().iterator();
while (i.hasNext()) { while (i.hasNext()) {
@ -105,6 +78,14 @@ public class ErrorCache {
} }
} }
} }
/**
 * Put a document hash to the internal cache, without any associated fail document.
 * The write is done while holding the monitor of the cache map, like the other
 * methods of this class that modify the map, because this method is called
 * from a concurrent filler thread.
 * @param hash document hash.
 */
public void putHashOnly(String hash) {
    synchronized (this.cache) {
        this.cache.put(hash, null);
    }
}
/** /**
* Adds a error document to the Solr index (marked as failed by httpstatus_i <> 200) * Adds a error document to the Solr index (marked as failed by httpstatus_i <> 200)
@ -129,16 +110,16 @@ public class ErrorCache {
url, profile == null ? null : profile.collections(), url, profile == null ? null : profile.collections(),
failCategory.name() + " " + reason, failCategory.failType, failCategory.name() + " " + reason, failCategory.failType,
httpcode, crawldepth); httpcode, crawldepth);
if (this.fulltext.getDefaultConnector() != null && failCategory.store && !RobotsTxt.isRobotsURL(url)) { if (this.sb.index.fulltext().getDefaultConnector() != null && failCategory.store && !RobotsTxt.isRobotsURL(url)) {
// send the error to solr // send the error to solr
try { try {
// do not overwrite error reports with error reports // do not overwrite error reports with error reports
SolrDocument olddoc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName()); SolrDocument olddoc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(ASCII.String(failDoc.getDigestURL().hash()), CollectionSchema.httpstatus_i.getSolrFieldName());
if (olddoc == null || if (olddoc == null ||
olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null || olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) == null ||
((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) { ((Integer) olddoc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName())) == 200) {
SolrInputDocument errorDoc = failDoc.toSolr(this.fulltext.getDefaultConfiguration()); SolrInputDocument errorDoc = failDoc.toSolr(this.sb.index.fulltext().getDefaultConfiguration());
this.fulltext.getDefaultConnector().add(errorDoc); this.sb.index.fulltext().getDefaultConnector().add(errorDoc);
} }
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage()); ConcurrentLog.warn("SOLR", "failed to send error " + url.toNormalform(true) + " to solr: " + e.getMessage());
@ -174,7 +155,7 @@ public class ErrorCache {
String hash = entry.getKey(); String hash = entry.getKey();
CollectionConfiguration.FailDoc failDoc = entry.getValue(); CollectionConfiguration.FailDoc failDoc = entry.getValue();
if (failDoc == null) { if (failDoc == null) {
SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(hash); SolrDocument doc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(hash);
if (doc != null) failDoc = new CollectionConfiguration.FailDoc(doc); if (doc != null) failDoc = new CollectionConfiguration.FailDoc(doc);
} }
if (failDoc != null) l.add(failDoc); if (failDoc != null) l.add(failDoc);
@ -193,7 +174,7 @@ public class ErrorCache {
} }
if (failDoc != null) return failDoc; if (failDoc != null) return failDoc;
try { try {
final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash); final SolrDocument doc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(urlhash);
if (doc == null) return null; if (doc == null) return null;
Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName()); Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
if (failreason == null || failreason.toString().length() == 0) return null; if (failreason == null || failreason.toString().length() == 0) return null;
@ -207,7 +188,7 @@ public class ErrorCache {
String urlHashString = ASCII.String(urlHash); String urlHashString = ASCII.String(urlHash);
try { try {
// load the fail reason, if exists // load the fail reason, if exists
final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName()); final SolrDocument doc = this.sb.index.fulltext().getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName());
if (doc == null) return false; if (doc == null) return false;
// check if the document contains a value in the field CollectionSchema.failreason_s // check if the document contains a value in the field CollectionSchema.failreason_s

@ -0,0 +1,88 @@
/**
* ErrorCacheFiller
* Copyright 2016 by luccioman
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import java.io.IOException;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
/**
 * A task to concurrently fill the ErrorCache from the index.
 * The task queries the default Solr connector of the switchboard index at the
 * time it runs, so it uses whatever index the switchboard currently exposes.
 * @author luccioman
 *
 */
public class ErrorCacheFiller extends Thread {

    /** Switchboard instance */
    private final Switchboard sb;

    /** The cache to fill */
    private final ErrorCache cache;

    /**
     * Constructor : this prepares the concurrent task
     * @param sb switchboard instance. Must not be null.
     * @param cache error cache to fill. Must not be null.
     * @throws IllegalArgumentException when sb or cache is null
     */
    public ErrorCacheFiller(Switchboard sb, ErrorCache cache) {
        if(sb == null || cache == null) {
            throw new IllegalArgumentException("Unexpected null parameters");
        }
        this.sb = sb;
        this.cache = cache;
    }

    /**
     * Fills the error cache with recently failed document hashes found in the index.
     * At most 1000 documents having a fail reason are requested, most recent first.
     * Nothing is done when no default connector is available; an IOException
     * raised by the query is logged and ends the task.
     */
    @Override
    public void run() {
        final String idField = CollectionSchema.id.getSolrFieldName();

        final SolrQuery params = new SolrQuery();
        params.setParam("defType", "edismax");
        params.setStart(0);
        params.setRows(1000);
        params.setFacet(false);
        params.setSort(new SortClause(CollectionSchema.load_date_dt.getSolrFieldName(), SolrQuery.ORDER.desc)); // load_date_dt = faildate
        params.setFields(idField);
        params.setQuery(CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
        params.set(CommonParams.DF, idField); // DisMaxParams.QF or CommonParams.DF must be given

        try {
            // Without a default connector there is nothing to read from:
            // end quietly instead of letting the thread die on a NullPointerException.
            if (this.sb.index.fulltext().getDefaultConnector() == null) {
                return;
            }
            final SolrDocumentList docList = this.sb.index.fulltext().getDefaultConnector().getDocumentListByParams(params);
            if (docList == null) {
                return;
            }
            // Results are sorted most recent first: walk the list backwards so that
            // the oldest hashes are put first and the most recent ones last.
            for (int i = docList.size() - 1; i >= 0; i--) {
                final SolrDocument doc = docList.get(i);
                final Object hash = doc.getFieldValue(idField);
                // Ignore documents without a usable id rather than caching a null key
                if (hash instanceof String) {
                    this.cache.putHashOnly((String) hash);
                }
            }
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }
    }
}
Loading…
Cancel
Save