From 303f5694ba9c234d2f57a17998e942cf63cd19d9 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 12 Dec 2013 03:36:30 +0100 Subject: [PATCH] avoid usage of existsByQuery. If a document can be loaded by the ID before testing other fields from the existsByQuery request, then a document cache fills and queries after that one can be avoided. --- .../federate/solr/SchemaConfiguration.java | 17 +++++------ .../solr/connector/AbstractSolrConnector.java | 10 ------- .../solr/connector/CachedSolrConnector.java | 28 ------------------- .../ConcurrentUpdateSolrConnector.java | 6 ---- .../solr/connector/MirrorSolrConnector.java | 8 ------ .../solr/connector/SolrConnector.java | 8 ------ source/net/yacy/crawler/CrawlStacker.java | 2 +- source/net/yacy/search/index/ErrorCache.java | 6 +++- 8 files changed, 13 insertions(+), 72 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 3dd5a4d73..aa6a9ddf6 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -31,7 +31,6 @@ import java.util.Map; import java.util.Set; import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputField; @@ -158,17 +157,15 @@ public class SchemaConfiguration extends Configuration implements Serializable { continue uniquecheck; } try { - if (segment.fulltext().getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) { + final SolrDocument doc = segment.fulltext().getDefaultConnector().getDocumentById(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\""); + if (doc != null) { // switch unique attribute in 
new document sid.setField(uniquefield.getSolrFieldName(), false); - // switch attribute also in all existing documents (which should be exactly only one!) - SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000); - for (SolrDocument doc: docs) { - SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc); - sidContext.setField(uniquefield.getSolrFieldName(), false); - segment.putDocumentInQueue(sidContext); - changed = true; - } + // switch attribute in existing document + SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc); + sidContext.setField(uniquefield.getSolrFieldName(), false); + segment.putDocumentInQueue(sidContext); + changed = true; } else { sid.setField(uniquefield.getSolrFieldName(), true); } diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index ecfcfeae3..08fb02460 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -71,16 +71,6 @@ public abstract class AbstractSolrConnector implements SolrConnector { } protected final static int pagesize = 100; - @Override - public boolean existsByQuery(final String query) throws IOException { - try { - long count = getCountByQuery(query); - return count > 0; - } catch (final Throwable e) { - return false; - } - } - /** * Get a query result from solr as a stream of documents. * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned. 
diff --git a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java index eaf93603c..04c3976de 100644 --- a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java @@ -122,34 +122,6 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo this.clearCaches(); this.solr.deleteByQuery(querystring); } - - @Override - public boolean existsByQuery(final String query) throws IOException { - if (this.hitCache.containsKey(query)) { - this.hitCache_Hit++; - return true; - } - this.hitCache_Miss++; - if (this.documentCache.containsKey(query)) { - this.documentCache_Hit++; - return true; - } - this.documentCache_Miss++; - if (this.missCache.containsKey(query)) { - this.missCache_Hit++; - return false; - } - this.missCache_Miss++; - if (solr != null && solr.existsByQuery(query)) { - this.missCache.remove(query); - this.hitCache.put(query, EXIST); - this.hitCache_Insert++; - return true; - } - this.missCache.put(query, EXIST); - this.missCache_Insert++; - return false; - } @Override public SolrDocument getDocumentById(final String id, final String ... fields) throws IOException { diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java index 979af6f96..28a565c1d 100644 --- a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java @@ -376,12 +376,6 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { e.addAll(e1); return e; } - - @Override - public boolean existsByQuery(String solrquery) throws IOException { - // this is actually wrong but to make it right we need to wait until all queues are flushed. 
But that may take very long when the queues are filled again all the time. - return this.connector.existsByQuery(solrquery); - } @Override public void add(SolrInputDocument solrdoc) throws IOException, SolrException { diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index 31f019b13..3f4e6e1e3 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -158,14 +158,6 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo if (this.solr0 != null) this.solr0.deleteByQuery(querystring); if (this.solr1 != null) this.solr1.deleteByQuery(querystring); } - - @Override - public boolean existsByQuery(final String query) throws IOException { - if ((solr0 != null && solr0.existsByQuery(query)) || (solr1 != null && solr1.existsByQuery(query))) { - return true; - } - return false; - } @Override public SolrDocument getDocumentById(final String key, final String ... 
fields) throws IOException { diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 131f9c38a..7cbeda40a 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -112,14 +112,6 @@ public interface SolrConnector extends Iterable /* Iterable of document * @throws IOException */ public Set existsByIds(Set ids) throws IOException; - - /** - * check if a given document exists in solr - * @param solrquery - * @return true if any entry in solr exists - * @throws IOException - */ - public boolean existsByQuery(final String solrquery) throws IOException; /** * add a solr input document diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 9d473d22c..702db22d9 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -393,7 +393,7 @@ public final class CrawlStacker { final String urlstring = url.toString(); // check if the url is double registered final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists - final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); + final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); // TODO: combine the exists-query with this one if (oldDate == null) { if (dbocc != null) { // do double-check diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 3edd65c4e..5e608a771 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -160,7 +160,11 @@ public class ErrorCache { public boolean exists(final byte[] urlHash) { try { - return this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + 
ASCII.String(urlHash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"); + final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName()); + if (doc == null) return false; + // an error entry exists only if the document carries a non-empty failreason_s value (same meaning as the former failreason_s:[* TO *] query) + Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName()); + return failreason != null && failreason.toString().length() > 0; } catch (IOException e) { return false; }