Avoid usage of existsByQuery. If a document is loaded by its ID before
the other fields from the existsByQuery request are tested, then the
document cache is filled and subsequent queries for that document can
be avoided.
pull/1/head
Michael Peter Christen 11 years ago
parent 67e7dc0cc6
commit 303f5694ba

@ -31,7 +31,6 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField; import org.apache.solr.common.SolrInputField;
@ -158,17 +157,15 @@ public class SchemaConfiguration extends Configuration implements Serializable {
continue uniquecheck; continue uniquecheck;
} }
try { try {
if (segment.fulltext().getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) { final SolrDocument doc = segment.fulltext().getDefaultConnector().getDocumentById(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"");
if (doc != null) {
// switch unique attribute in new document // switch unique attribute in new document
sid.setField(uniquefield.getSolrFieldName(), false); sid.setField(uniquefield.getSolrFieldName(), false);
// switch attribute also in all existing documents (which should be exactly only one!) // switch attribute in existing document
SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000); SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
for (SolrDocument doc: docs) { sidContext.setField(uniquefield.getSolrFieldName(), false);
SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc); segment.putDocumentInQueue(sidContext);
sidContext.setField(uniquefield.getSolrFieldName(), false); changed = true;
segment.putDocumentInQueue(sidContext);
changed = true;
}
} else { } else {
sid.setField(uniquefield.getSolrFieldName(), true); sid.setField(uniquefield.getSolrFieldName(), true);
} }

@ -71,16 +71,6 @@ public abstract class AbstractSolrConnector implements SolrConnector {
} }
protected final static int pagesize = 100; protected final static int pagesize = 100;
/**
 * Tests whether the given query matches at least one document.
 * The result is derived from getCountByQuery; any Throwable raised while
 * counting is caught and reported as "no match".
 *
 * @param query the query string passed on to getCountByQuery
 * @return true if the count for the query is greater than zero; false otherwise or on any failure
 * @throws IOException declared by the signature; failures of the count itself are caught here
 */
@Override
public boolean existsByQuery(final String query) throws IOException {
    boolean found;
    try {
        found = getCountByQuery(query) > 0;
    } catch (final Throwable ignored) {
        // best effort: a failing count is reported as "does not exist"
        found = false;
    }
    return found;
}
/** /**
* Get a query result from solr as a stream of documents. * Get a query result from solr as a stream of documents.
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned. * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.

@ -123,34 +123,6 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo
this.solr.deleteByQuery(querystring); this.solr.deleteByQuery(querystring);
} }
/**
 * Tests whether the given query has a match, consulting the local caches
 * before the backing connector is asked.
 * Lookup order: hitCache, documentCache, missCache, then this.solr.
 * Each cache lookup increments its own hit or miss counter; the backend
 * result is written to hitCache (match) or missCache (no match).
 *
 * NOTE(review): documentCache is looked up with the query string as key -
 * confirm that its keys are query strings and not plain document ids.
 *
 * @param query the query string, used both as cache key and as backend query
 * @return true if a cache or the backend reports a match, false otherwise
 * @throws IOException if the backing connector throws it
 */
@Override
public boolean existsByQuery(final String query) throws IOException {
    // 1st level: queries already known to match
    if (!this.hitCache.containsKey(query)) {
        this.hitCache_Miss++;
    } else {
        this.hitCache_Hit++;
        return true;
    }

    // 2nd level: entries present in the document cache
    if (!this.documentCache.containsKey(query)) {
        this.documentCache_Miss++;
    } else {
        this.documentCache_Hit++;
        return true;
    }

    // 3rd level: queries already known NOT to match
    if (!this.missCache.containsKey(query)) {
        this.missCache_Miss++;
    } else {
        this.missCache_Hit++;
        return false;
    }

    // no cache could answer: ask the backend (a missing backend counts as "no match")
    final boolean foundInBackend = this.solr != null && this.solr.existsByQuery(query);
    if (!foundInBackend) {
        this.missCache.put(query, EXIST);
        this.missCache_Insert++;
        return false;
    }
    this.missCache.remove(query);
    this.hitCache.put(query, EXIST);
    this.hitCache_Insert++;
    return true;
}
@Override @Override
public SolrDocument getDocumentById(final String id, final String ... fields) throws IOException { public SolrDocument getDocumentById(final String id, final String ... fields) throws IOException {
String q = idQuery(id); String q = idQuery(id);

@ -377,12 +377,6 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
return e; return e;
} }
/**
 * Tests whether the given query has a match by delegating to the wrapped connector.
 *
 * @param solrquery the query string handed to the wrapped connector
 * @return the result of the wrapped connector's existsByQuery
 * @throws IOException if the wrapped connector throws it
 */
@Override
public boolean existsByQuery(String solrquery) throws IOException {
    // Known inaccuracy: a correct answer would require waiting until all queues are flushed,
    // but that may take very long when the queues are filled again all the time.
    final boolean existsInBackend = this.connector.existsByQuery(solrquery);
    return existsInBackend;
}
@Override @Override
public void add(SolrInputDocument solrdoc) throws IOException, SolrException { public void add(SolrInputDocument solrdoc) throws IOException, SolrException {
String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName()); String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName());

@ -159,14 +159,6 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
if (this.solr1 != null) this.solr1.deleteByQuery(querystring); if (this.solr1 != null) this.solr1.deleteByQuery(querystring);
} }
/**
 * Tests whether the given query has a match in either of the two connectors.
 * solr0 is asked first; solr1 is only asked if solr0 is null or reports no match.
 *
 * @param query the query string handed to the connectors
 * @return true if solr0 or solr1 (whichever is non-null) reports a match, false otherwise
 * @throws IOException if one of the connectors throws it
 */
@Override
public boolean existsByQuery(final String query) throws IOException {
    if (solr0 != null && solr0.existsByQuery(query)) {
        return true;
    }
    return solr1 != null && solr1.existsByQuery(query);
}
@Override @Override
public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException { public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException {
SolrDocument doc; SolrDocument doc;

@ -113,14 +113,6 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
*/ */
public Set<String> existsByIds(Set<String> ids) throws IOException; public Set<String> existsByIds(Set<String> ids) throws IOException;
/**
 * Check if any document exists in solr for the given query.
 * @param solrquery the solr query string to test
 * @return true if any entry in solr exists
 * @throws IOException declared for implementations; which failures are
 *         actually propagated depends on the concrete connector
 */
public boolean existsByQuery(final String solrquery) throws IOException;
/** /**
* add a solr input document * add a solr input document
* @param solrdoc * @param solrdoc

@ -393,7 +393,7 @@ public final class CrawlStacker {
final String urlstring = url.toString(); final String urlstring = url.toString();
// check if the url is double registered // check if the url is double registered
final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); // TODO: combine the exists-query with this one
if (oldDate == null) { if (oldDate == null) {
if (dbocc != null) { if (dbocc != null) {
// do double-check // do double-check

@ -160,7 +160,11 @@ public class ErrorCache {
public boolean exists(final byte[] urlHash) { public boolean exists(final byte[] urlHash) {
try { try {
return this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + ASCII.String(urlHash) + "\" AND " + CollectionSchema.failreason_s.getSolrFieldName() + ":[* TO *]"); final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName());
if (doc == null) return false;
// check if the document contains a value in the field CollectionSchema.failreason_s
Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
return failreason == null || failreason.toString().length() == 0;
} catch (IOException e) { } catch (IOException e) {
return false; return false;
} }

Loading…
Cancel
Save