Changed caching of ConcurrentUpdateSolrConnector: it now also caches the

url along with the load date. While this takes considerably more memory, it
eliminates database lookups for getURL() requests, which happen about as
often as load-date lookups. This speeds up remote Solr configurations.
pull/1/head
orbiter 11 years ago
parent f6e441dd77
commit a87d8e4a8e

@ -82,24 +82,32 @@ public abstract class AbstractSolrConnector implements SolrConnector {
}
protected final static int pagesize = 100;
protected static long getLoadDate(final Object doc) {
protected static Metadata getMetadata(final Object doc) {
if (doc == null) return null;
Object d = null;
if (doc != null) {
if (doc instanceof SolrInputDocument) d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
if (doc instanceof SolrDocument) d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
if (doc instanceof org.apache.lucene.document.Document) {
String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName());
try {
d = Long.parseLong(ds);
} catch (NumberFormatException e) {
d = -1l;
}
String url = null;
if (doc instanceof SolrInputDocument) {
d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
url = (String) ((SolrInputDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName());
}
if (doc instanceof SolrDocument) {
d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
url = (String) ((SolrDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName());
}
if (doc instanceof org.apache.lucene.document.Document) {
String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName());
try {
d = Long.parseLong(ds);
} catch (NumberFormatException e) {
d = -1l;
}
url = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.sku.getSolrFieldName());
}
if (d == null) return -1l;
if (d instanceof Long) return ((Long) d).longValue();
if (d instanceof Date) return ((Date) d).getTime();
return -1l;
if (d == null) return null;
long date = -1;
if (d instanceof Long) date = ((Long) d).longValue();
if (d instanceof Date) date = ((Date) d).getTime();
return new Metadata(url, date);
}
/**
@ -239,11 +247,11 @@ public abstract class AbstractSolrConnector implements SolrConnector {
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return the load date if any entry in solr exists, -1 otherwise
* @return metadata if any entry in solr exists, null otherwise
* @throws IOException
*/
@Override
public long getLoadTime(String id) throws IOException {
public Metadata getMetadata(String id) throws IOException {
// construct raw query
final SolrQuery params = new SolrQuery();
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
@ -253,15 +261,15 @@ public abstract class AbstractSolrConnector implements SolrConnector {
params.setStart(0);
params.setFacet(false);
params.clearSorts();
params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
params.setIncludeScore(false);
// query the server
final SolrDocumentList sdl = getDocumentListByParams(params);
if (sdl == null || sdl.getNumFound() <= 0) return -1;
if (sdl == null || sdl.getNumFound() <= 0) return null;
SolrDocument doc = sdl.iterator().next();
long d = getLoadDate(doc);
return d;
Metadata md = getMetadata(doc);
return md;
}
/**

@ -89,8 +89,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(getmore + 1);
docs.add(doc);
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
long date = AbstractSolrConnector.getLoadDate(doc);
updateIdCache(id, date);
Metadata md = AbstractSolrConnector.getMetadata(doc);
updateIdCache(id, md);
for (int i = 0; i < getmore; i++) {
SolrInputDocument d = ConcurrentUpdateSolrConnector.this.updateQueue.take();
if (d == POISON_DOCUMENT) {
@ -99,8 +99,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
docs.add(d);
id = (String) d.getFieldValue(CollectionSchema.id.getSolrFieldName());
date = AbstractSolrConnector.getLoadDate(d);
updateIdCache(id, date);
md = AbstractSolrConnector.getMetadata(d);
updateIdCache(id, md);
}
//ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending " + docs.size() + " documents to solr");
try {
@ -112,8 +112,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
// if there is only a single document, send this directly to solr
//ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending one document to solr");
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
long date = AbstractSolrConnector.getLoadDate(doc);
updateIdCache(id, date);
Metadata md = AbstractSolrConnector.getMetadata(doc);
updateIdCache(id, md);
try {
ConcurrentUpdateSolrConnector.this.connector.add(doc);
} catch (final OutOfMemoryError e) {
@ -134,15 +134,15 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
}
}
private ARC<String, Long> idCache;
private ARC<String, Metadata> idCache;
private BlockingQueue<SolrInputDocument> updateQueue;
private BlockingQueue<String> deleteQueue;
private Thread deletionHandler, updateHandler;
public ConcurrentUpdateSolrConnector(SolrConnector connector, int updateCapacity, int idCacheCapacity, int concurrency) {
this.connector = connector;
this.idCache = new ConcurrentARC<String, Long>(idCacheCapacity, concurrency); // url hash to load time
this.idCache = new ConcurrentARC<String, Metadata>(idCacheCapacity, concurrency); // url hash to load time
this.updateQueue = new ArrayBlockingQueue<SolrInputDocument>(updateCapacity);
this.deleteQueue = new LinkedBlockingQueue<String>();
this.deletionHandler = null;
@ -192,16 +192,18 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
return null;
}
private long existIdFromUpdateQueue(String id) {
if (this.updateQueue.size() == 0) return -1;
private Metadata existIdFromUpdateQueue(String id) {
if (this.updateQueue.size() == 0) return null;
Iterator<SolrInputDocument> i = this.updateQueue.iterator();
while (i.hasNext()) {
SolrInputDocument doc = i.next();
if (doc == null) break;
String docID = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
if (docID != null && docID.equals(id)) return AbstractSolrConnector.getLoadDate(doc);
if (docID != null && docID.equals(id)) {
return AbstractSolrConnector.getMetadata(doc);
}
}
return -1;
return null;
}
private void removeIdFromUpdateQueue(String id) {
@ -231,10 +233,10 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
}
private void updateIdCache(String id, long time) {
private void updateIdCache(final String id, final Metadata md) {
if (id == null) return;
if (MemoryControl.shortStatus()) this.idCache.clear();
this.idCache.put(id, time);
this.idCache.put(id, md);
}
public void ensureAliveDeletionHandler() {
@ -361,25 +363,23 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
@Override
public long getLoadTime(String id) throws IOException {
Long date = this.idCache.get(id);
if (date != null) {cacheSuccessSign(); return date.longValue();}
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return -1;}
long d = existIdFromUpdateQueue(id);
if (d >= 0) {cacheSuccessSign(); return d;}
d = this.connector.getLoadTime(id);
if (d >= 0) {
updateIdCache(id, d);
return d;
}
return -1;
public Metadata getMetadata(String id) throws IOException {
    // Lookup order: id cache, pending deletions, pending updates, wrapped connector.
    // Returns null if the id is queued for deletion or unknown to the wrapped connector.

    // 1. the id cache answers directly
    final Metadata cached = this.idCache.get(id);
    if (cached != null) {
        cacheSuccessSign();
        return cached;
    }

    // 2. an id waiting in the delete queue is reported as not existing
    if (existIdFromDeleteQueue(id)) {
        cacheSuccessSign();
        return null;
    }

    // 3. a document waiting in the update queue carries its own metadata
    final Metadata queued = existIdFromUpdateQueue(id);
    if (queued != null) {
        cacheSuccessSign();
        return queued;
    }

    // 4. ask the wrapped connector and remember a positive answer in the id cache
    final Metadata stored = this.connector.getMetadata(id);
    if (stored != null) {
        updateIdCache(id, stored);
    }
    return stored;
}
@Override
public void add(SolrInputDocument solrdoc) throws IOException, SolrException {
String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName());
removeIdFromDeleteQueue(id);
updateIdCache(id, AbstractSolrConnector.getLoadDate(solrdoc));
updateIdCache(id, AbstractSolrConnector.getMetadata(solrdoc));
if (this.updateHandler.isAlive()) {
try {this.updateQueue.put(solrdoc);} catch (final InterruptedException e) {}
} else {
@ -392,7 +392,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
for (SolrInputDocument doc: solrdocs) {
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
removeIdFromDeleteQueue(id);
updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
updateIdCache(id, AbstractSolrConnector.getMetadata(doc));
}
if (this.updateHandler.isAlive()) {
for (SolrInputDocument doc: solrdocs) try {this.updateQueue.put(doc);} catch (final InterruptedException e) {}
@ -407,7 +407,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
SolrInputDocument idoc = getFromUpdateQueue(id);
if (idoc != null) {cacheSuccessSign(); return ClientUtils.toSolrDocument(idoc);}
SolrDocument doc = this.connector.getDocumentById(id, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
if (doc != null) updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
if (doc != null) updateIdCache(id, AbstractSolrConnector.getMetadata(doc));
return doc;
}

@ -396,24 +396,24 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
* @throws IOException
*/
@Override
public synchronized long getLoadTime(String id) {
public synchronized Metadata getMetadata(String id) {
int responseCount = 0;
DocListSearcher docListSearcher = null;
try {
docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
responseCount = docListSearcher.response.size();
if (responseCount == 0) return -1;
if (responseCount == 0) return null;
SolrIndexSearcher searcher = docListSearcher.request.getSearcher();
DocIterator iterator = docListSearcher.response.iterator();
//for (int i = 0; i < responseCount; i++) {
Document doc = searcher.doc(iterator.nextDoc(), AbstractSolrConnector.SOLR_ID_and_LOAD_DATE_FIELDS);
if (doc == null) return -1;
return AbstractSolrConnector.getLoadDate(doc);
if (doc == null) return null;
return AbstractSolrConnector.getMetadata(doc);
//}
} catch (Throwable e) {} finally {
if (docListSearcher != null) docListSearcher.close();
}
return -1;
return null;
}
@Override

@ -394,19 +394,19 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public long getLoadTime(String id) throws IOException {
if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTime(id);
if (this.solr0 == null && this.solr1 != null) return this.solr1.getLoadTime(id);
if (this.solr0 == null && this.solr1 == null) return -1;
return Math.max(this.solr0.getLoadTime(id), this.solr1.getLoadTime(id));
public Metadata getMetadata(String id) throws IOException {
    // Merges the metadata of the two mirrored connectors.
    // - only one connector configured: its answer is returned unchanged
    // - no connector configured: null
    // - only one connector knows the id: that metadata is returned unchanged
    // - both know the id: the later load date wins and the first available url is used
    if (this.solr0 != null && this.solr1 == null) return this.solr0.getMetadata(id);
    if (this.solr0 == null && this.solr1 != null) return this.solr1.getMetadata(id);
    if (this.solr0 == null && this.solr1 == null) return null;
    Metadata md0 = this.solr0.getMetadata(id);
    Metadata md1 = this.solr1.getMetadata(id);
    if (md0 == null) return md1;
    if (md1 == null) return md0;
    long date = Math.max(md0.date, md1.date);
    // A connector may deliver metadata without a url, so the consistency check
    // only applies when both sides actually have one (avoids a NullPointerException
    // when assertions are enabled).
    assert md0.url == null || md1.url == null || md0.url.equals(md1.url);
    // do not lose the url of the second connector if the first one has none
    String url = md0.url != null ? md0.url : md1.url;
    return new Metadata(url, date);
}
/*
@Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(String querystring, int offset, int maxcount, long maxtime, int buffersize, String... fields) {
return null;
}
*/
@Override
public BlockingQueue<String> concurrentIDsByQuery(String querystring, int offset, int maxcount, long maxtime) {
if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, offset, maxcount, maxtime);

@ -36,6 +36,15 @@ import org.apache.solr.common.params.ModifiableSolrParams;
public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ {
/**
 * Value object which pairs the url of a document with its load date.
 * Both fields are final: an instance is an immutable snapshot, so the same
 * object can be handed out repeatedly (e.g. from a cache) without the risk
 * that one holder changes what another holder sees.
 */
public static class Metadata {
    /** the load date of the document as a long value */
    public final long date;
    /** the url of the document; NOTE(review): may be null if the source document had no url field - confirm against the connector implementations */
    public final String url;

    /**
     * @param url the url of the document
     * @param date the load date of the document
     */
    public Metadata(final String url, final long date) {
        this.url = url;
        this.date = date;
    }
}
/**
* clear all caches: inside solr and outside solr within the implementations of this interface
*/
@ -110,11 +119,11 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return the load time if any entry in solr exists, -1 otherwise
* @return the metadata (url and load date) if any entry in solr exists, null otherwise
* @throws IOException
*/
public long getLoadTime(final String id) throws IOException;
public Metadata getMetadata(final String id) throws IOException;
/**
* add a solr input document
* @param solrdoc

@ -176,8 +176,13 @@ public class ErrorCache {
}
public boolean exists(final byte[] urlHash) {
String urlHashString = ASCII.String(urlHash);
try {
final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName());
// first try to check if the document exists at all.
long loaddate = this.fulltext.getLoadTime(urlHashString);
if (loaddate < 0) return false;
// then load the fail reason, if exists
final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName());
if (doc == null) return false;
// check if the document contains a value in the field CollectionSchema.failreason_s
Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());

@ -269,21 +269,6 @@ public final class Fulltext {
if (this.writeWebgraph) getWebgraphConnector().commit(softCommit);
}
/**
 * Resolve a url hash to its url by loading the sku field of the document
 * with that id from the default connector.
 * @param urlHashS the url hash, used as document id
 * @return the url, or null if the hash or the connector is null, the document
 *         or its sku field is missing, the field is not a String, or the lookup
 *         failed with an IOException
 */
public DigestURL getURL(final String urlHashS) {
    if (urlHashS == null || this.getDefaultConnector() == null) {
        return null;
    }
    try {
        final SolrDocument hit = this.getDefaultConnector().getDocumentById(urlHashS, CollectionSchema.sku.getSolrFieldName());
        if (hit == null) {
            return null;
        }
        final Object u = hit.getFieldValue(CollectionSchema.sku.getSolrFieldName());
        if (u == null) {
            return null;
        }
        assert u instanceof String : "u = " + u.toString();
        // anything but a String cannot be turned into a url
        return (u instanceof String) ? new DigestURL((String) u, ASCII.getBytes(urlHashS)) : null;
    } catch (final IOException e) {
        return null;
    }
}
public URIMetadataNode getMetadata(final WeakPriorityBlockingQueue.Element<WordReferenceVars> element) {
if (element == null) return null;
WordReferenceVars wre = element.getElement();
@ -483,6 +468,18 @@ public final class Fulltext {
return false;
}
/**
 * Resolve a url hash to its url using the metadata which the default
 * connector returns for that hash.
 * @param urlHash the url hash, used as document id
 * @return the url, or null if the hash or the connector is null, no metadata
 *         or no url is available for the hash, or the lookup failed with an
 *         IOException
 */
public DigestURL getURL(final String urlHash) {
    if (urlHash == null || this.getDefaultConnector() == null) return null;
    try {
        SolrConnector.Metadata md = this.getDefaultConnector().getMetadata(urlHash);
        // metadata may exist without a url; there is nothing to build a DigestURL from then
        if (md == null || md.url == null) return null;
        return new DigestURL(md.url, ASCII.getBytes(urlHash));
    } catch (final IOException e) {
        return null;
    }
}
/**
* get the load time of a resource.
* @param urlHash
@ -491,7 +488,9 @@ public final class Fulltext {
public long getLoadTime(final String urlHash) {
if (urlHash == null) return -1l;
try {
return this.getDefaultConnector().getLoadTime(urlHash);
SolrConnector.Metadata md = this.getDefaultConnector().getMetadata(urlHash);
if (md == null) return -1;
return md.date;
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}

Loading…
Cancel
Save