From a87d8e4a8eb777eea384eed80020fd64d21e6ac4 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 24 Feb 2014 22:59:58 +0100 Subject: [PATCH] changed caching of ConcurrentUpdateSolrConnector: it now also caches the URL along with the load date. While this takes much more memory, it eliminates database lookups for getURL() requests, which happen equally often. This speeds up remote Solr configurations. --- .../solr/connector/AbstractSolrConnector.java | 50 +++++++++------- .../ConcurrentUpdateSolrConnector.java | 60 +++++++++---------- .../solr/connector/EmbeddedSolrConnector.java | 10 ++-- .../solr/connector/MirrorSolrConnector.java | 22 +++---- .../solr/connector/SolrConnector.java | 15 ++++- source/net/yacy/search/index/ErrorCache.java | 7 ++- source/net/yacy/search/index/Fulltext.java | 31 +++++----- 7 files changed, 108 insertions(+), 87 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index b1ea16360..0aaad4f2e 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -82,24 +82,32 @@ public abstract class AbstractSolrConnector implements SolrConnector { } protected final static int pagesize = 100; - protected static long getLoadDate(final Object doc) { + protected static Metadata getMetadata(final Object doc) { + if (doc == null) return null; Object d = null; - if (doc != null) { - if (doc instanceof SolrInputDocument) d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); - if (doc instanceof SolrDocument) d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); - if (doc instanceof org.apache.lucene.document.Document) { - String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName()); - try { - d = 
Long.parseLong(ds); - } catch (NumberFormatException e) { - d = -1l; - } + String url = null; + if (doc instanceof SolrInputDocument) { + d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); + url = (String) ((SolrInputDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName()); + } + if (doc instanceof SolrDocument) { + d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); + url = (String) ((SolrDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName()); + } + if (doc instanceof org.apache.lucene.document.Document) { + String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName()); + try { + d = Long.parseLong(ds); + } catch (NumberFormatException e) { + d = -1l; } + url = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.sku.getSolrFieldName()); } - if (d == null) return -1l; - if (d instanceof Long) return ((Long) d).longValue(); - if (d instanceof Date) return ((Date) d).getTime(); - return -1l; + if (d == null) return null; + long date = -1; + if (d instanceof Long) date = ((Long) d).longValue(); + if (d instanceof Date) date = ((Date) d).getTime(); + return new Metadata(url, date); } /** @@ -239,11 +247,11 @@ public abstract class AbstractSolrConnector implements SolrConnector { /** * check if a given document, identified by url hash as document id exists * @param id the url hash and document id - * @return the load date if any entry in solr exists, -1 otherwise + * @return metadata if any entry in solr exists, null otherwise * @throws IOException */ @Override - public long getLoadTime(String id) throws IOException { + public Metadata getMetadata(String id) throws IOException { // construct raw query final SolrQuery params = new SolrQuery(); //params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\""); @@ -253,15 +261,15 @@ public abstract class AbstractSolrConnector implements 
SolrConnector { params.setStart(0); params.setFacet(false); params.clearSorts(); - params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); + params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); params.setIncludeScore(false); // query the server final SolrDocumentList sdl = getDocumentListByParams(params); - if (sdl == null || sdl.getNumFound() <= 0) return -1; + if (sdl == null || sdl.getNumFound() <= 0) return null; SolrDocument doc = sdl.iterator().next(); - long d = getLoadDate(doc); - return d; + Metadata md = getMetadata(doc); + return md; } /** diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java index a696e1fa0..113f8cb06 100644 --- a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java @@ -89,8 +89,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { Collection docs = new ArrayList(getmore + 1); docs.add(doc); String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - long date = AbstractSolrConnector.getLoadDate(doc); - updateIdCache(id, date); + Metadata md = AbstractSolrConnector.getMetadata(doc); + updateIdCache(id, md); for (int i = 0; i < getmore; i++) { SolrInputDocument d = ConcurrentUpdateSolrConnector.this.updateQueue.take(); if (d == POISON_DOCUMENT) { @@ -99,8 +99,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } docs.add(d); id = (String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()); - date = AbstractSolrConnector.getLoadDate(d); - updateIdCache(id, date); + md = AbstractSolrConnector.getMetadata(d); + updateIdCache(id, md); } //ConcurrentLog.info("ConcurrentUpdateSolrConnector", 
"sending " + docs.size() + " documents to solr"); try { @@ -112,8 +112,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { // if there is only a single document, send this directly to solr //ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending one document to solr"); String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - long date = AbstractSolrConnector.getLoadDate(doc); - updateIdCache(id, date); + Metadata md = AbstractSolrConnector.getMetadata(doc); + updateIdCache(id, md); try { ConcurrentUpdateSolrConnector.this.connector.add(doc); } catch (final OutOfMemoryError e) { @@ -134,15 +134,15 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } } } - - private ARC idCache; + + private ARC idCache; private BlockingQueue updateQueue; private BlockingQueue deleteQueue; private Thread deletionHandler, updateHandler; public ConcurrentUpdateSolrConnector(SolrConnector connector, int updateCapacity, int idCacheCapacity, int concurrency) { this.connector = connector; - this.idCache = new ConcurrentARC(idCacheCapacity, concurrency); // url hash to load time + this.idCache = new ConcurrentARC(idCacheCapacity, concurrency); // url hash to metadata (url and load date) this.updateQueue = new ArrayBlockingQueue(updateCapacity); this.deleteQueue = new LinkedBlockingQueue(); this.deletionHandler = null; @@ -192,16 +192,18 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { return null; } - private long existIdFromUpdateQueue(String id) { - if (this.updateQueue.size() == 0) return -1; + private Metadata existIdFromUpdateQueue(String id) { + if (this.updateQueue.size() == 0) return null; Iterator i = this.updateQueue.iterator(); while (i.hasNext()) { SolrInputDocument doc = i.next(); if (doc == null) break; String docID = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - if (docID != null && docID.equals(id)) return AbstractSolrConnector.getLoadDate(doc); + if (docID != null && 
docID.equals(id)) { + return AbstractSolrConnector.getMetadata(doc); + } } - return -1; + return null; } private void removeIdFromUpdateQueue(String id) { @@ -231,10 +233,10 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } } - private void updateIdCache(String id, long time) { + private void updateIdCache(final String id, final Metadata md) { if (id == null) return; if (MemoryControl.shortStatus()) this.idCache.clear(); - this.idCache.put(id, time); + this.idCache.put(id, md); } public void ensureAliveDeletionHandler() { @@ -361,25 +363,23 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } @Override - public long getLoadTime(String id) throws IOException { - Long date = this.idCache.get(id); - if (date != null) {cacheSuccessSign(); return date.longValue();} - if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return -1;} - long d = existIdFromUpdateQueue(id); - if (d >= 0) {cacheSuccessSign(); return d;} - d = this.connector.getLoadTime(id); - if (d >= 0) { - updateIdCache(id, d); - return d; - } - return -1; + public Metadata getMetadata(String id) throws IOException { + Metadata md = this.idCache.get(id); + if (md != null) {cacheSuccessSign(); return md;} + if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return null;} + md = existIdFromUpdateQueue(id); + if (md != null) {cacheSuccessSign(); return md;} + md = this.connector.getMetadata(id); + if (md == null) return null; + updateIdCache(id, md); + return md; } @Override public void add(SolrInputDocument solrdoc) throws IOException, SolrException { String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName()); removeIdFromDeleteQueue(id); - updateIdCache(id, AbstractSolrConnector.getLoadDate(solrdoc)); + updateIdCache(id, AbstractSolrConnector.getMetadata(solrdoc)); if (this.updateHandler.isAlive()) { try {this.updateQueue.put(solrdoc);} catch (final InterruptedException e) {} } else { @@ -392,7 +392,7 @@ public class 
ConcurrentUpdateSolrConnector implements SolrConnector { for (SolrInputDocument doc: solrdocs) { String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); removeIdFromDeleteQueue(id); - updateIdCache(id, AbstractSolrConnector.getLoadDate(doc)); + updateIdCache(id, AbstractSolrConnector.getMetadata(doc)); } if (this.updateHandler.isAlive()) { for (SolrInputDocument doc: solrdocs) try {this.updateQueue.put(doc);} catch (final InterruptedException e) {} @@ -407,7 +407,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { SolrInputDocument idoc = getFromUpdateQueue(id); if (idoc != null) {cacheSuccessSign(); return ClientUtils.toSolrDocument(idoc);} SolrDocument doc = this.connector.getDocumentById(id, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields)); - if (doc != null) updateIdCache(id, AbstractSolrConnector.getLoadDate(doc)); + if (doc != null) updateIdCache(id, AbstractSolrConnector.getMetadata(doc)); return doc; } diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 136bfa23e..bb052ec2b 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -396,24 +396,24 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo * @throws IOException */ @Override - public synchronized long getLoadTime(String id) { + public synchronized Metadata getMetadata(String id) { int responseCount = 0; DocListSearcher docListSearcher = null; try { docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); responseCount = docListSearcher.response.size(); - if (responseCount == 0) return -1; + if (responseCount == 0) return null; SolrIndexSearcher searcher = 
docListSearcher.request.getSearcher(); DocIterator iterator = docListSearcher.response.iterator(); //for (int i = 0; i < responseCount; i++) { Document doc = searcher.doc(iterator.nextDoc(), AbstractSolrConnector.SOLR_ID_and_LOAD_DATE_FIELDS); - if (doc == null) return -1; - return AbstractSolrConnector.getLoadDate(doc); + if (doc == null) return null; + return AbstractSolrConnector.getMetadata(doc); //} } catch (Throwable e) {} finally { if (docListSearcher != null) docListSearcher.close(); } - return -1; + return null; } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index b44bc68c2..a745a0841 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -394,19 +394,19 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public long getLoadTime(String id) throws IOException { - if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTime(id); - if (this.solr0 == null && this.solr1 != null) return this.solr1.getLoadTime(id); - if (this.solr0 == null && this.solr1 == null) return -1; - return Math.max(this.solr0.getLoadTime(id), this.solr1.getLoadTime(id)); + public Metadata getMetadata(String id) throws IOException { + if (this.solr0 != null && this.solr1 == null) return this.solr0.getMetadata(id); + if (this.solr0 == null && this.solr1 != null) return this.solr1.getMetadata(id); + if (this.solr0 == null && this.solr1 == null) return null; + Metadata md0 = this.solr0.getMetadata(id); + Metadata md1 = this.solr1.getMetadata(id); + if (md0 == null) return md1; + if (md1 == null) return md0; + long date = Math.max(md0.date, md1.date); + assert md0.url.equals(md1.url); + return new Metadata(md0.url, date); } - /* - @Override - public BlockingQueue concurrentDocumentsByQuery(String querystring, 
int offset, int maxcount, long maxtime, int buffersize, String... fields) { - return null; - } - */ @Override public BlockingQueue concurrentIDsByQuery(String querystring, int offset, int maxcount, long maxtime) { if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, offset, maxcount, maxtime); diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 5b61ed310..125fd32ac 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -36,6 +36,15 @@ import org.apache.solr.common.params.ModifiableSolrParams; public interface SolrConnector extends Iterable /* Iterable of document IDs */ { + public static class Metadata { + public long date; + public String url; + public Metadata(final String url, final long date) { + this.url = url; + this.date = date; + } + } + /** * clear all caches: inside solr and ouside solr within the implementations of this interface */ @@ -110,11 +119,11 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * check if a given document, identified by url hash as document id exists * @param id the url hash and document id - * @return the load time if any entry in solr exists, -1 otherwise + * @return the metadata (url and load date) if any entry in solr exists, null otherwise * @throws IOException */ - public long getLoadTime(final String id) throws IOException; - + public Metadata getMetadata(final String id) throws IOException; + /** * add a solr input document * @param solrdoc diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 1339cd2bc..b181765c4 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -176,8 +176,13 @@ public class ErrorCache { } public boolean exists(final byte[] 
urlHash) { + String urlHashString = ASCII.String(urlHash); try { - final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName()); + // first try to check if the document exists at all. + long loaddate = this.fulltext.getLoadTime(urlHashString); + if (loaddate < 0) return false; + // then load the fail reason, if exists + final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName()); if (doc == null) return false; // check if the document contains a value in the field CollectionSchema.failreason_s Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName()); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 5dd2bcb18..d679a91e8 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -269,21 +269,6 @@ public final class Fulltext { if (this.writeWebgraph) getWebgraphConnector().commit(softCommit); } - public DigestURL getURL(final String urlHashS) { - if (urlHashS == null || this.getDefaultConnector() == null) return null; - - try { - SolrDocument doc = this.getDefaultConnector().getDocumentById(urlHashS, CollectionSchema.sku.getSolrFieldName()); - Object u = doc == null ? 
null : doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); - if (u == null) return null; - assert u instanceof String : "u = " + u.toString(); - if (u instanceof String) return new DigestURL((String) u, ASCII.getBytes(urlHashS)); - return null; - } catch (final IOException e) { - return null; - } - } - public URIMetadataNode getMetadata(final WeakPriorityBlockingQueue.Element element) { if (element == null) return null; WordReferenceVars wre = element.getElement(); @@ -483,6 +468,18 @@ public final class Fulltext { return false; } + public DigestURL getURL(final String urlHash) { + if (urlHash == null || this.getDefaultConnector() == null) return null; + + try { + SolrConnector.Metadata md = this.getDefaultConnector().getMetadata(urlHash); + if (md == null) return null; + return new DigestURL(md.url, ASCII.getBytes(urlHash)); + } catch (final IOException e) { + return null; + } + } + /** * get the load time of a resource. * @param urlHash @@ -491,7 +488,9 @@ public final class Fulltext { public long getLoadTime(final String urlHash) { if (urlHash == null) return -1l; try { - return this.getDefaultConnector().getLoadTime(urlHash); + SolrConnector.Metadata md = this.getDefaultConnector().getMetadata(urlHash); + if (md == null) return -1; + return md.date; } catch (final Throwable e) { ConcurrentLog.logException(e); }