diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index b1ea16360..0aaad4f2e 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -82,24 +82,32 @@ public abstract class AbstractSolrConnector implements SolrConnector { } protected final static int pagesize = 100; - protected static long getLoadDate(final Object doc) { + protected static Metadata getMetadata(final Object doc) { + if (doc == null) return null; Object d = null; - if (doc != null) { - if (doc instanceof SolrInputDocument) d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); - if (doc instanceof SolrDocument) d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); - if (doc instanceof org.apache.lucene.document.Document) { - String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName()); - try { - d = Long.parseLong(ds); - } catch (NumberFormatException e) { - d = -1l; - } + String url = null; + if (doc instanceof SolrInputDocument) { + d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); + url = (String) ((SolrInputDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName()); + } + if (doc instanceof SolrDocument) { + d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); + url = (String) ((SolrDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName()); + } + if (doc instanceof org.apache.lucene.document.Document) { + String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName()); + try { + d = Long.parseLong(ds); + } catch (NumberFormatException e) { + d = -1l; } + url = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.sku.getSolrFieldName()); } - if (d == null) return -1l; - if (d instanceof Long) return ((Long) d).longValue(); - if (d instanceof Date) return ((Date) d).getTime(); - return -1l; + if (d == null) return null; + long date = -1; + if (d instanceof Long) date = ((Long) d).longValue(); + if (d instanceof Date) date = ((Date) d).getTime(); + return new Metadata(url, date); } /** @@ -239,11 +247,11 @@ public abstract class AbstractSolrConnector implements SolrConnector { /** * check if a given document, identified by url hash as document id exists * @param id the url hash and document id - * @return the load date if any entry in solr exists, -1 otherwise + * @return metadata if any entry in solr exists, null otherwise * @throws IOException */ @Override - public long getLoadTime(String id) throws IOException { + public Metadata getMetadata(String id) throws IOException { // construct raw query final SolrQuery params = new SolrQuery(); //params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\""); @@ -253,15 +261,15 @@ public abstract class AbstractSolrConnector implements SolrConnector { params.setStart(0); params.setFacet(false); params.clearSorts(); - params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); + params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); params.setIncludeScore(false); // query the server final SolrDocumentList sdl = getDocumentListByParams(params); - if (sdl == null || sdl.getNumFound() <= 0) return -1; + if (sdl == null || sdl.getNumFound() <= 0) return null; SolrDocument doc = sdl.iterator().next(); - long d = getLoadDate(doc); - return d; + Metadata md = getMetadata(doc); + return md; } /** diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java index a696e1fa0..113f8cb06 100644 --- a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java @@ -89,8 +89,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { Collection docs = new ArrayList(getmore + 1); docs.add(doc); String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - long date = AbstractSolrConnector.getLoadDate(doc); - updateIdCache(id, date); + Metadata md = AbstractSolrConnector.getMetadata(doc); + updateIdCache(id, md); for (int i = 0; i < getmore; i++) { SolrInputDocument d = ConcurrentUpdateSolrConnector.this.updateQueue.take(); if (d == POISON_DOCUMENT) { @@ -99,8 +99,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } docs.add(d); id = (String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()); - date = AbstractSolrConnector.getLoadDate(d); - updateIdCache(id, date); + md = AbstractSolrConnector.getMetadata(d); + updateIdCache(id, md); } //ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending " + docs.size() + " documents to solr"); try { @@ -112,8 +112,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { // if there is only a single document, send this directly to solr //ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending one document to solr"); String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - long date = AbstractSolrConnector.getLoadDate(doc); - updateIdCache(id, date); + Metadata md = AbstractSolrConnector.getMetadata(doc); + updateIdCache(id, md); try { ConcurrentUpdateSolrConnector.this.connector.add(doc); } catch (final OutOfMemoryError e) { @@ -134,15 +134,15 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } } } - - private ARC idCache; + + private ARC idCache; private BlockingQueue updateQueue; private BlockingQueue deleteQueue; private Thread deletionHandler, updateHandler; public ConcurrentUpdateSolrConnector(SolrConnector connector, int updateCapacity, int idCacheCapacity, int concurrency) { this.connector = connector; - this.idCache = new ConcurrentARC(idCacheCapacity, concurrency); // url hash to load time + this.idCache = new ConcurrentARC(idCacheCapacity, concurrency); // url hash to load time this.updateQueue = new ArrayBlockingQueue(updateCapacity); this.deleteQueue = new LinkedBlockingQueue(); this.deletionHandler = null; @@ -192,16 +192,18 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { return null; } - private long existIdFromUpdateQueue(String id) { - if (this.updateQueue.size() == 0) return -1; + private Metadata existIdFromUpdateQueue(String id) { + if (this.updateQueue.size() == 0) return null; Iterator i = this.updateQueue.iterator(); while (i.hasNext()) { SolrInputDocument doc = i.next(); if (doc == null) break; String docID = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - if (docID != null && docID.equals(id)) return AbstractSolrConnector.getLoadDate(doc); + if (docID != null && docID.equals(id)) { + return AbstractSolrConnector.getMetadata(doc); + } } - return -1; + return null; } private void removeIdFromUpdateQueue(String id) { @@ -231,10 +233,10 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } } - private void updateIdCache(String id, long time) { + private void updateIdCache(final String id, final Metadata md) { if (id == null) return; if (MemoryControl.shortStatus()) this.idCache.clear(); - this.idCache.put(id, time); + this.idCache.put(id, md); } public void ensureAliveDeletionHandler() { @@ -361,25 +363,23 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { } @Override - public long getLoadTime(String id) throws IOException { - Long date = this.idCache.get(id); - if (date != null) {cacheSuccessSign(); return date.longValue();} - if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return -1;} - long d = existIdFromUpdateQueue(id); - if (d >= 0) {cacheSuccessSign(); return d;} - d = this.connector.getLoadTime(id); - if (d >= 0) { - updateIdCache(id, d); - return d; - } - return -1; + public Metadata getMetadata(String id) throws IOException { + Metadata md = this.idCache.get(id); + if (md != null) {cacheSuccessSign(); return md;} + if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return null;} + md = existIdFromUpdateQueue(id); + if (md != null) {cacheSuccessSign(); return md;} + md = this.connector.getMetadata(id); + if (md == null) return null; + updateIdCache(id, md); + return md; } @Override public void add(SolrInputDocument solrdoc) throws IOException, SolrException { String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName()); removeIdFromDeleteQueue(id); - updateIdCache(id, AbstractSolrConnector.getLoadDate(solrdoc)); + updateIdCache(id, AbstractSolrConnector.getMetadata(solrdoc)); if (this.updateHandler.isAlive()) { try {this.updateQueue.put(solrdoc);} catch (final InterruptedException e) {} } else { @@ -392,7 +392,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { for (SolrInputDocument doc: solrdocs) { String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); removeIdFromDeleteQueue(id); - updateIdCache(id, AbstractSolrConnector.getLoadDate(doc)); + updateIdCache(id, AbstractSolrConnector.getMetadata(doc)); } if (this.updateHandler.isAlive()) { for (SolrInputDocument doc: solrdocs) try {this.updateQueue.put(doc);} catch (final InterruptedException e) {} @@ -407,7 +407,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { SolrInputDocument idoc = getFromUpdateQueue(id); if (idoc != null) {cacheSuccessSign(); return ClientUtils.toSolrDocument(idoc);} SolrDocument doc = this.connector.getDocumentById(id, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields)); - if (doc != null) updateIdCache(id, AbstractSolrConnector.getLoadDate(doc)); + if (doc != null) updateIdCache(id, AbstractSolrConnector.getMetadata(doc)); return doc; } diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 136bfa23e..bb052ec2b 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -396,24 +396,24 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo * @throws IOException */ @Override - public synchronized long getLoadTime(String id) { + public synchronized Metadata getMetadata(String id) { int responseCount = 0; DocListSearcher docListSearcher = null; try { docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName()); responseCount = docListSearcher.response.size(); - if (responseCount == 0) return -1; + if (responseCount == 0) return null; SolrIndexSearcher searcher = docListSearcher.request.getSearcher(); DocIterator iterator = docListSearcher.response.iterator(); //for (int i = 0; i < responseCount; i++) { Document doc = searcher.doc(iterator.nextDoc(), AbstractSolrConnector.SOLR_ID_and_LOAD_DATE_FIELDS); - if (doc == null) return -1; - return AbstractSolrConnector.getLoadDate(doc); + if (doc == null) return null; + return AbstractSolrConnector.getMetadata(doc); //} } catch (Throwable e) {} finally { if (docListSearcher != null) docListSearcher.close(); } - return -1; + return null; } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index b44bc68c2..a745a0841 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -394,19 +394,19 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public long getLoadTime(String id) throws IOException { - if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTime(id); - if (this.solr0 == null && this.solr1 != null) return this.solr1.getLoadTime(id); - if (this.solr0 == null && this.solr1 == null) return -1; - return Math.max(this.solr0.getLoadTime(id), this.solr1.getLoadTime(id)); + public Metadata getMetadata(String id) throws IOException { + if (this.solr0 != null && this.solr1 == null) return this.solr0.getMetadata(id); + if (this.solr0 == null && this.solr1 != null) return this.solr1.getMetadata(id); + if (this.solr0 == null && this.solr1 == null) return null; + Metadata md0 = this.solr0.getMetadata(id); + Metadata md1 = this.solr1.getMetadata(id); + if (md0 == null) return md1; + if (md1 == null) return md0; + long date = Math.max(md0.date, md1.date); + assert md0.url.equals(md1.url); + return new Metadata(md0.url, date); } - /* - @Override - public BlockingQueue concurrentDocumentsByQuery(String querystring, int offset, int maxcount, long maxtime, int buffersize, String... fields) { - return null; - } - */ @Override public BlockingQueue concurrentIDsByQuery(String querystring, int offset, int maxcount, long maxtime) { if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, offset, maxcount, maxtime); diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 5b61ed310..125fd32ac 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -36,6 +36,15 @@ import org.apache.solr.common.params.ModifiableSolrParams; public interface SolrConnector extends Iterable /* Iterable of document IDs */ { + public static class Metadata { + public long date; + public String url; + public Metadata(final String url, final long date) { + this.url = url; + this.date = date; + } + } + /** * clear all caches: inside solr and ouside solr within the implementations of this interface */ @@ -110,11 +119,11 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * check if a given document, identified by url hash as document id exists * @param id the url hash and document id - * @return the load time if any entry in solr exists, -1 otherwise + * @return the metadata (url and load data) if any entry in solr exists, null otherwise * @throws IOException */ - public long getLoadTime(final String id) throws IOException; - + public Metadata getMetadata(final String id) throws IOException; + /** * add a solr input document * @param solrdoc diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index 1339cd2bc..b181765c4 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -176,8 +176,13 @@ public class ErrorCache { } public boolean exists(final byte[] urlHash) { + String urlHashString = ASCII.String(urlHash); try { - final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName()); + // first try to check if the document exists at all. + long loaddate = this.fulltext.getLoadTime(urlHashString); + if (loaddate < 0) return false; + // then load the fail reason, if exists + final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName()); if (doc == null) return false; // check if the document contains a value in the field CollectionSchema.failreason_s Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName()); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 5dd2bcb18..d679a91e8 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -269,21 +269,6 @@ public final class Fulltext { if (this.writeWebgraph) getWebgraphConnector().commit(softCommit); } - public DigestURL getURL(final String urlHashS) { - if (urlHashS == null || this.getDefaultConnector() == null) return null; - - try { - SolrDocument doc = this.getDefaultConnector().getDocumentById(urlHashS, CollectionSchema.sku.getSolrFieldName()); - Object u = doc == null ? null : doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); - if (u == null) return null; - assert u instanceof String : "u = " + u.toString(); - if (u instanceof String) return new DigestURL((String) u, ASCII.getBytes(urlHashS)); - return null; - } catch (final IOException e) { - return null; - } - } - public URIMetadataNode getMetadata(final WeakPriorityBlockingQueue.Element element) { if (element == null) return null; WordReferenceVars wre = element.getElement(); @@ -483,6 +468,18 @@ public final class Fulltext { return false; } + public DigestURL getURL(final String urlHash) { + if (urlHash == null || this.getDefaultConnector() == null) return null; + + try { + SolrConnector.Metadata md = this.getDefaultConnector().getMetadata(urlHash); + if (md == null) return null; + return new DigestURL(md.url, ASCII.getBytes(urlHash)); + } catch (final IOException e) { + return null; + } + } + /** * get the load time of a resource. * @param urlHash @@ -491,7 +488,9 @@ public final class Fulltext { public long getLoadTime(final String urlHash) { if (urlHash == null) return -1l; try { - return this.getDefaultConnector().getLoadTime(urlHash); + SolrConnector.Metadata md = this.getDefaultConnector().getMetadata(urlHash); + if (md == null) return -1; + return md.date; } catch (final Throwable e) { ConcurrentLog.logException(e); }