changed strategy to test existence of documents in Solr: using the update time.

The reason for that is a better caching for the crawler double-check, which needs the update time for crawler steering.
pull/1/head
Michael Peter Christen 11 years ago
parent 790f103f32
commit 69391e5d9e
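The heart of the commit is an API swap: the boolean existsById/exists family is replaced by getLoadTime, which returns the document's load_date_dt as milliseconds since epoch, or -1 if no such document exists. A minimal sketch of the calling pattern before and after (maxAge is a hypothetical recrawl threshold, not part of this commit):

// before: existence only
if (!sb.index.exists(ASCII.String(url.hash()))) {
    // not indexed yet: stack the URL for crawling
}

// after: one lookup answers both existence and freshness
long loadTime = sb.index.getLoadTime(ASCII.String(url.hash()));
if (loadTime < 0) {
    // not indexed yet: stack the URL for crawling
} else if (System.currentTimeMillis() - loadTime > maxAge) {
    // indexed but stale: candidate for re-crawl
}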

@@ -70,7 +70,7 @@ public class HostBrowser {
LINK, INDEX, EXCLUDED, FAILED, RELOAD;
}
@SuppressWarnings({ "deprecation", "unchecked" })
@SuppressWarnings({ "unchecked" })
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
@@ -125,7 +125,7 @@ public class HostBrowser {
String load = post.get("load", "");
boolean wait = false;
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && !sb.index.exists(ASCII.String(pathURI.hash()))) {
if (loadRight && autoload && path.length() != 0 && pathURI != null && load.length() == 0 && sb.index.getLoadTime(ASCII.String(pathURI.hash())) < 0) {
// in case that the url does not exist and loading is wanted turn this request into a loading request
load = path;
wait = true;
@@ -144,7 +144,7 @@ public class HostBrowser {
));
prop.putHTML("result", reasonString == null ? ("added url to indexer: " + load) : ("not indexed url '" + load + "': " + reasonString));
if (wait) for (int i = 0; i < 30; i++) {
if (sb.index.exists(ASCII.String(url.hash()))) break;
if (sb.index.getLoadTime(ASCII.String(url.hash())) >= 0) break;
try {Thread.sleep(100);} catch (final InterruptedException e) {}
}
} catch (final MalformedURLException e) {

@@ -73,7 +73,6 @@ public class IndexControlRWIs_p {
private final static String errmsg = "not possible to compute word from hash";
@SuppressWarnings("deprecation")
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
@@ -281,7 +280,7 @@ public class IndexControlRWIs_p {
Reference iEntry;
while (urlIter.hasNext()) {
iEntry = urlIter.next();
if (!segment.fulltext().exists(ASCII.String(iEntry.urlhash()))) {
if (segment.fulltext().getLoadTime(ASCII.String(iEntry.urlhash())) < 0) {
try {
unknownURLEntries.put(iEntry.urlhash());
} catch (final SpaceExceededException e) {

@@ -292,12 +292,12 @@ public class Load_RSS_p {
ConcurrentLog.logException(e);
}
}
Map<String, HarvestProcess> existingurls = sb.urlExists(messages.keySet());
loop: for (final Map.Entry<String, RSSMessage> entry: messages.entrySet()) {
try {
final RSSMessage message = entry.getValue();
final DigestURL messageurl = new DigestURL(message.getLink());
if (existingurls.get(ASCII.String(messageurl.hash())) != null) continue loop;
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
if (harvestProcess != null) continue loop;
list.add(messageurl);
RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
} catch (final IOException e) {
@@ -344,7 +344,6 @@ public class Load_RSS_p {
continue;
}
}
Map<String, HarvestProcess> ids = sb.urlExists(urls.keySet());
int i = 0;
for (final Hit item: feed) {
@@ -353,7 +352,8 @@
author = item.getAuthor();
if (author == null) author = item.getCopyright();
pubDate = item.getPubDate();
prop.put("showitems_item_" + i + "_state", ids.get(ASCII.String(messageurl.hash())) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageurl.hash()));
prop.put("showitems_item_" + i + "_state", harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.put("showitems_item_" + i + "_state_count", i);
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);

@@ -232,10 +232,9 @@ public final class transferRWI {
testids.add(ASCII.String(urlHash));
received++;
}
Set<String> existing = sb.index.fulltext().exists(testids);
for (String id: testids) {
try {
if (existing.contains(id)) {
if (sb.index.fulltext().getLoadTime(id) >= 0) {
knownURL.put(ASCII.getBytes(id));
} else {
unknownURL.put(ASCII.getBytes(id));

@@ -30,7 +30,6 @@ import java.io.IOException;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.encoding.ASCII;
@@ -144,10 +143,9 @@ public final class transferURL {
lEm.put(ASCII.String(lEntry.hash()), lEntry);
}
Set<String> doubles = sb.index.exists(lEm.keySet());
doublecheck = doubles.size();
doublecheck = 0;
for (String id : lEm.keySet()) {
if (!doubles.contains(id)) {
if (sb.index.getLoadTime(id) < 0) {
lEntry = lEm.get(id);
// write entry to database
@@ -160,6 +158,8 @@ public final class transferURL {
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
} else {
doublecheck++;
}
}

@@ -21,6 +21,7 @@
package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@@ -45,11 +46,20 @@ import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.ModifiableSolrParams;
public abstract class AbstractSolrConnector implements SolrConnector {
protected static Set<String> SOLR_ID_FIELDS = new HashSet<String>();
protected static Set<String> SOLR_ID_and_LOAD_DATE_FIELDS = new HashSet<String>();
static {
SOLR_ID_FIELDS.add(CollectionSchema.id.getSolrFieldName());
SOLR_ID_and_LOAD_DATE_FIELDS.add(CollectionSchema.id.getSolrFieldName());
SOLR_ID_and_LOAD_DATE_FIELDS.add(CollectionSchema.load_date_dt.getSolrFieldName());
}
public final static SolrDocument POISON_DOCUMENT = new SolrDocument();
public final static String POISON_ID = "POISON_ID";
public final static String CATCHALL_TERM = "*:*";
@@ -72,6 +82,42 @@ public abstract class AbstractSolrConnector implements SolrConnector {
}
protected final static int pagesize = 100;
protected static long getLoadDate(final Object doc) {
Object d = null;
if (doc != null) {
if (doc instanceof SolrInputDocument) d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
if (doc instanceof SolrDocument) d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
if (doc instanceof org.apache.lucene.document.Document) {
String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName());
try {
d = Long.parseLong(ds);
} catch (NumberFormatException e) {
d = -1l;
}
}
}
if (d == null) return -1l;
if (d instanceof Long) return ((Long) d).longValue();
if (d instanceof Date) return ((Date) d).getTime();
return -1l;
}
/**
* check if fields contain id and load_date_dt date
* @param fields
* @return fields with added id and load_date_dt if necessary
*/
protected static String[] ensureEssentialFieldsIncluded(String[] fields) {
if (fields != null && fields.length > 0) {
Set<String> f = new HashSet<String>();
for (String s: fields) f.add(s);
f.add(CollectionSchema.id.getSolrFieldName());
f.add(CollectionSchema.load_date_dt.getSolrFieldName());
fields = f.toArray(new String[f.size()]);
}
return fields;
}
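A short usage sketch (the "title" field is illustrative): without this helper, a caller asking for a narrow field list would get documents lacking load_date_dt, so getLoadDate() above would report -1 for documents that do exist. A null or empty field list is passed through unchanged because Solr then returns all stored fields anyway.

String[] fields = ensureEssentialFieldsIncluded(new String[] {"title"});
// fields now contains "title", "id" and "load_date_dt" (order undefined, the array is rebuilt from a set)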
/**
* Get a query result from solr as a stream of documents.
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
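A hedged consumer sketch for this poison-pill protocol; the concurrentDocumentsByQuery call and its signature are assumed from the surrounding class, and process() is a hypothetical handler:

BlockingQueue<SolrDocument> docs = connector.concurrentDocumentsByQuery(CATCHALL_TERM, 0, Integer.MAX_VALUE, 60000, 100);
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // take() may throw InterruptedException
    process(doc); // hypothetical handler
}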
@@ -191,62 +237,31 @@ public abstract class AbstractSolrConnector implements SolrConnector {
}
/**
* check if a given document, identified by url hash as ducument id exists
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return true if any entry in solr exists
* @return the load date if any entry in solr exists, -1 otherwise
* @throws IOException
*/
@Override
public boolean existsById(String id) throws IOException {
public long getLoadTime(String id) throws IOException {
// construct raw query
final SolrQuery params = new SolrQuery();
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
params.setQuery("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id);
//params.set("defType", "raw");
params.setRows(0);
params.setStart(0);
params.setFacet(false);
params.clearSorts();
params.setFields(CollectionSchema.id.getSolrFieldName());
params.setIncludeScore(false);
// query the server
return getDocumentCountByParams(params) > 0;
}
/**
* check a set of ids for existence.
* @param ids a collection of document ids
* @return a collection of a subset of the ids which exist in the index
* @throws IOException
*/
public Set<String> existsByIds(Set<String> ids) throws IOException {
if (ids == null || ids.size() == 0) return new HashSet<String>();
// construct raw query
final SolrQuery params = new SolrQuery();
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
StringBuilder sb = new StringBuilder(); // construct something like "({!raw f=id}Ij7B63g-gSHA) OR ({!raw f=id}PBcGI3g-gSHA)"
for (String id: ids) {
sb.append("({!raw f=").append(CollectionSchema.id.getSolrFieldName()).append('}').append(id).append(") OR ");
}
if (sb.length() > 0) sb.setLength(sb.length() - 4); // cut off the last 'or'
params.setQuery(sb.toString());
//params.set("defType", "raw");
params.setRows(ids.size()); // we want all lines
params.setRows(1);
params.setStart(0);
params.setFacet(false);
params.clearSorts();
params.setFields(CollectionSchema.id.getSolrFieldName());
params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
params.setIncludeScore(false);
// query the server
final SolrDocumentList docs = getDocumentListByParams(params);
// construct a new id list from that
HashSet<String> idsr = new HashSet<String>();
for (SolrDocument doc : docs) {
idsr.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
}
return idsr;
final SolrDocumentList sdl = getDocumentListByParams(params);
if (sdl == null || sdl.getNumFound() <= 0) return -1;
SolrDocument doc = sdl.iterator().next();
long d = getLoadDate(doc);
return d;
}
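Two details worth noting: rows changed from 0 to 1 because a pure existence test only needed numFound, while reading load_date_dt requires fetching one stored document; and the raw query parser ({!raw f=id}) still guarantees an exact, unanalyzed match on the id field. A usage sketch, reusing the illustrative hash from the removed existsByIds comment:

long t = connector.getLoadTime("Ij7B63g-gSHA");
// t == -1 -> no document with that id
// t >= 0  -> document exists; new Date(t) is its load_date_dt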
/**

@@ -23,17 +23,15 @@ package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.ARH;
import net.yacy.cora.storage.ConcurrentARH;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.schema.CollectionSchema;
@@ -68,7 +66,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
try {
removeIdFromUpdateQueue(id);
ConcurrentUpdateSolrConnector.this.connector.deleteById(id);
ConcurrentUpdateSolrConnector.this.idCache.delete(id);
ConcurrentUpdateSolrConnector.this.idCache.remove(id);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
@@ -90,7 +88,9 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
// accumulate a collection of documents because that is better to send at once to a remote server
Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(getmore + 1);
docs.add(doc);
updateIdCache((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
long date = AbstractSolrConnector.getLoadDate(doc);
updateIdCache(id, date);
for (int i = 0; i < getmore; i++) {
SolrInputDocument d = ConcurrentUpdateSolrConnector.this.updateQueue.take();
if (d == POISON_DOCUMENT) {
@@ -98,7 +98,9 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
break;
}
docs.add(d);
updateIdCache((String) d.getFieldValue(CollectionSchema.id.getSolrFieldName()));
id = (String) d.getFieldValue(CollectionSchema.id.getSolrFieldName());
date = AbstractSolrConnector.getLoadDate(d);
updateIdCache(id, date);
}
//ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending " + docs.size() + " documents to solr");
try {
@@ -109,7 +111,9 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
} else {
// if there is only a single document, send this directly to solr
//ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending one document to solr");
updateIdCache((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
long date = AbstractSolrConnector.getLoadDate(doc);
updateIdCache(id, date);
try {
ConcurrentUpdateSolrConnector.this.connector.add(doc);
} catch (final OutOfMemoryError e) {
@@ -131,14 +135,14 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
}
private ARH<String> idCache;
private ARC<String, Long> idCache;
private BlockingQueue<SolrInputDocument> updateQueue;
private BlockingQueue<String> deleteQueue;
private Thread deletionHandler, updateHandler;
public ConcurrentUpdateSolrConnector(SolrConnector connector, int updateCapacity, int idCacheCapacity, int concurrency) {
this.connector = connector;
this.idCache = new ConcurrentARH<String>(idCacheCapacity, concurrency);
this.idCache = new ConcurrentARC<String, Long>(idCacheCapacity, concurrency); // url hash to load time
this.updateQueue = new ArrayBlockingQueue<SolrInputDocument>(updateCapacity);
this.deleteQueue = new LinkedBlockingQueue<String>();
this.deletionHandler = null;
@@ -162,7 +166,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
* used for debugging
*/
private static void cacheSuccessSign() {
//Log.logInfo("ConcurrentUpdate", "**** cache hit");
//ConcurrentLog.info("ConcurrentUpdate", "**** cache hit");
}
private boolean existIdFromDeleteQueue(String id) {
@@ -188,16 +192,16 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
return null;
}
private boolean existIdFromUpdateQueue(String id) {
if (this.updateQueue.size() == 0) return false;
private long existIdFromUpdateQueue(String id) {
if (this.updateQueue.size() == 0) return -1;
Iterator<SolrInputDocument> i = this.updateQueue.iterator();
while (i.hasNext()) {
SolrInputDocument doc = i.next();
if (doc == null) break;
String docID = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
if (docID != null && docID.equals(id)) return true;
if (docID != null && docID.equals(id)) return AbstractSolrConnector.getLoadDate(doc);
}
return false;
return -1;
}
private void removeIdFromUpdateQueue(String id) {
@@ -227,10 +231,10 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
}
private void updateIdCache(String id) {
private void updateIdCache(String id, long time) {
if (id == null) return;
if (MemoryControl.shortStatus()) this.idCache.clear();
this.idCache.add(id);
this.idCache.put(id, time);
}
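The switch from ARH (a set of ids) to ARC (an id-to-load-time map) is what makes the new contract cacheable: a cache hit now answers both "does it exist" and "when was it loaded". The lookup order implemented by getLoadTime below, as a sketch:

// 1. idCache.get(id)            -> cached load time, answered from memory
// 2. existIdFromDeleteQueue(id) -> pending delete means gone: return -1
// 3. existIdFromUpdateQueue(id) -> pending add: return the queued document's load_date_dt
// 4. connector.getLoadTime(id)  -> ask Solr and cache a non-negative answer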
public void ensureAliveDeletionHandler() {
@@ -312,7 +316,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override
public void deleteById(String id) throws IOException {
removeIdFromUpdateQueue(id);
this.idCache.delete(id);
this.idCache.remove(id);
if (this.deletionHandler.isAlive()) {
try {this.deleteQueue.put(id);} catch (final InterruptedException e) {}
} else {
@@ -324,7 +328,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
public void deleteByIds(Collection<String> ids) throws IOException {
for (String id: ids) {
removeIdFromUpdateQueue(id);
this.idCache.delete(id);
this.idCache.remove(id);
}
if (this.deletionHandler.isAlive()) {
for (String id: ids) try {this.deleteQueue.put(id);} catch (final InterruptedException e) {}
@@ -335,8 +339,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override
public void deleteByQuery(final String querystring) throws IOException {
new Thread() {
public void run() {
//new Thread() {
// public void run() {
ConcurrentUpdateSolrConnector.this.idCache.clear();
try {
ConcurrentUpdateSolrConnector.this.connector.deleteByQuery(querystring);
@@ -345,47 +349,30 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
ConcurrentLog.severe("ConcurrentUpdateSolrConnector", e.getMessage(), e);
}
ConcurrentUpdateSolrConnector.this.connector.commit(true);
}
}.start();
// }
//}.start();
}
@Override
public boolean existsById(String id) throws IOException {
if (this.idCache.contains(id)) {cacheSuccessSign(); return true;}
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return false;}
if (existIdFromUpdateQueue(id)) {cacheSuccessSign(); return true;}
if (this.connector.existsById(id)) {
updateIdCache(id);
return true;
public long getLoadTime(String id) throws IOException {
Long date = this.idCache.get(id);
if (date != null) {cacheSuccessSign(); return date.longValue();}
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return -1;}
long d = existIdFromUpdateQueue(id);
if (d >= 0) {cacheSuccessSign(); return d;}
d = this.connector.getLoadTime(id);
if (d >= 0) {
updateIdCache(id, d);
return d;
}
return false;
}
@Override
public Set<String> existsByIds(Set<String> ids) throws IOException {
HashSet<String> e = new HashSet<String>();
if (ids == null || ids.size() == 0) return e;
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : e;
Set<String> idsC = new HashSet<String>();
for (String id: ids) {
if (this.idCache.contains(id)) {cacheSuccessSign(); e.add(id); continue;}
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); continue;}
if (existIdFromUpdateQueue(id)) {cacheSuccessSign(); e.add(id); continue;}
idsC.add(id);
}
Set<String> e1 = this.connector.existsByIds(idsC);
for (String id1: e1) {
updateIdCache(id1);
}
e.addAll(e1);
return e;
return -1;
}
@Override
public void add(SolrInputDocument solrdoc) throws IOException, SolrException {
String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName());
removeIdFromDeleteQueue(id);
updateIdCache(id);
updateIdCache(id, AbstractSolrConnector.getLoadDate(solrdoc));
if (this.updateHandler.isAlive()) {
try {this.updateQueue.put(solrdoc);} catch (final InterruptedException e) {}
} else {
@@ -398,7 +385,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
for (SolrInputDocument doc: solrdocs) {
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
removeIdFromDeleteQueue(id);
updateIdCache(id);
updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
}
if (this.updateHandler.isAlive()) {
for (SolrInputDocument doc: solrdocs) try {this.updateQueue.put(doc);} catch (final InterruptedException e) {}
@@ -406,14 +393,14 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
this.connector.add(solrdocs);
}
}
@Override
public SolrDocument getDocumentById(String id, String... fields) throws IOException {
public SolrDocument getDocumentById(final String id, String... fields) throws IOException {
if (existIdFromDeleteQueue(id)) return null;
SolrInputDocument idoc = getFromUpdateQueue(id);
if (idoc != null) {cacheSuccessSign(); return ClientUtils.toSolrDocument(idoc);}
SolrDocument doc = this.connector.getDocumentById(id, fields);
if (doc != null) updateIdCache(id);
SolrDocument doc = this.connector.getDocumentById(id, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
if (doc != null) updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
return doc;
}
@@ -436,7 +423,16 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override
public SolrDocumentList getDocumentListByQuery(String querystring, int offset, int count, String... fields) throws IOException, SolrException {
return this.connector.getDocumentListByQuery(querystring, offset, count, fields);
SolrDocumentList sdl = this.connector.getDocumentListByQuery(querystring, offset, count, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
/*
Iterator<SolrDocument> i = sdl.iterator();
while (i.hasNext()) {
SolrDocument doc = i.next();
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
if (doc != null) updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
}
*/
return sdl;
}
@Override

@@ -23,10 +23,8 @@ package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
@@ -73,11 +71,6 @@ import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
public class EmbeddedSolrConnector extends SolrServerConnector implements SolrConnector {
private static Set<String> SOLR_ID_FIELDS = new HashSet<String>();
static {
SOLR_ID_FIELDS.add(CollectionSchema.id.getSolrFieldName());
}
public static final String SELECT = "/select";
public static final String CONTEXT = "/solr";
@@ -385,38 +378,33 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
return numFound;
}
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return the load date if any entry in solr exists, -1 otherwise
* @throws IOException
*/
@Override
public synchronized boolean existsById(String id) {
return getCountByQuery("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id) > 0;
}
@Override
public synchronized Set<String> existsByIds(Set<String> ids) {
if (ids == null || ids.size() == 0) return new HashSet<String>();
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
Set<String> idsr = new TreeSet<String>();
final SolrQuery params = new SolrQuery();
params.setRows(0);
params.setStart(0);
params.setFacet(false);
params.clearSorts();
params.setFields(CollectionSchema.id.getSolrFieldName());
params.setIncludeScore(false);
SolrQueryRequest req = new SolrQueryRequestBase(this.core, params){};
req.getContext().put("path", SELECT);
req.getContext().put("webapp", CONTEXT);
public synchronized long getLoadTime(String id) {
int responseCount = 0;
SolrIndexSearcher searcher = null;
DocListSearcher docListSearcher = null;
try {
for (String id: ids) {
params.setQuery("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id);
SolrQueryResponse rsp = new SolrQueryResponse();
this.requestHandler.handleRequest(req, rsp);
DocList response = ((ResultContext) rsp.getValues().get("response")).docs;
if (response.matches() > 0) idsr.add(id);
}
} finally {
req.close();
docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
responseCount = docListSearcher.response.size();
if (responseCount == 0) return -1;
searcher = docListSearcher.request.getSearcher();
DocIterator iterator = docListSearcher.response.iterator();
//for (int i = 0; i < responseCount; i++) {
Document doc = searcher.doc(iterator.nextDoc(), AbstractSolrConnector.SOLR_ID_and_LOAD_DATE_FIELDS);
if (doc == null) return -1;
return AbstractSolrConnector.getLoadDate(doc);
//}
} catch (Throwable e) {} finally {
if (searcher != null) try {searcher.close();} catch (IOException e) {}
if (docListSearcher != null) docListSearcher.close();
}
return idsr;
return -1;
}
@Override

@@ -23,9 +23,7 @@ package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicLong;
@@ -391,20 +389,11 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public boolean existsById(String id) throws IOException {
return (this.solr0 != null && this.solr0.existsById(id)) || (this.solr1 != null && this.solr1.existsById(id));
}
@Override
public Set<String> existsByIds(Set<String> ids) throws IOException {
if (ids == null || ids.size() == 0) return new HashSet<String>();
if (ids.size() == 1) return existsById(ids.iterator().next()) ? ids : new HashSet<String>();
if (this.solr0 != null && this.solr1 == null) return this.solr0.existsByIds(ids);
if (this.solr0 == null && this.solr1 != null) return this.solr1.existsByIds(ids);
Set<String> s = new HashSet<String>();
s.addAll(this.solr0.existsByIds(ids));
s.addAll(this.solr1.existsByIds(ids));
return s;
public long getLoadTime(String id) throws IOException {
if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTime(id);
if (this.solr0 == null && this.solr1 != null) return this.solr1.getLoadTime(id);
if (this.solr0 == null && this.solr1 == null) return -1;
return Math.max(this.solr0.getLoadTime(id), this.solr1.getLoadTime(id));
}
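When both mirrors are configured, the two lookups are merged with Math.max, so the freshest copy wins and a miss in one mirror cannot hide a hit in the other. The cases, as a sketch:

// solr0: -1, solr1: -1 -> -1          (in neither index)
// solr0: -1, solr1:  t ->  t          (in one mirror only)
// solr0: t0, solr1: t1 -> max(t0, t1) (in both; the newest load time wins)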
/*

@@ -23,7 +23,6 @@ package net.yacy.cora.federate.solr.connector;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.sorting.ReversibleScoreMap;
@@ -105,18 +104,10 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return true if any entry in solr exists
* @return the load time if any entry in solr exists, -1 otherwise
* @throws IOException
*/
public boolean existsById(final String id) throws IOException;
/**
* check a set of ids for existence.
* @param ids a collection of document ids
* @return a collection of a subset of the ids which exist in the index
* @throws IOException
*/
public Set<String> existsByIds(Set<String> ids) throws IOException;
public long getLoadTime(final String id) throws IOException;
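The contract, stated as a sketch for any connector c and document id h:

// c.getLoadTime(h) == -1     <=> no document with id h exists
// c.getLoadTime(h) == d >= 0 <=> the document exists and was loaded at new Date(d)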
/**
* add a solr input document

@@ -399,8 +399,8 @@ public final class CrawlStacker {
// check if the url is double registered
String urlhash = ASCII.String(url.hash());
final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(urlhash); // TODO: combine the exists-query with this one
if (oldDate == null) {
final long oldTime = this.indexSegment.fulltext().getLoadTime(urlhash);
if (oldTime < 0) {
if (dbocc != null) {
// do double-check
if (dbocc == HarvestProcess.ERRORS) {
@@ -410,12 +410,13 @@ public final class CrawlStacker {
return "double in: " + dbocc.toString();
}
} else {
final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
final boolean recrawl = profile.recrawlIfOlder() > oldTime;
if (recrawl) {
if (CrawlStacker.log.isInfo())
CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
((System.currentTimeMillis() - oldTime) / 60000 / 60 / 24) + " days ago.");
} else {
Date oldDate = new Date(oldTime);
if (dbocc == null) {
return "double in: LURL-DB, oldDate = " + oldDate.toString();
}
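The recrawl rule keeps its old semantics under the new type: recrawlIfOlder() is a cutoff in milliseconds since epoch, and a document loaded before that cutoff is fetched again. A worked sketch (numbers illustrative):

// cutoff  = profile.recrawlIfOlder()  e.g. now - 7 days
// oldTime = indexSegment.fulltext().getLoadTime(urlhash)
// oldTime <  cutoff -> recrawl == true  -> stack the URL again
// oldTime >= cutoff -> recrawl == false -> rejected as "double in: LURL-DB"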

@@ -107,9 +107,9 @@ public class RSSLoader extends Thread {
ConcurrentLog.logException(e);
}
}
Map<String, HarvestProcess> existingids = sb.urlExists(urlmap.keySet());
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
if (existingids.get(e.getKey()) != null) continue;
HarvestProcess harvestProcess = sb.urlExists(e.getKey());
if (harvestProcess != null) continue;
list.add(e.getValue());
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;

@@ -82,7 +82,6 @@ public class SitemapImporter extends Thread {
// check if the url is known and needs to be recrawled
Date lastMod = entry.lastmod(null);
if (lastMod != null) {
@SuppressWarnings("deprecation")
final HarvestProcess dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date

@@ -174,11 +174,10 @@ public class Transmission {
}
testids.add(ASCII.String(e.urlhash()));
}
Set<String> existingids = Transmission.this.segment.fulltext().exists(testids);
i = c.entries();
while (i.hasNext()) {
final WordReference e = i.next();
if (existingids.contains(ASCII.String(e.urlhash()))) {
if (Transmission.this.segment.fulltext().getLoadTime(ASCII.String(e.urlhash())) >= 0) {
this.references.put(e.urlhash());
} else {
notFoundx.add(e.urlhash());

@@ -1596,26 +1596,11 @@ public final class Switchboard extends serverSwitch {
* @param hash
* @return if it exists, the name of the database is returned; if it does not exist, null is returned
*/
@Deprecated
public HarvestProcess urlExists(final String hash) {
if (this.index.exists(hash)) return HarvestProcess.LOADED;
if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
return this.crawlQueues.exists(ASCII.getBytes(hash));
}
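The simplified decision table of this method, as a sketch:

// index.getLoadTime(hash) >= 0          -> HarvestProcess.LOADED
// else crawlQueues.exists(hash) != null -> the HarvestProcess of that queue
// else                                  -> null (unknown everywhere)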
/**
* tests if hashes occur in any database.
* @param ids a collection of url hashes
* @return a map from the hash id to: if it exists, the name of the database, otherwise null
*/
public Map<String, HarvestProcess> urlExists(final Set<String> ids) {
Set<String> e = this.index.exists(ids);
Map<String, HarvestProcess> m = new HashMap<String, HarvestProcess>();
for (String id: ids) {
m.put(id, e.contains(id) ? HarvestProcess.LOADED : this.crawlQueues.exists(ASCII.getBytes(id)));
}
return m;
}
public void urlRemove(final Segment segment, final byte[] hash) {
segment.fulltext().remove(hash);
ResultURLs.remove(ASCII.String(hash));
@@ -2990,8 +2975,7 @@ public final class Switchboard extends serverSwitch {
// stacking may fail because of double occurrences of that url. Therefore
// we must wait here until the url has actually disappeared
int t = 100;
Set<String> ids = new HashSet<String>(1); ids.add(ASCII.String(urlhash));
while (t-- > 0 && this.index.exists(ids).size() > 0) {
while (t-- > 0 && this.index.getLoadTime(ASCII.String(urlhash)) >= 0) {
try {Thread.sleep(100);} catch (final InterruptedException e) {}
ConcurrentLog.fine("Switchboard", "STACKURL: waiting for deletion, t=" + t);
//if (t == 20) this.index.fulltext().commit(true);
@@ -3106,11 +3090,10 @@ public final class Switchboard extends serverSwitch {
if (searchEvent != null) {
for (String id: urlmap.keySet()) searchEvent.addHeuristic(ASCII.getBytes(id), heuristicName, true);
}
final Set<String> existing = doublecheck ? this.index.exists(urlmap.keySet()) : null;
final List<Request> requests = new ArrayList<Request>();
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
final String urlName = e.getValue().toNormalform(true);
if (doublecheck && existing.contains(e.getKey())) {
if (doublecheck && this.index.getLoadTime(e.getKey()) >= 0) {
this.log.info("addToIndex: double " + urlName);
continue;
}
@@ -3183,9 +3166,8 @@ public final class Switchboard extends serverSwitch {
public void addToCrawler(final Collection<DigestURL> urls, final boolean asglobal) {
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
Set<String> existingids = this.index.exists(urlmap.keySet());
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
if (existingids.contains(e.getKey())) continue; // double
if (this.index.getLoadTime(e.getKey()) >= 0) continue; // double
DigestURL url = e.getValue();
final Request request = this.loader.request(url, true, true);
final CrawlProfile profile = this.crawler.get(ASCII.getBytes(request.profileHandle()));

@@ -32,7 +32,6 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -273,7 +272,7 @@ public final class Fulltext {
getDefaultConnector().commit(softCommit);
if (this.writeWebgraph) getWebgraphConnector().commit(softCommit);
}
/*
public Date getLoadDate(final String urlHash) {
if (urlHash == null) return null;
try {
@@ -288,7 +287,7 @@ public final class Fulltext {
return null;
}
}
*/
public DigestURL getURL(final byte[] urlHash) {
if (urlHash == null || this.getDefaultConnector() == null) return null;
@@ -526,36 +525,19 @@ public final class Fulltext {
return false;
}
@Deprecated
public boolean exists(final String urlHash) {
if (urlHash == null) return false;
try {
if (this.getDefaultConnector().existsById(urlHash)) return true;
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
return false;
}
/**
* Multiple-test for existing url hashes in the search index.
* All given ids are tested and a subset of the given ids are returned.
* @param ids
* @return a set of ids which exist in the database
* get the load time of a resource.
* @param urlHash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
*/
public Set<String> exists(Set<String> ids) {
HashSet<String> e = new HashSet<String>();
if (ids == null || ids.size() == 0) return e;
if (ids.size() == 1) return exists(ids.iterator().next()) ? ids : e;
Set<String> idsC = new HashSet<String>();
idsC.addAll(ids);
public long getLoadTime(final String urlHash) {
if (urlHash == null) return -1l;
try {
Set<String> e1 = this.getDefaultConnector().existsByIds(idsC);
e.addAll(e1);
} catch (final Throwable ee) {
ConcurrentLog.logException(ee);
return this.getDefaultConnector().getLoadTime(urlHash);
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
return e;
return -1l;
}
public String failReason(final String urlHash) throws IOException {

@@ -467,19 +467,13 @@ public class Segment {
}
}
@Deprecated
public boolean exists(final String urlhash) {
return this.fulltext.exists(urlhash);
}
/**
* Multiple-test for existing url hashes in the search index.
* All given ids are tested and a subset of the given ids are returned.
* @param ids
* @return a set of ids which exist in the database
* get the load time of a resource.
* @param urlHash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
*/
public Set<String> exists(final Set<String> ids) {
return this.fulltext.exists(ids);
public long getLoadTime(final String urlhash) {
return this.fulltext.getLoadTime(urlhash);
}
/**
