- extended the Solr interface with a references-by-word-count method

- reduced danger that a non-existing RWI database causes NPEs
- added Solr queries to did-you-mean: this allows our did-you-mean
algorithm to work with Solr alone, without RWIs
pull/1/head
Michael Peter Christen 13 years ago
parent 528d6763fa
commit 31d4d38804

@ -489,7 +489,7 @@ public class IndexControlRWIs_p {
}
// insert constants
prop.putNum("wcount", segment.termIndex().sizesMax());
prop.putNum("wcount", segment.RWICount());
prop.put("cleanup_maxReferencesRadioChecked", ReferenceContainer.maxReferences > 0 ? 1 : 0);
prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0
? ReferenceContainer.maxReferences

@ -198,7 +198,7 @@ public class IndexControlURLs_p {
// generate list
if (post.containsKey("urlhashsimilar")) {
final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
URIMetadata entry;
int i = 0, rows = 0, cols = 0;

@ -50,7 +50,7 @@ public class IndexShare_p {
prop.put("wordfreq", sb.getConfigLong("defaultWordReceiveFrequency",10));
prop.put("dtable", "");
prop.put("rtable", "");
prop.putNum("wcount", indexSegment.termIndex().sizesMax());
prop.putNum("wcount", indexSegment.RWICount());
prop.putNum("ucount", indexSegment.fulltext().size());
return prop; // be save
}
@ -63,7 +63,7 @@ public class IndexShare_p {
}
// insert constants
prop.putNum("wcount", indexSegment.termIndex().sizesMax());
prop.putNum("wcount", indexSegment.RWICount());
prop.putNum("ucount", indexSegment.fulltext().size());
// return rewrite properties

@ -299,7 +299,7 @@ public class PerformanceQueues_p {
prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
// table cache settings
prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
prop.putNum("wordCacheSize", indexSegment.RWIBufferCount());
prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());
prop.putNum("maxAgeOfCache", indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes

@ -53,7 +53,7 @@ public class status_p {
final int cacheMaxSize = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 10000);
prop.putNum("ppm", Switchboard.currentPPM());
prop.putNum("qpm", sb.peers.mySeed().getQPM());
prop.putNum("wordCacheSize", segment.termIndex().getBufferSize());
prop.putNum("wordCacheSize", segment.RWIBufferCount());
prop.putNum("wordCacheMaxSize", cacheMaxSize);
// crawl queues
@ -77,7 +77,7 @@ public class status_p {
// index size
prop.putNum("urlpublictextSize", segment.fulltext().size());
prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
prop.putNum("rwipublictextSize", segment.RWICount());
// loader queue
prop.putNum("loaderSize", sb.crawlQueues.workerSize());

@ -27,9 +27,7 @@ import java.util.Iterator;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import de.anomic.data.DidYouMean;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -67,15 +65,9 @@ public class suggest {
final int timeout = (post == null) ? 300 : post.getInt("timeout", 300);
final int count = (post == null) ? 20 : post.getInt("count", 20);
// get segment
final Segment indexSegment = sb.index;
int c = 0;
if (more ||
(indexSegment != null &&
!indexSegment.termIndex().has(Word.word2hash(querystring))))
{
final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
if (more || (sb.index.getQueryCount(querystring) == 0)) {
final DidYouMean didYouMean = new DidYouMean(sb.index, new StringBuilder(querystring));
final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(timeout, count).iterator();
String suggestion;
//[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]]

@ -103,7 +103,7 @@ public final class query {
if (obj.equals("rwicount")) {
// return the total number of available word indexes
prop.put("response", sb.index.termIndex().sizesMax());
prop.put("response", sb.index.RWICount());
return prop;
}

@ -123,9 +123,9 @@ public final class transferRWI {
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted. This peer is in robinson mode");
result = "not_granted";
pause = 60000;
} else if (sb.index.termIndex().getBufferSize() > cachelimit) {
} else if (sb.index.RWIBufferCount() > cachelimit) {
// we are too busy to receive indexes
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.index.termIndex().getBufferSize() + ").");
sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.index.RWIBufferCount() + ").");
granted = false; // don't accept more words if there are too many words to flush
result = "busy";
pause = 60000;
@ -237,7 +237,7 @@ public final class transferRWI {
}
result = "ok";
pause = (int) (sb.index.termIndex().getBufferSize() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
pause = (int) (sb.index.RWIBufferCount() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
}
prop.put("unknownURL", unknownURLs.toString());

@ -867,8 +867,7 @@ public class yacysearch {
prop.put("meanCount", meanMax);
if ( meanMax > 0 && !json && !rss ) {
final DidYouMean didYouMean =
new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
final DidYouMean didYouMean = new DidYouMean(indexSegment, new StringBuilder(querystring));
final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(100, 5).iterator();
int meanCount = 0;
String suggestion;

@ -12,10 +12,8 @@ import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.document.LibraryProvider;
import net.yacy.document.StringBuilderComparator;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.search.index.Segment;
/**
@ -62,7 +60,7 @@ public class DidYouMean {
public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();
private final IndexCell<WordReference> index;
private final Segment segment;
private final StringBuilder word;
private final int wordLen;
private final LinkedBlockingQueue<StringBuilder> guessGen, guessLib;
@ -77,11 +75,11 @@ public class DidYouMean {
* @param index a termIndex - most likely retrieved from a switchboard object.
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/
public DidYouMean(final IndexCell<WordReference> index, final StringBuilder word0) {
public DidYouMean(final Segment segment, final StringBuilder word0) {
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
this.word = word0;
this.wordLen = this.word.length();
this.index = index;
this.segment = segment;
this.guessGen = new LinkedBlockingQueue<StringBuilder>();
this.guessLib = new LinkedBlockingQueue<StringBuilder>();
this.createGen = true;
@ -143,7 +141,7 @@ public class DidYouMean {
final long startTime = System.currentTimeMillis();
final long timelimit = startTime + timeout;
if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.indexOf(this.word, ' ') > 0) {
return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.index);
return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.segment);
}
final SortedSet<StringBuilder> preSorted = getSuggestions(timeout);
if (System.currentTimeMillis() > timelimit) {
@ -161,12 +159,12 @@ public class DidYouMean {
if (!(scored.sizeSmaller(2 * preSortSelection))) {
break;
}
scored.inc(s, this.index.count(Word.word2hash(s)));
scored.inc(s, this.segment.getQueryCount(s));
}
} catch (ConcurrentModificationException e) {
}
final SortedSet<StringBuilder> countSorted = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
final int wc = this.index.count(Word.word2hash(this.word)); // all counts must be greater than this
final int wc = this.segment.getQueryCount(this.word); // all counts must be greater than this
while (!scored.isEmpty() && countSorted.size() < preSortSelection) {
final StringBuilder s = scored.getMaxKey();
final int score = scored.delete(s);
@ -198,10 +196,10 @@ public class DidYouMean {
* @return
*/
@SuppressWarnings("unchecked")
private static SortedSet<StringBuilder> getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final IndexCell<WordReference> index) {
private static SortedSet<StringBuilder> getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final Segment segment) {
final SortedSet<StringBuilder>[] s = new SortedSet[words.length];
for (int i = 0; i < words.length; i++) {
s[i] = new DidYouMean(index, words[i]).getSuggestions(timeout / words.length, preSortSelection);
s[i] = new DidYouMean(segment, words[i]).getSuggestions(timeout / words.length, preSortSelection);
}
// make all permutations
final SortedSet<StringBuilder> result = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
@ -435,7 +433,7 @@ public class DidYouMean {
StringBuilder s;
try {
while ((s = DidYouMean.this.guessLib.take()) != POISON_STRING) {
if (s.length() >= MinimumOutputWordLength && DidYouMean.this.index.has(Word.word2hash(s))) {
if (s.length() >= MinimumOutputWordLength && DidYouMean.this.segment.getQueryCount(s) > 0) {
DidYouMean.this.resultSet.add(s);
}
if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
@ -454,8 +452,8 @@ public class DidYouMean {
@Override
public int compare(final StringBuilder o1, final StringBuilder o2) {
final int i1 = DidYouMean.this.index.count(Word.word2hash(o1));
final int i2 = DidYouMean.this.index.count(Word.word2hash(o2));
final int i1 = DidYouMean.this.segment.getQueryCount(o1);
final int i2 = DidYouMean.this.segment.getQueryCount(o2);
if (i1 == i2) {
return WORD_LENGTH_COMPARATOR.compare(o1, o2);
}

@ -26,7 +26,6 @@ import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
@ -157,9 +156,10 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
return rsp;
}
@Override
public QueryResponse query(SolrParams params) throws IOException {
try {
return server.query(params);
return this.server.query(params);
} catch (SolrServerException e) {
throw new IOException(e);
} catch (Throwable e) {

@ -23,6 +23,7 @@ package net.yacy.cora.services.federated.solr;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
@ -300,6 +301,39 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
return list;
}
/**
 * Count the documents matching a query across the two mirrored Solr backends.
 * <p>
 * If neither backend is set, 0 is returned. If exactly one backend is set, the call is
 * delegated to it and its {@link IOException} is passed on to the caller. If both are set,
 * each backend is queried in its own thread and the two counts are added together; in that
 * case a backend that fails with an {@link IOException} contributes nothing to the sum and
 * the failure is not reported (best effort).
 *
 * @param querystring the query, handed unchanged to each backend
 * @return 0, the count of the only backend, or the sum of both backend counts
 * @throws IOException if the only configured backend fails
 */
@Override
public long getQueryCount(final String querystring) throws IOException {
    if (this.solr0 == null && this.solr1 == null) return 0;
    if (this.solr0 != null && this.solr1 == null) {
        return this.solr0.getQueryCount(querystring);
    }
    if (this.solr1 != null && this.solr0 == null) {
        return this.solr1.getQueryCount(querystring);
    }
    final AtomicLong count = new AtomicLong(0);
    final Thread t0 = new Thread() {
        @Override
        public void run() {
            try {
                count.addAndGet(MirrorSolrConnector.this.solr0.getQueryCount(querystring));
            } catch (IOException e) {
                // best effort: a failing backend simply adds nothing to the sum
            }
        }
    };
    t0.start();
    final Thread t1 = new Thread() {
        @Override
        public void run() {
            try {
                count.addAndGet(MirrorSolrConnector.this.solr1.getQueryCount(querystring));
            } catch (IOException e) {
                // best effort: a failing backend simply adds nothing to the sum
            }
        }
    };
    t1.start();
    // Remember an interruption instead of discarding it. The flag is restored only after
    // both joins were attempted, so the second join is not aborted by the first interrupt.
    boolean interrupted = false;
    try {t0.join();} catch (InterruptedException e) {interrupted = true;}
    try {t1.join();} catch (InterruptedException e) {interrupted = true;}
    if (interrupted) Thread.currentThread().interrupt();
    return count.get();
}
private void addToCache(SolrDocumentList list) {
if (MemoryControl.shortStatus()) clearCache();
for (final SolrDocument solrdoc: list) {

@ -163,6 +163,12 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr
return this.solr.query(querystring, offset, count);
}
/**
 * Report how many documents match the given query by asking the underlying
 * {@code solr} connector of this instance.
 *
 * @param querystring the query, passed through unchanged
 * @return the count reported by the underlying connector
 * @throws IOException if the underlying connector throws it
 */
@Override
public long getQueryCount(final String querystring) throws IOException {
    final long hits = this.solr.getQueryCount(querystring);
    return hits;
}
@Override
public long getSize() {
return this.solr.getSize();

@ -183,6 +183,21 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon
return null;
}
/**
 * Count the documents matching a query, retrying on failure until the retry window
 * ({@code retryMaxTime} milliseconds from the moment of the call) has elapsed.
 * <p>
 * Between two attempts the thread sleeps for 10 ms. If the calling thread is interrupted
 * during that pause, the interrupt status is restored and retrying stops immediately.
 * If the retry window is already over before the first attempt (non-positive
 * {@code retryMaxTime}), no query is made and 0 is returned.
 *
 * @param querystring the query, handed unchanged to the wrapped connector
 * @return the count of the first successful attempt, or 0 if no attempt was made
 * @throws IOException the last failure; a non-IOException failure is wrapped and kept as the cause
 */
@Override
public long getQueryCount(final String querystring) throws IOException {
    final long t = System.currentTimeMillis() + this.retryMaxTime;
    Throwable ee = null;
    while (System.currentTimeMillis() < t) {
        try {
            return this.solrConnector.getQueryCount(querystring);
        } catch (final Throwable e) {
            // remember the most recent failure; it is reported if no attempt succeeds
            ee = e;
        }
        try {
            Thread.sleep(10);
        } catch (final InterruptedException e1) {
            // Do not swallow the interrupt: with the flag cleared the caller could never
            // notice it, and with the flag set every further sleep would fail at once.
            Thread.currentThread().interrupt();
            break;
        }
    }
    if (ee != null) {
        if (ee instanceof IOException) throw (IOException) ee;
        // keep the original throwable as cause so its type and stack trace are not lost
        throw new IOException(ee.getMessage(), ee);
    }
    return 0;
}
@Override
public long getSize() {
final long t = System.currentTimeMillis() + this.retryMaxTime;

@ -29,6 +29,7 @@ import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.protocol.Domains;
@ -165,15 +166,50 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon
/**
 * Run the same query against every shard connector in parallel and merge all
 * returned documents into one list.
 * <p>
 * One thread is started per connector. A connector that fails with an
 * {@link IOException} contributes no documents; the failure is not reported (best effort).
 * The order in which the shards' documents appear in the merged list depends on thread
 * timing. Only the documents are merged; no other attribute of the per-shard result
 * lists is copied into the returned list.
 *
 * @param querystring the query, handed unchanged to every connector
 * @param offset the result offset, handed unchanged to every connector
 * @param count the maximum number of results requested from each connector
 * @return the merged documents of all connectors that answered
 * @throws IOException declared by the interface; not thrown by the code in this method itself
 */
@Override
public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
    final SolrDocumentList list = new SolrDocumentList();
    final List<Thread> workers = new ArrayList<Thread>();
    for (final SolrConnector connector: this.connectors) {
        final Thread worker = new Thread() {
            @Override
            public void run() {
                try {
                    final SolrDocumentList l = connector.query(querystring, offset, count);
                    // The result list is shared by all workers and is not safe for
                    // concurrent modification, so the append must be serialized.
                    synchronized (list) {
                        for (final SolrDocument d: l) {
                            list.add(d);
                        }
                    }
                } catch (IOException e) {
                    // best effort: a failing shard simply contributes no documents
                }
            }
        };
        worker.start();
        workers.add(worker);
    }
    // Remember an interruption instead of discarding it; the flag is restored after all
    // joins were attempted so that later joins are not aborted by an earlier interrupt.
    boolean interrupted = false;
    for (final Thread worker: workers) {
        try {worker.join();} catch (InterruptedException e) {interrupted = true;}
    }
    if (interrupted) Thread.currentThread().interrupt();
    return list;
}
/**
 * Count the documents matching a query by asking every shard connector in parallel
 * and adding up the individual counts.
 * <p>
 * One thread is started per connector. A connector that fails with an
 * {@link IOException} adds nothing to the sum; the failure is not reported (best effort).
 *
 * @param querystring the query, handed unchanged to every connector
 * @return the sum of the counts of all connectors that answered
 * @throws IOException declared by the interface; not thrown by the code in this method itself
 */
@Override
public long getQueryCount(final String querystring) throws IOException {
    final AtomicLong count = new AtomicLong(0);
    final List<Thread> workers = new ArrayList<Thread>();
    for (final SolrConnector connector: this.connectors) {
        final Thread worker = new Thread() {
            @Override
            public void run() {
                try {
                    count.addAndGet(connector.getQueryCount(querystring));
                } catch (IOException e) {
                    // best effort: a failing shard simply adds nothing to the sum
                }
            }
        };
        worker.start();
        workers.add(worker);
    }
    // Remember an interruption instead of discarding it; the flag is restored after all
    // joins were attempted so that later joins are not aborted by an earlier interrupt.
    boolean interrupted = false;
    for (final Thread worker: workers) {
        try {worker.join();} catch (InterruptedException e) {interrupted = true;}
    }
    if (interrupted) Thread.currentThread().interrupt();
    return count.get();
}
public long[] getSizeList() {
final long[] size = new long[this.connectors.size()];
int i = 0;

@ -113,6 +113,14 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
*/
public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException, SolrException;
/**
 * Get the number of results that the given query would produce.
 * This should only be called if the actual result is never used and only the count is of interest.
 * @param querystring the query whose matching documents are counted
 * @return the number of results for this query
 * @throws IOException declared so that implementations can report a failure to obtain the count
 */
public long getQueryCount(final String querystring) throws IOException;
/**
* Get a query result from solr as a stream of documents.
* The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.

@ -208,6 +208,20 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
return docs;
}
/**
 * Count the documents matching a query without retrieving any of them.
 * <p>
 * The query is sent with {@code rows=0}: only the total hit number of the response is
 * read here, so asking the server for a document would just cause it to be loaded and
 * transferred to be thrown away.
 *
 * @param querystring the query, set unchanged as the query of the Solr request
 * @return the total number of matching documents as reported by the response
 * @throws IOException if the underlying {@link #query(SolrParams)} call throws it
 */
@Override
public long getQueryCount(String querystring) throws IOException {
    // construct a count-only query: no rows are requested
    final SolrQuery params = new SolrQuery();
    params.setQuery(querystring);
    params.setRows(0);
    params.setStart(0);

    // query the server and read only the total hit number
    final QueryResponse rsp = query(params);
    final SolrDocumentList docs = rsp.getResults();
    return docs.getNumFound();
}
abstract public QueryResponse query(SolrParams params) throws IOException;
private final char[] queryIDTemplate = "id:\" \"".toCharArray();

@ -2971,9 +2971,9 @@ public final class Switchboard extends serverSwitch
if ( size < 10 ) {
return "no DHT distribution: loadedURL.size() = " + size;
}
if ( indexSegment.termIndex().sizesMax() < 100 ) {
if ( indexSegment.RWICount() < 100 ) {
return "no DHT distribution: not enough words - wordIndex.size() = "
+ indexSegment.termIndex().sizesMax();
+ indexSegment.RWICount();
}
if ( (getConfig(SwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false")
.equalsIgnoreCase("false")) && (this.crawlQueues.noticeURL.notEmptyLocal()) ) {

@ -191,6 +191,18 @@ public class Segment {
return this.termIndex.getBufferSize();
}
/**
 * Count the references for a word in the term index (if one is present) plus the number
 * of Solr documents matching the word in the {@code text_t} field.
 * <p>
 * A missing term index counts as 0. A Solr failure reported as {@link IOException} is
 * ignored, so in that case only the term index count is returned (best effort).
 * The sum is computed as {@code long} and capped at {@link Integer#MAX_VALUE}; adding the
 * Solr count directly to an {@code int} would silently narrow it and could wrap around.
 * NOTE(review): the word is concatenated into the Solr query without escaping - confirm
 * that callers never pass characters with a special meaning in the query syntax.
 *
 * @param word the word to look up
 * @return the combined count, at most {@link Integer#MAX_VALUE}
 */
public int getQueryCount(String word) {
    long count = this.termIndex == null ? 0 : this.termIndex.count(Word.word2hash(word));
    try {
        count += this.fulltext.getSolr().getQueryCount(YaCySchema.text_t.name() + ':' + word);
    } catch (IOException e) {
        // best effort: without an answer from Solr only the term index count is used
    }
    return count > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) count;
}
/**
 * Count the references for a word in the term index (if one is present) plus the number
 * of Solr documents matching the word in the {@code text_t} field.
 * <p>
 * A missing term index counts as 0. A Solr failure reported as {@link IOException} is
 * ignored, so in that case only the term index count is returned (best effort).
 * The sum is computed as {@code long} and capped at {@link Integer#MAX_VALUE}; adding the
 * Solr count directly to an {@code int} would silently narrow it and could wrap around.
 * NOTE(review): the word is concatenated into the Solr query without escaping - confirm
 * that callers never pass characters with a special meaning in the query syntax.
 *
 * @param word the word to look up
 * @return the combined count, at most {@link Integer#MAX_VALUE}
 */
public int getQueryCount(StringBuilder word) {
    long count = this.termIndex == null ? 0 : this.termIndex.count(Word.word2hash(word));
    try {
        count += this.fulltext.getSolr().getQueryCount(YaCySchema.text_t.name() + ':' + word.toString());
    } catch (IOException e) {
        // best effort: without an answer from Solr only the term index count is used
    }
    return count > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) count;
}
public boolean exists(final byte[] urlhash) {
return this.fulltext.exists(urlhash);
}

@ -990,7 +990,6 @@ public final class RWIProcess extends Thread
final Map<String, Float> counts = new HashMap<String, Float>();
final Iterator<String> i = this.ref.keys(false);
String word;
byte[] termHash;
int c;
float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE;
int ic = count;
@ -999,8 +998,7 @@ public final class RWIProcess extends Thread
if ( word == null ) {
continue;
}
termHash = Word.word2hash(word);
c = this.query.getSegment().termIndex().count(termHash);
c = this.query.getSegment().getQueryCount(word);
if ( c > 0 ) {
q = ((float) this.ref.get(word)) / ((float) c);
min = Math.min(min, q);

Loading…
Cancel
Save