- extended the Solr interface with a references-by-word-count method (getQueryCount)

- reduced the danger that a non-existing RWI database causes NPEs
- added Solr queries to did-you-mean: this makes it possible for our did-you-mean algorithm to work with Solr alone, without RWIs
13 years ago · 31d4d38804
parent 528d6763fa
commit 31d4d38804
20 changed files with 171 additions and 59 deletions
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@ -489,7 +489,7 @@ public class IndexControlRWIs_p {
        }

        // insert constants
-        prop.putNum("wcount", segment.termIndex().sizesMax());
+        prop.putNum("wcount", segment.RWICount());
        prop.put("cleanup_maxReferencesRadioChecked", ReferenceContainer.maxReferences > 0 ? 1 : 0);
        prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0
            ? ReferenceContainer.maxReferences
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@ -198,7 +198,7 @@ public class IndexControlURLs_p {

        // generate list
        if (post.containsKey("urlhashsimilar")) {
-            final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax());
+            final Iterator<URIMetadata> entryIt = new RotateIterator<URIMetadata>(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
 			final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
 			URIMetadata entry;
 			int i = 0, rows = 0, cols = 0;
--- a/htroot/IndexShare_p.java
+++ b/htroot/IndexShare_p.java
@ -50,7 +50,7 @@ public class IndexShare_p {
            prop.put("wordfreq", sb.getConfigLong("defaultWordReceiveFrequency",10));
            prop.put("dtable", "");
            prop.put("rtable", "");
-            prop.putNum("wcount", indexSegment.termIndex().sizesMax());
+            prop.putNum("wcount", indexSegment.RWICount());
            prop.putNum("ucount", indexSegment.fulltext().size());
            return prop; // be save
        }
@ -63,7 +63,7 @@ public class IndexShare_p {
        }

        // insert constants
-        prop.putNum("wcount", indexSegment.termIndex().sizesMax());
+        prop.putNum("wcount", indexSegment.RWICount());
        prop.putNum("ucount", indexSegment.fulltext().size());

        // return rewrite properties
--- a/htroot/PerformanceQueues_p.java
+++ b/htroot/PerformanceQueues_p.java
@ -299,7 +299,7 @@ public class PerformanceQueues_p {
        prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());

        // table cache settings
-        prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
+        prop.putNum("wordCacheSize", indexSegment.RWIBufferCount());
        prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
        prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());
        prop.putNum("maxAgeOfCache", indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes
--- a/htroot/api/status_p.java
+++ b/htroot/api/status_p.java
@ -53,7 +53,7 @@ public class status_p {
        final int cacheMaxSize = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 10000);
        prop.putNum("ppm", Switchboard.currentPPM());
        prop.putNum("qpm", sb.peers.mySeed().getQPM());
-        prop.putNum("wordCacheSize", segment.termIndex().getBufferSize());
+        prop.putNum("wordCacheSize", segment.RWIBufferCount());
        prop.putNum("wordCacheMaxSize", cacheMaxSize);

        // crawl queues
@ -77,7 +77,7 @@ public class status_p {

        // index size
        prop.putNum("urlpublictextSize", segment.fulltext().size());
-        prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
+        prop.putNum("rwipublictextSize", segment.RWICount());

        // loader queue
        prop.putNum("loaderSize", sb.crawlQueues.workerSize());
--- a/htroot/suggest.java
+++ b/htroot/suggest.java
@ -27,9 +27,7 @@ import java.util.Iterator;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
-import net.yacy.kelondro.data.word.Word;
 import net.yacy.search.Switchboard;
-import net.yacy.search.index.Segment;
 import de.anomic.data.DidYouMean;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@ -67,15 +65,9 @@ public class suggest {
        final int timeout = (post == null) ? 300 : post.getInt("timeout", 300);
        final int count = (post == null) ? 20 : post.getInt("count", 20);

-        // get segment
-        final Segment indexSegment = sb.index;
-
        int c = 0;
-        if (more ||
-                (indexSegment != null &&
-                !indexSegment.termIndex().has(Word.word2hash(querystring))))
-        {
-            final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
+        if (more || (sb.index.getQueryCount(querystring) == 0)) {
+            final DidYouMean didYouMean = new DidYouMean(sb.index, new StringBuilder(querystring));
            final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(timeout, count).iterator();
            String suggestion;
            //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]]
--- a/htroot/yacy/query.java
+++ b/htroot/yacy/query.java
@ -103,7 +103,7 @@ public final class query {

        if (obj.equals("rwicount")) {
            // return the total number of available word indexes
-            prop.put("response", sb.index.termIndex().sizesMax());
+            prop.put("response", sb.index.RWICount());
            return prop;
        }

--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@ -123,9 +123,9 @@ public final class transferRWI {
            sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted. This peer is in robinson mode");
            result = "not_granted";
            pause = 60000;
-        } else if (sb.index.termIndex().getBufferSize() > cachelimit) {
+        } else if (sb.index.RWIBufferCount() > cachelimit) {
            // we are too busy to receive indexes
-            sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.index.termIndex().getBufferSize() + ").");
+            sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.index.RWIBufferCount() + ").");
            granted = false; // don't accept more words if there are too many words to flush
            result = "busy";
            pause = 60000;
@ -237,7 +237,7 @@ public final class transferRWI {
            }
            result = "ok";

-            pause = (int) (sb.index.termIndex().getBufferSize() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
+            pause = (int) (sb.index.RWIBufferCount() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time
        }

        prop.put("unknownURL", unknownURLs.toString());
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@ -867,8 +867,7 @@ public class yacysearch {

            prop.put("meanCount", meanMax);
            if ( meanMax > 0 && !json && !rss ) {
-                final DidYouMean didYouMean =
-                    new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring));
+                final DidYouMean didYouMean = new DidYouMean(indexSegment, new StringBuilder(querystring));
                final Iterator<StringBuilder> meanIt = didYouMean.getSuggestions(100, 5).iterator();
                int meanCount = 0;
                String suggestion;
--- a/source/de/anomic/data/DidYouMean.java
+++ b/source/de/anomic/data/DidYouMean.java
@ -12,10 +12,8 @@ import net.yacy.cora.sorting.ClusteredScoreMap;
 import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.StringBuilderComparator;
-import net.yacy.kelondro.data.word.Word;
-import net.yacy.kelondro.data.word.WordReference;
 import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.rwi.IndexCell;
+import net.yacy.search.index.Segment;


 /**
@ -62,7 +60,7 @@ public class DidYouMean {
    public  static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors();
    private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator();

-    private final IndexCell<WordReference> index;
+    private final Segment segment;
    private final StringBuilder word;
    private final int wordLen;
    private final LinkedBlockingQueue<StringBuilder> guessGen, guessLib;
@ -77,11 +75,11 @@ public class DidYouMean {
     * @param index a termIndex - most likely retrieved from a switchboard object.
     * @param sort true/false -  sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
     */
-    public DidYouMean(final IndexCell<WordReference> index, final StringBuilder word0) {
+    public DidYouMean(final Segment segment, final StringBuilder word0) {
        this.resultSet = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
        this.word = word0;
        this.wordLen = this.word.length();
-        this.index = index;
+        this.segment = segment;
        this.guessGen = new LinkedBlockingQueue<StringBuilder>();
        this.guessLib = new LinkedBlockingQueue<StringBuilder>();
        this.createGen = true;
@ -143,7 +141,7 @@ public class DidYouMean {
        final long startTime = System.currentTimeMillis();
        final long timelimit = startTime + timeout;
        if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.indexOf(this.word, ' ') > 0) {
-            return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.index);
+            return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.segment);
        }
        final SortedSet<StringBuilder> preSorted = getSuggestions(timeout);
        if (System.currentTimeMillis() > timelimit) {
@ -161,12 +159,12 @@ public class DidYouMean {
 	            if (!(scored.sizeSmaller(2 * preSortSelection))) {
 	                break;
 	            }
-	            scored.inc(s, this.index.count(Word.word2hash(s)));
+	            scored.inc(s, this.segment.getQueryCount(s));
 	        }
        } catch (ConcurrentModificationException e) {
        }
        final SortedSet<StringBuilder> countSorted = Collections.synchronizedSortedSet(new TreeSet<StringBuilder>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
-        final int wc = this.index.count(Word.word2hash(this.word)); // all counts must be greater than this
+        final int wc = this.segment.getQueryCount(this.word); // all counts must be greater than this
        while (!scored.isEmpty() && countSorted.size() < preSortSelection) {
            final StringBuilder s = scored.getMaxKey();
            final int score = scored.delete(s);
@ -198,10 +196,10 @@ public class DidYouMean {
     * @return
     */
    @SuppressWarnings("unchecked")
-    private static SortedSet<StringBuilder> getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final IndexCell<WordReference> index) {
+    private static SortedSet<StringBuilder> getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final Segment segment) {
        final SortedSet<StringBuilder>[] s = new SortedSet[words.length];
        for (int i = 0; i < words.length; i++) {
-            s[i] = new DidYouMean(index, words[i]).getSuggestions(timeout / words.length, preSortSelection);
+            s[i] = new DidYouMean(segment, words[i]).getSuggestions(timeout / words.length, preSortSelection);
        }
        // make all permutations
        final SortedSet<StringBuilder> result = new TreeSet<StringBuilder>(StringBuilderComparator.CASE_INSENSITIVE_ORDER);
@ -435,7 +433,7 @@ public class DidYouMean {
                StringBuilder s;
                try {
                    while ((s = DidYouMean.this.guessLib.take()) != POISON_STRING) {
-                        if (s.length() >= MinimumOutputWordLength && DidYouMean.this.index.has(Word.word2hash(s))) {
+                        if (s.length() >= MinimumOutputWordLength && DidYouMean.this.segment.getQueryCount(s) > 0) {
                            DidYouMean.this.resultSet.add(s);
                        }
                        if (System.currentTimeMillis() > DidYouMean.this.timeLimit) {
@ -454,8 +452,8 @@ public class DidYouMean {

        @Override
        public int compare(final StringBuilder o1, final StringBuilder o2) {
-            final int i1 = DidYouMean.this.index.count(Word.word2hash(o1));
-            final int i2 = DidYouMean.this.index.count(Word.word2hash(o2));
+            final int i1 = DidYouMean.this.segment.getQueryCount(o1);
+            final int i2 = DidYouMean.this.segment.getQueryCount(o2);
            if (i1 == i2) {
                return WORD_LENGTH_COMPARATOR.compare(o1, o2);
            }
--- a/source/net/yacy/cora/services/federated/solr/EmbeddedSolrConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/EmbeddedSolrConnector.java
@ -26,7 +26,6 @@ import java.io.IOException;

 import javax.xml.parsers.ParserConfigurationException;

-
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
 import org.apache.solr.client.solrj.response.QueryResponse;
@ -157,9 +156,10 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
        return rsp;
    }

+    @Override
    public QueryResponse query(SolrParams params) throws IOException {
        try {
-            return server.query(params);
+            return this.server.query(params);
        } catch (SolrServerException e) {
            throw new IOException(e);
        } catch (Throwable e) {
--- a/source/net/yacy/cora/services/federated/solr/MirrorSolrConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/MirrorSolrConnector.java
@ -23,6 +23,7 @@ package net.yacy.cora.services.federated.solr;
 import java.io.IOException;
 import java.util.Collection;
 import java.util.List;
+import java.util.concurrent.atomic.AtomicLong;

 import net.yacy.cora.storage.ARC;
 import net.yacy.cora.storage.ConcurrentARC;
@ -300,6 +301,39 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
        return list;
    }

+    @Override
+    public long getQueryCount(final String querystring) throws IOException {
+        if (this.solr0 == null && this.solr1 == null) return 0;
+        if (this.solr0 != null && this.solr1 == null) {
+            return this.solr0.getQueryCount(querystring);
+        }
+        if (this.solr1 != null && this.solr0 == null) {
+            return this.solr1.getQueryCount(querystring);
+        }
+        final AtomicLong count = new AtomicLong(0);
+        Thread t0 = new Thread() {
+            @Override
+            public void run() {
+                try {
+                    count.addAndGet(MirrorSolrConnector.this.solr0.getQueryCount(querystring));
+                } catch (IOException e) {}
+            }
+        };
+        t0.start();
+        Thread t1 = new Thread() {
+            @Override
+            public void run() {
+                try {
+                    count.addAndGet(MirrorSolrConnector.this.solr1.getQueryCount(querystring));
+                } catch (IOException e) {}
+            }
+        };
+        t1.start();
+        try {t0.join();} catch (InterruptedException e) {}
+        try {t1.join();} catch (InterruptedException e) {}
+        return count.get();
+    }
+
    private void addToCache(SolrDocumentList list) {
        if (MemoryControl.shortStatus()) clearCache();
        for (final SolrDocument solrdoc: list) {
--- a/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java
@ -163,6 +163,12 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr
        return this.solr.query(querystring, offset, count);
    }

+
+    /**
+     * Count the hits of a query by delegating to the wrapped connector.
+     * @param querystring the query to count hits for
+     * @return the hit count reported by the wrapped connector
+     * @throws IOException passed on from the wrapped connector
+     */
+    @Override
+    public long getQueryCount(final String querystring) throws IOException {
+        final long hits = this.solr.getQueryCount(querystring);
+        return hits;
+    }
+
    @Override
    public long getSize() {
        return this.solr.getSize();
--- a/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java
@ -183,6 +183,21 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon
        return null;
    }

+    /**
+     * Count the hits for a query, retrying the wrapped connector until it answers
+     * or until retryMaxTime milliseconds have passed.
+     * @param querystring the query to count hits for
+     * @return the hit count of the first successful attempt; 0 if no attempt was made
+     * @throws IOException the last failure if no attempt succeeded; failures that are
+     *         not IOExceptions are wrapped with their cause preserved
+     */
+    @Override
+    public long getQueryCount(final String querystring) throws IOException {
+        final long t = System.currentTimeMillis() + this.retryMaxTime;
+        Throwable ee = null;
+        while (System.currentTimeMillis() < t) {
+            try {
+                return this.solrConnector.getQueryCount(querystring);
+            } catch (final Throwable e) {
+                ee = e;
+            }
+            try {
+                Thread.sleep(10);
+            } catch (final InterruptedException e1) {
+                // the caller wants this thread to stop: restore the flag and give up retrying
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+        // keep the original throwable as cause so the stack trace is not lost
+        if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage(), ee);
+        return 0;
+    }
+
    @Override
    public long getSize() {
        final long t = System.currentTimeMillis() + this.retryMaxTime;
--- a/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java
@ -29,6 +29,7 @@ import java.net.InetAddress;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.concurrent.atomic.AtomicLong;

 import net.yacy.cora.protocol.Domains;

@ -165,15 +166,50 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon
    @Override
    public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException {
        final SolrDocumentList list = new SolrDocumentList();
+        List<Thread> t = new ArrayList<Thread>();
        for (final SolrConnector connector: this.connectors) {
+            Thread t0 = new Thread() {
+                @Override
+                public void run() {
+                    try {
                        final SolrDocumentList l = connector.query(querystring, offset, count);
                        for (final SolrDocument d: l) {
                            list.add(d);
                        }
+                    } catch (IOException e) {}
+                }
+            };
+            t0.start();
+            t.add(t0);
+        }
+        for (Thread t0: t) {
+            try {t0.join();} catch (InterruptedException e) {}
        }
        return list;
    }

+    @Override
+    public long getQueryCount(final String querystring) throws IOException {
+        final AtomicLong count = new AtomicLong(0);
+        List<Thread> t = new ArrayList<Thread>();
+        for (final SolrConnector connector: this.connectors) {
+            Thread t0 = new Thread() {
+                @Override
+                public void run() {
+                    try {
+                        count.addAndGet(connector.getQueryCount(querystring));
+                    } catch (IOException e) {}
+                }
+            };
+            t0.start();
+            t.add(t0);
+        }
+        for (Thread t0: t) {
+            try {t0.join();} catch (InterruptedException e) {}
+        }
+        return count.get();
+    }
+
    public long[] getSizeList() {
        final long[] size = new long[this.connectors.size()];
        int i = 0;
--- a/source/net/yacy/cora/services/federated/solr/SolrConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrConnector.java
@ -113,6 +113,14 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
     */
    public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException, SolrException;

+    /**
+     * Get the number of results that this query would produce.
+     * This should only be called if the actual result is never used and only the count is of interest.
+     * @param querystring the query whose results shall be counted
+     * @return the number of results for this query
+     * @throws IOException declared for implementations that cannot execute the query
+     */
+    public long getQueryCount(final String querystring) throws IOException;
+
    /**
     * Get a query result from solr as a stream of documents.
     * The result queue is considered as terminated if AbstractSolrConnectro.POISON_DOCUMENT is returned.
--- a/source/net/yacy/cora/services/federated/solr/SolrServerConnector.java
+++ b/source/net/yacy/cora/services/federated/solr/SolrServerConnector.java
@ -208,6 +208,20 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen
        return docs;
    }

+    /**
+     * Count the hits for a query by asking the server for the total number of matches.
+     * @param querystring the query to count hits for
+     * @return the number of matching documents as reported by the server; 0 if the
+     *         response carries no result list
+     * @throws IOException if the query could not be executed
+     */
+    @Override
+    public long getQueryCount(String querystring) throws IOException {
+        // construct query; only the total hit count is read, so no document rows are requested
+        final SolrQuery params = new SolrQuery();
+        params.setQuery(querystring);
+        params.setRows(0);
+        params.setStart(0);
+        // query the server
+        final QueryResponse rsp = query(params);
+        final SolrDocumentList docs = rsp == null ? null : rsp.getResults();
+        return docs == null ? 0 : docs.getNumFound();
+    }
+
    abstract public QueryResponse query(SolrParams params) throws IOException;

    private final char[] queryIDTemplate = "id:\"            \"".toCharArray();
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -2971,9 +2971,9 @@ public final class Switchboard extends serverSwitch
        if ( size < 10 ) {
            return "no DHT distribution: loadedURL.size() = " + size;
        }
-        if ( indexSegment.termIndex().sizesMax() < 100 ) {
+        if ( indexSegment.RWICount() < 100 ) {
            return "no DHT distribution: not enough words - wordIndex.size() = "
-                + indexSegment.termIndex().sizesMax();
+                + indexSegment.RWICount();
        }
        if ( (getConfig(SwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false")
            .equalsIgnoreCase("false")) && (this.crawlQueues.noticeURL.notEmptyLocal()) ) {
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@ -191,6 +191,18 @@ public class Segment {
        return this.termIndex.getBufferSize();
    }

+    /**
+     * Count the references for a word in the RWI term index (if one exists) plus the
+     * number of Solr documents matching the word in the text_t field.
+     * @param word the word to look up
+     * @return the combined count, capped at Integer.MAX_VALUE
+     */
+    public int getQueryCount(String word) {
+        final int rwiCount = this.termIndex == null ? 0 : this.termIndex.count(Word.word2hash(word));
+        return addSolrCount(rwiCount, word);
+    }
+
+    /**
+     * Same as {@link #getQueryCount(String)} for a word given as StringBuilder.
+     * @param word the word to look up
+     * @return the combined count, capped at Integer.MAX_VALUE
+     */
+    public int getQueryCount(StringBuilder word) {
+        final int rwiCount = this.termIndex == null ? 0 : this.termIndex.count(Word.word2hash(word));
+        return addSolrCount(rwiCount, word.toString());
+    }
+
+    /** Adds the Solr hit count for the word to an RWI count without overflowing the int range. */
+    private int addSolrCount(final int rwiCount, final String word) {
+        // sum in long: the Solr count is a long and a plain int += would silently wrap around
+        long total = rwiCount;
+        try {
+            total += this.fulltext.getSolr().getQueryCount(YaCySchema.text_t.name() + ':' + word);
+        } catch (IOException e) {
+            // best effort: if Solr cannot be asked, the RWI count alone is returned
+        }
+        return (int) Math.min(Integer.MAX_VALUE, total);
+    }
+
    public boolean exists(final byte[] urlhash) {
        return this.fulltext.exists(urlhash);
    }
--- a/source/net/yacy/search/query/RWIProcess.java
+++ b/source/net/yacy/search/query/RWIProcess.java
@ -990,7 +990,6 @@ public final class RWIProcess extends Thread
        final Map<String, Float> counts = new HashMap<String, Float>();
        final Iterator<String> i = this.ref.keys(false);
        String word;
-        byte[] termHash;
        int c;
        float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE;
        int ic = count;
@ -999,8 +998,7 @@ public final class RWIProcess extends Thread
            if ( word == null ) {
                continue;
            }
-            termHash = Word.word2hash(word);
-            c = this.query.getSegment().termIndex().count(termHash);
+            c = this.query.getSegment().getQueryCount(word);
            if ( c > 0 ) {
                q = ((float) this.ref.get(word)) / ((float) c);
                min = Math.min(min, q);