From 31d4d38804942fa16efec7cb5ccf150d5e4c237e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 31 Aug 2012 13:03:00 +0200 Subject: [PATCH] - extended the solr interface by a references-by-word-count method - reduced danger that a non-existing RWI database causes NPEs - added Solr queries to did-you-mean: this makes it possible that our did-you-mean algorithm works together with only Solr and without RWIs --- htroot/IndexControlRWIs_p.java | 2 +- htroot/IndexControlURLs_p.java | 2 +- htroot/IndexShare_p.java | 4 +- htroot/PerformanceQueues_p.java | 2 +- htroot/api/status_p.java | 4 +- htroot/suggest.java | 12 +---- htroot/yacy/query.java | 2 +- htroot/yacy/transferRWI.java | 6 +-- htroot/yacysearch.java | 3 +- source/de/anomic/data/DidYouMean.java | 26 +++++------ .../federated/solr/EmbeddedSolrConnector.java | 8 ++-- .../federated/solr/MirrorSolrConnector.java | 34 ++++++++++++++ .../federated/solr/MultipleSolrConnector.java | 6 +++ .../federated/solr/RetrySolrConnector.java | 15 +++++++ .../federated/solr/ShardSolrConnector.java | 44 +++++++++++++++++-- .../federated/solr/SolrConnector.java | 10 ++++- .../federated/solr/SolrServerConnector.java | 18 +++++++- source/net/yacy/search/Switchboard.java | 10 ++--- source/net/yacy/search/index/Segment.java | 14 +++++- source/net/yacy/search/query/RWIProcess.java | 8 ++-- 20 files changed, 171 insertions(+), 59 deletions(-) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index f80095c86..d8a20f199 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -489,7 +489,7 @@ public class IndexControlRWIs_p { } // insert constants - prop.putNum("wcount", segment.termIndex().sizesMax()); + prop.putNum("wcount", segment.RWICount()); prop.put("cleanup_maxReferencesRadioChecked", ReferenceContainer.maxReferences > 0 ? 1 : 0); prop.put("cleanup_maxReferences", ReferenceContainer.maxReferences > 0 ? ReferenceContainer.maxReferences diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 88469a9b7..49cbd91ba 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -198,7 +198,7 @@ public class IndexControlURLs_p { // generate list if (post.containsKey("urlhashsimilar")) { - final Iterator entryIt = new RotateIterator(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), segment.termIndex().sizesMax()); + final Iterator entryIt = new RotateIterator(segment.fulltext().entries(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount()); final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:
"); URIMetadata entry; int i = 0, rows = 0, cols = 0; diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java index d8672803c..7568597f5 100644 --- a/htroot/IndexShare_p.java +++ b/htroot/IndexShare_p.java @@ -50,7 +50,7 @@ public class IndexShare_p { prop.put("wordfreq", sb.getConfigLong("defaultWordReceiveFrequency",10)); prop.put("dtable", ""); prop.put("rtable", ""); - prop.putNum("wcount", indexSegment.termIndex().sizesMax()); + prop.putNum("wcount", indexSegment.RWICount()); prop.putNum("ucount", indexSegment.fulltext().size()); return prop; // be save } @@ -63,7 +63,7 @@ public class IndexShare_p { } // insert constants - prop.putNum("wcount", indexSegment.termIndex().sizesMax()); + prop.putNum("wcount", indexSegment.RWICount()); prop.putNum("ucount", indexSegment.fulltext().size()); // return rewrite properties diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java index d9f50d0c3..bf8ec2b07 100644 --- a/htroot/PerformanceQueues_p.java +++ b/htroot/PerformanceQueues_p.java @@ -299,7 +299,7 @@ public class PerformanceQueues_p { prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta()); // table cache settings - prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize()); + prop.putNum("wordCacheSize", indexSegment.RWIBufferCount()); prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024); prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences()); prop.putNum("maxAgeOfCache", indexSegment.termIndex().getBufferMaxAge() / 1000 / 60); // minutes diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index faa705e28..443ad76ae 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -53,7 +53,7 @@ public class status_p { final int cacheMaxSize = (int) sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 10000); prop.putNum("ppm", Switchboard.currentPPM()); prop.putNum("qpm", sb.peers.mySeed().getQPM()); - prop.putNum("wordCacheSize", segment.termIndex().getBufferSize()); + prop.putNum("wordCacheSize", segment.RWIBufferCount()); prop.putNum("wordCacheMaxSize", cacheMaxSize); // crawl queues @@ -77,7 +77,7 @@ public class status_p { // index size prop.putNum("urlpublictextSize", segment.fulltext().size()); - prop.putNum("rwipublictextSize", segment.termIndex().sizesMax()); + prop.putNum("rwipublictextSize", segment.RWICount()); // loader queue prop.putNum("loaderSize", sb.crawlQueues.workerSize()); diff --git a/htroot/suggest.java b/htroot/suggest.java index 84c1af6aa..5401d36f3 100644 --- a/htroot/suggest.java +++ b/htroot/suggest.java @@ -27,9 +27,7 @@ import java.util.Iterator; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.kelondro.data.word.Word; import net.yacy.search.Switchboard; -import net.yacy.search.index.Segment; import de.anomic.data.DidYouMean; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -67,15 +65,9 @@ public class suggest { final int timeout = (post == null) ? 300 : post.getInt("timeout", 300); final int count = (post == null) ? 20 : post.getInt("count", 20); - // get segment - final Segment indexSegment = sb.index; - int c = 0; - if (more || - (indexSegment != null && - !indexSegment.termIndex().has(Word.word2hash(querystring)))) - { - final DidYouMean didYouMean = new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring)); + if (more || (sb.index.getQueryCount(querystring) == 0)) { + final DidYouMean didYouMean = new DidYouMean(sb.index, new StringBuilder(querystring)); final Iterator meanIt = didYouMean.getSuggestions(timeout, count).iterator(); String suggestion; //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]] diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index 462839535..9fc18bab6 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -103,7 +103,7 @@ public final class query { if (obj.equals("rwicount")) { // return the total number of available word indexes - prop.put("response", sb.index.termIndex().sizesMax()); + prop.put("response", sb.index.RWICount()); return prop; } diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 33b52978a..9c868b087 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -123,9 +123,9 @@ public final class transferRWI { sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". Not granted. This peer is in robinson mode"); result = "not_granted"; pause = 60000; - } else if (sb.index.termIndex().getBufferSize() > cachelimit) { + } else if (sb.index.RWIBufferCount() > cachelimit) { // we are too busy to receive indexes - sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.index.termIndex().getBufferSize() + ")."); + sb.getLog().logInfo("Rejecting RWIs from peer " + otherPeerName + ". We are too busy (buffersize=" + sb.index.RWIBufferCount() + ")."); granted = false; // don't accept more words if there are too many words to flush result = "busy"; pause = 60000; @@ -237,7 +237,7 @@ public final class transferRWI { } result = "ok"; - pause = (int) (sb.index.termIndex().getBufferSize() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time + pause = (int) (sb.index.RWIBufferCount() * 20000 / sb.getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 100000)); // estimation of necessary pause time } prop.put("unknownURL", unknownURLs.toString()); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 0681135c9..2cb580937 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -867,8 +867,7 @@ public class yacysearch { prop.put("meanCount", meanMax); if ( meanMax > 0 && !json && !rss ) { - final DidYouMean didYouMean = - new DidYouMean(indexSegment.termIndex(), new StringBuilder(querystring)); + final DidYouMean didYouMean = new DidYouMean(indexSegment, new StringBuilder(querystring)); final Iterator meanIt = didYouMean.getSuggestions(100, 5).iterator(); int meanCount = 0; String suggestion; diff --git a/source/de/anomic/data/DidYouMean.java b/source/de/anomic/data/DidYouMean.java index d99149a92..fa8d96782 100644 --- a/source/de/anomic/data/DidYouMean.java +++ b/source/de/anomic/data/DidYouMean.java @@ -12,10 +12,8 @@ import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.document.LibraryProvider; import net.yacy.document.StringBuilderComparator; -import net.yacy.kelondro.data.word.Word; -import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.rwi.IndexCell; +import net.yacy.search.index.Segment; /** @@ -62,7 +60,7 @@ public class DidYouMean { public static final int AVAILABLE_CPU = Runtime.getRuntime().availableProcessors(); private static final wordLengthComparator WORD_LENGTH_COMPARATOR = new wordLengthComparator(); - private final IndexCell index; + private final Segment segment; private final StringBuilder word; private final int wordLen; private final LinkedBlockingQueue guessGen, guessLib; @@ -77,11 +75,11 @@ public class DidYouMean { * @param index a termIndex - most likely retrieved from a switchboard object. * @param sort true/false - sorts the resulting TreeSet by index.count(); Warning: this causes heavy i/o. */ - public DidYouMean(final IndexCell index, final StringBuilder word0) { + public DidYouMean(final Segment segment, final StringBuilder word0) { this.resultSet = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR))); this.word = word0; this.wordLen = this.word.length(); - this.index = index; + this.segment = segment; this.guessGen = new LinkedBlockingQueue(); this.guessLib = new LinkedBlockingQueue(); this.createGen = true; @@ -143,7 +141,7 @@ public class DidYouMean { final long startTime = System.currentTimeMillis(); final long timelimit = startTime + timeout; if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.indexOf(this.word, ' ') > 0) { - return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.index); + return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.segment); } final SortedSet preSorted = getSuggestions(timeout); if (System.currentTimeMillis() > timelimit) { @@ -161,12 +159,12 @@ public class DidYouMean { if (!(scored.sizeSmaller(2 * preSortSelection))) { break; } - scored.inc(s, this.index.count(Word.word2hash(s))); + scored.inc(s, this.segment.getQueryCount(s)); } } catch (ConcurrentModificationException e) { } final SortedSet countSorted = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR))); - final int wc = this.index.count(Word.word2hash(this.word)); // all counts must be greater than this + final int wc = this.segment.getQueryCount(this.word); // all counts must be greater than this while (!scored.isEmpty() && countSorted.size() < preSortSelection) { final StringBuilder s = scored.getMaxKey(); final int score = scored.delete(s); @@ -198,10 +196,10 @@ public class DidYouMean { * @return */ @SuppressWarnings("unchecked") - private static SortedSet getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final IndexCell index) { + private static SortedSet getSuggestions(final StringBuilder[] words, final long timeout, final int preSortSelection, final Segment segment) { final SortedSet[] s = new SortedSet[words.length]; for (int i = 0; i < words.length; i++) { - s[i] = new DidYouMean(index, words[i]).getSuggestions(timeout / words.length, preSortSelection); + s[i] = new DidYouMean(segment, words[i]).getSuggestions(timeout / words.length, preSortSelection); } // make all permutations final SortedSet result = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); @@ -435,7 +433,7 @@ public class DidYouMean { StringBuilder s; try { while ((s = DidYouMean.this.guessLib.take()) != POISON_STRING) { - if (s.length() >= MinimumOutputWordLength && DidYouMean.this.index.has(Word.word2hash(s))) { + if (s.length() >= MinimumOutputWordLength && DidYouMean.this.segment.getQueryCount(s) > 0) { DidYouMean.this.resultSet.add(s); } if (System.currentTimeMillis() > DidYouMean.this.timeLimit) { @@ -454,8 +452,8 @@ public class DidYouMean { @Override public int compare(final StringBuilder o1, final StringBuilder o2) { - final int i1 = DidYouMean.this.index.count(Word.word2hash(o1)); - final int i2 = DidYouMean.this.index.count(Word.word2hash(o2)); + final int i1 = DidYouMean.this.segment.getQueryCount(o1); + final int i2 = DidYouMean.this.segment.getQueryCount(o2); if (i1 == i2) { return WORD_LENGTH_COMPARATOR.compare(o1, o2); } diff --git a/source/net/yacy/cora/services/federated/solr/EmbeddedSolrConnector.java b/source/net/yacy/cora/services/federated/solr/EmbeddedSolrConnector.java index dec9e76b8..ec406cd95 100644 --- a/source/net/yacy/cora/services/federated/solr/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/EmbeddedSolrConnector.java @@ -26,7 +26,6 @@ import java.io.IOException; import javax.xml.parsers.ParserConfigurationException; - import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; import org.apache.solr.client.solrj.response.QueryResponse; @@ -90,7 +89,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo } } } - + try { this.cores = new CoreContainer(storagePath.getAbsolutePath(), new File(solr_config, "solr.xml")); } catch (ParserConfigurationException e) { @@ -157,16 +156,17 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo return rsp; } + @Override public QueryResponse query(SolrParams params) throws IOException { try { - return server.query(params); + return this.server.query(params); } catch (SolrServerException e) { throw new IOException(e); } catch (Throwable e) { throw new IOException("Error executing query", e); } } - + public static void main(String[] args) { File solr_config = new File("defaults/solr"); File storage = new File("DATA/INDEX/webportal/SEGMENTS/text/solr/"); diff --git a/source/net/yacy/cora/services/federated/solr/MirrorSolrConnector.java b/source/net/yacy/cora/services/federated/solr/MirrorSolrConnector.java index c3f102b32..e989efa09 100644 --- a/source/net/yacy/cora/services/federated/solr/MirrorSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/MirrorSolrConnector.java @@ -23,6 +23,7 @@ package net.yacy.cora.services.federated.solr; import java.io.IOException; import java.util.Collection; import java.util.List; +import java.util.concurrent.atomic.AtomicLong; import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ConcurrentARC; @@ -300,6 +301,39 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo return list; } + @Override + public long getQueryCount(final String querystring) throws IOException { + if (this.solr0 == null && this.solr1 == null) return 0; + if (this.solr0 != null && this.solr1 == null) { + return this.solr0.getQueryCount(querystring); + } + if (this.solr1 != null && this.solr0 == null) { + return this.solr1.getQueryCount(querystring); + } + final AtomicLong count = new AtomicLong(0); + Thread t0 = new Thread() { + @Override + public void run() { + try { + count.addAndGet(MirrorSolrConnector.this.solr0.getQueryCount(querystring)); + } catch (IOException e) {} + } + }; + t0.start(); + Thread t1 = new Thread() { + @Override + public void run() { + try { + count.addAndGet(MirrorSolrConnector.this.solr1.getQueryCount(querystring)); + } catch (IOException e) {} + } + }; + t1.start(); + try {t0.join();} catch (InterruptedException e) {} + try {t1.join();} catch (InterruptedException e) {} + return count.get(); + } + private void addToCache(SolrDocumentList list) { if (MemoryControl.shortStatus()) clearCache(); for (final SolrDocument solrdoc: list) { diff --git a/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java index ed8995795..de2cd5c1d 100644 --- a/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/MultipleSolrConnector.java @@ -163,6 +163,12 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr return this.solr.query(querystring, offset, count); } + + @Override + public long getQueryCount(final String querystring) throws IOException { + return this.solr.getQueryCount(querystring); + } + @Override public long getSize() { return this.solr.getSize(); diff --git a/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java index 704a177cf..9e2e1c931 100644 --- a/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/RetrySolrConnector.java @@ -183,6 +183,21 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon return null; } + @Override + public long getQueryCount(final String querystring) throws IOException { + final long t = System.currentTimeMillis() + this.retryMaxTime; + Throwable ee = null; + while (System.currentTimeMillis() < t) try { + return this.solrConnector.getQueryCount(querystring); + } catch (final Throwable e) { + ee = e; + try {Thread.sleep(10);} catch (final InterruptedException e1) {} + continue; + } + if (ee != null) throw (ee instanceof IOException) ? (IOException) ee : new IOException(ee.getMessage()); + return 0; + } + @Override public long getSize() { final long t = System.currentTimeMillis() + this.retryMaxTime; diff --git a/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java index d3dca171b..8b7d5c8f0 100644 --- a/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/ShardSolrConnector.java @@ -29,6 +29,7 @@ import java.net.InetAddress; import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.concurrent.atomic.AtomicLong; import net.yacy.cora.protocol.Domains; @@ -165,15 +166,50 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon @Override public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException { final SolrDocumentList list = new SolrDocumentList(); + List t = new ArrayList(); for (final SolrConnector connector: this.connectors) { - final SolrDocumentList l = connector.query(querystring, offset, count); - for (final SolrDocument d: l) { - list.add(d); - } + Thread t0 = new Thread() { + @Override + public void run() { + try { + final SolrDocumentList l = connector.query(querystring, offset, count); + for (final SolrDocument d: l) { + list.add(d); + } + } catch (IOException e) {} + } + }; + t0.start(); + t.add(t0); + } + for (Thread t0: t) { + try {t0.join();} catch (InterruptedException e) {} } return list; } + @Override + public long getQueryCount(final String querystring) throws IOException { + final AtomicLong count = new AtomicLong(0); + List t = new ArrayList(); + for (final SolrConnector connector: this.connectors) { + Thread t0 = new Thread() { + @Override + public void run() { + try { + count.addAndGet(connector.getQueryCount(querystring)); + } catch (IOException e) {} + } + }; + t0.start(); + t.add(t0); + } + for (Thread t0: t) { + try {t0.join();} catch (InterruptedException e) {} + } + return count.get(); + } + public long[] getSizeList() { final long[] size = new long[this.connectors.size()]; int i = 0; diff --git a/source/net/yacy/cora/services/federated/solr/SolrConnector.java b/source/net/yacy/cora/services/federated/solr/SolrConnector.java index 94938950a..be76c06f5 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrConnector.java @@ -112,7 +112,15 @@ public interface SolrConnector extends Iterable /* Iterable of document * @throws IOException */ public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException, SolrException; - + + /** + * get the number of results when this query is done. + * This should only be called if the actual result is never used, and only the count is interesting + * @param querystring + * @return the number of results for this query + */ + public long getQueryCount(final String querystring) throws IOException; + /** * Get a query result from solr as a stream of documents. * The result queue is considered as terminated if AbstractSolrConnectro.POISON_DOCUMENT is returned. diff --git a/source/net/yacy/cora/services/federated/solr/SolrServerConnector.java b/source/net/yacy/cora/services/federated/solr/SolrServerConnector.java index e72dc8e58..1c83f833f 100644 --- a/source/net/yacy/cora/services/federated/solr/SolrServerConnector.java +++ b/source/net/yacy/cora/services/federated/solr/SolrServerConnector.java @@ -186,7 +186,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen throw new IOException(e); } } - + /** * get a query result from solr * to get all results set the query String to "*:*" @@ -208,8 +208,22 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen return docs; } + @Override + public long getQueryCount(String querystring) throws IOException { + // construct query + final SolrQuery params = new SolrQuery(); + params.setQuery(querystring); + params.setRows(1); + params.setStart(0); + + // query the server + QueryResponse rsp = query(params); + final SolrDocumentList docs = rsp.getResults(); + return docs.getNumFound(); + } + abstract public QueryResponse query(SolrParams params) throws IOException; - + private final char[] queryIDTemplate = "id:\" \"".toCharArray(); /** diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 20dadc1a1..d5b3185c9 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -971,7 +971,7 @@ public final class Switchboard extends serverSwitch Long.parseLong(getConfig(SwitchboardConstants.INDEX_DIST_IDLESLEEP, "5000")), Long.parseLong(getConfig(SwitchboardConstants.INDEX_DIST_BUSYSLEEP, "0")), Long.parseLong(getConfig(SwitchboardConstants.INDEX_DIST_MEMPREREQ, "1000000"))); - + // content control: initialize list sync thread deployThread( "720_ccimport", @@ -1003,7 +1003,7 @@ public final class Switchboard extends serverSwitch 3000, 10000), 2000); - + // set network-specific performance attributes if ( this.firstInit ) { setRemotecrawlPPM(Math.max(1, (int) getConfigLong("network.unit.remotecrawl.speed", 60))); @@ -1015,7 +1015,7 @@ public final class Switchboard extends serverSwitch //query.add(CrawlSwitchboardEntry.word2hash("Zahl")); //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true); //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true); - //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260); + //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/kiosk/archiv/ct/2003/4/20"), query, true, 260); this.trail = new LinkedBlockingQueue(); @@ -2971,9 +2971,9 @@ public final class Switchboard extends serverSwitch if ( size < 10 ) { return "no DHT distribution: loadedURL.size() = " + size; } - if ( indexSegment.termIndex().sizesMax() < 100 ) { + if ( indexSegment.RWICount() < 100 ) { return "no DHT distribution: not enough words - wordIndex.size() = " - + indexSegment.termIndex().sizesMax(); + + indexSegment.RWICount(); } if ( (getConfig(SwitchboardConstants.INDEX_DIST_ALLOW_WHILE_CRAWLING, "false") .equalsIgnoreCase("false")) && (this.crawlQueues.noticeURL.notEmptyLocal()) ) { diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 052be2496..05a98d3bc 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -191,6 +191,18 @@ public class Segment { return this.termIndex.getBufferSize(); } + public int getQueryCount(String word) { + int count = this.termIndex == null ? 0 : this.termIndex.count(Word.word2hash(word)); + try {count += this.fulltext.getSolr().getQueryCount(YaCySchema.text_t.name() + ':' + word);} catch (IOException e) {} + return count; + } + + public int getQueryCount(StringBuilder word) { + int count = this.termIndex == null ? 0 : this.termIndex.count(Word.word2hash(word)); + try {count += this.fulltext.getSolr().getQueryCount(YaCySchema.text_t.name() + ':' + word.toString());} catch (IOException e) {} + return count; + } + public boolean exists(final byte[] urlhash) { return this.fulltext.exists(urlhash); } @@ -204,7 +216,7 @@ public class Segment { final String host = stub.getHost(); String hh = DigestURI.hosthash(host); final BlockingQueue hostQueue = this.fulltext.getSolr().concurrentIDs(YaCySchema.host_id_s + ":" + hh, 0, Integer.MAX_VALUE, 10000); - + final String urlstub = stub.toNormalform(false, false); // now filter the stub from the iterated urls diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index f7ccbd39e..e51e38c31 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -708,9 +708,9 @@ public final class RWIProcess extends Thread this.sortout++; continue; } - + // content control - + if (Switchboard.getSwitchboard().getConfigBool( "contentcontrol.enabled", false) == true) { @@ -990,7 +990,6 @@ public final class RWIProcess extends Thread final Map counts = new HashMap(); final Iterator i = this.ref.keys(false); String word; - byte[] termHash; int c; float q, min = Float.MAX_VALUE, max = Float.MIN_VALUE; int ic = count; @@ -999,8 +998,7 @@ public final class RWIProcess extends Thread if ( word == null ) { continue; } - termHash = Word.word2hash(word); - c = this.query.getSegment().termIndex().count(termHash); + c = this.query.getSegment().getQueryCount(word); if ( c > 0 ) { q = ((float) this.ref.get(word)) / ((float) c); min = Math.min(min, q);