From 940c6849ee5c38c3fdbb8cc91de65657c7d3a6dd Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 17 Mar 2013 10:52:31 +0100 Subject: [PATCH] enhanced did-you-mean (a bit): can now remember previously searched words (plus small enhancements) --- htroot/suggest.java | 2 +- source/net/yacy/cora/document/WordCache.java | 9 +++++++-- source/net/yacy/data/DidYouMean.java | 2 +- source/net/yacy/search/ResourceObserver.java | 2 +- source/net/yacy/search/Switchboard.java | 2 +- source/net/yacy/search/index/Segment.java | 17 +++++++++-------- source/net/yacy/search/query/QueryGoal.java | 4 ++++ 7 files changed, 24 insertions(+), 14 deletions(-) diff --git a/htroot/suggest.java b/htroot/suggest.java index e86b5984f..be0b409f7 100644 --- a/htroot/suggest.java +++ b/htroot/suggest.java @@ -58,7 +58,7 @@ public class suggest { final String ext = header.get("EXT", ""); final boolean json = ext.equals("json"); final boolean xml = ext.equals("xml"); - final boolean more = post != null && post.containsKey("more"); + final boolean more = sb.index.connectedRWI() || (post != null && post.containsKey("more")); // with RWIs connected the guessing is super-fast // get query final String originalquerystring = (post == null) ? "" : post.get("query", post.get("q", "")).trim(); diff --git a/source/net/yacy/cora/document/WordCache.java b/source/net/yacy/cora/document/WordCache.java index 84726f36a..8b406bf20 100644 --- a/source/net/yacy/cora/document/WordCache.java +++ b/source/net/yacy/cora/document/WordCache.java @@ -26,6 +26,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.util.Collection; import java.util.ConcurrentModificationException; import java.util.HashSet; import java.util.Map; @@ -219,7 +220,7 @@ public class WordCache { } } - public static void learn(Set wordset) { + public static void learn(Collection wordset) { for (String s: wordset) { learn(new StringBuilder(s)); } @@ -320,8 +321,12 @@ public class WordCache { } return size; } + + public static int sizeCommonWords() { + return commonWords.size(); + } - public static void clear() { + public static void clearCommonWords() { commonWords.clear(); } diff --git a/source/net/yacy/data/DidYouMean.java b/source/net/yacy/data/DidYouMean.java index 1c27e32af..47540b002 100644 --- a/source/net/yacy/data/DidYouMean.java +++ b/source/net/yacy/data/DidYouMean.java @@ -432,7 +432,7 @@ public class DidYouMean { StringBuilder s; try { while ((s = DidYouMean.this.guessLib.take()) != POISON_STRING) { - if (s.length() >= MinimumOutputWordLength && DidYouMean.this.segment.getWordCountGuess(s.toString()) > 0) { + if (s.length() >= MinimumOutputWordLength && DidYouMean.this.segment.getWordCountGuess(s.toString()) > 2) { DidYouMean.this.resultSet.add(s); } if (System.currentTimeMillis() > DidYouMean.this.timeLimit) { diff --git a/source/net/yacy/search/ResourceObserver.java b/source/net/yacy/search/ResourceObserver.java index 806c09622..5a5afab8c 100644 --- a/source/net/yacy/search/ResourceObserver.java +++ b/source/net/yacy/search/ResourceObserver.java @@ -93,7 +93,7 @@ public class ResourceObserver { SearchEventCache.cleanupEvents(true); this.sb.trail.clear(); Switchboard.urlBlacklist.clearblacklistCache(); - WordCache.clear(); + WordCache.clearCommonWords(); Domains.clear(); } } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index eb069eac2..a8017d3ab 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2045,7 +2045,7 @@ public final class Switchboard extends serverSwitch { PDFont.clearResources(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu // clear caches - WordCache.clear(); + if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords(); Domains.clear(); // clean up image stack diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 358d3cf28..0dac40629 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -287,15 +287,16 @@ public class Segment { */ public int getWordCountGuess(String word) { if (word == null || word.indexOf(':') >= 0 || word.indexOf(' ') >= 0 || word.indexOf('/') >= 0) return 0; - if (this.termIndex == null) { - try { - return (int) this.fulltext.getDefaultConnector().getQueryCount(CollectionSchema.text_t.getSolrFieldName() + ':' + word); - } catch (Throwable e) { - Log.logException(e); - return 0; - } + if (this.termIndex != null) { + int count = this.termIndex.count(Word.word2hash(word)); + if (count > 0) return count; + } + try { + return (int) this.fulltext.getDefaultConnector().getQueryCount(CollectionSchema.text_t.getSolrFieldName() + ':' + word); + } catch (Throwable e) { + Log.logException(e); + return 0; } - return this.termIndex.count(Word.word2hash(word)); } public boolean exists(final String urlhash) { diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 500b78170..2b7819d83 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -27,6 +27,7 @@ import java.util.ArrayList; import java.util.Map; import java.util.SortedSet; +import net.yacy.cora.document.WordCache; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SchemaDeclaration; import net.yacy.cora.federate.solr.SolrType; @@ -92,6 +93,9 @@ public class QueryGoal { for (String s: this.include_strings) parseQuery(s, this.include_words, this.include_words, this.all_words); for (String s: this.exclude_strings) parseQuery(s, this.exclude_words, this.exclude_words, this.all_words); + WordCache.learn(this.include_strings); + WordCache.learn(this.exclude_strings); + this.include_hashes = null; this.exclude_hashes = null; this.all_hashes = null;