From c40ba51ca6effd97b4e8835b2deade0ff071ac96 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 3 Feb 2014 12:44:52 +0100 Subject: [PATCH] added new suggest method which replaces more-than-one suggestions: instead of computing suggest permutations of the given words, the completion of a phrase using the given words is searched in the fulltext index. --- htroot/suggest.java | 43 ++++---- source/net/yacy/data/DidYouMean.java | 115 ++++++++++++++++------ source/net/yacy/search/index/Segment.java | 2 +- 3 files changed, 106 insertions(+), 54 deletions(-) diff --git a/htroot/suggest.java b/htroot/suggest.java index 12b93576f..2c8e93a24 100644 --- a/htroot/suggest.java +++ b/htroot/suggest.java @@ -59,36 +59,33 @@ public class suggest { final String ext = header.get("EXT", ""); final boolean json = ext.equals("json"); final boolean xml = ext.equals("xml"); - final boolean more = sb.index.connectedRWI() || (post != null && post.containsKey("more")); // with RWIs connected the guessing is super-fast - + // get query - final String originalquerystring = (post == null) ? "" : post.get("query", post.get("q", "")).trim(); + final String originalquerystring = (post == null) ? "" : post.get("query", post.get("q", "")); final String querystring = originalquerystring.replace('+', ' '); final int timeout = (post == null) ? 300 : post.getInt("timeout", 300); final int count = (post == null) ? 
10 : Math.min(20, post.getInt("count", 10)); int c = 0; - if (more || (sb.index.getWordCountGuess(querystring) == 0)) { - final DidYouMean didYouMean = new DidYouMean(sb.index, new StringBuilder(querystring)); - final SortedSet suggestions = didYouMean.getSuggestions(timeout, count); - //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]] - synchronized (suggestions) { - for (StringBuilder suggestion: suggestions) { - if (c >= meanMax) break; - try { - String s = suggestion.toString(); - if (json) { - prop.putJSON("suggestions_" + c + "_text", s); - } else if (xml) { - prop.putXML("suggestions_" + c + "_text", s); - } else { - prop.putHTML("suggestions_" + c + "_text", s); - } - prop.put("suggestions_" + c + "_eol", 0); - c++; - } catch (final ConcurrentModificationException e) { - ConcurrentLog.logException(e); + final DidYouMean didYouMean = new DidYouMean(sb.index, new StringBuilder(querystring)); + final SortedSet suggestions = didYouMean.getSuggestions(timeout, count); + //[#[query]#,[#{suggestions}##[text]##(eol)#,::#(/eol)##{/suggestions}#]] + synchronized (suggestions) { + for (StringBuilder suggestion: suggestions) { + if (c >= meanMax) break; + try { + String s = suggestion.toString(); + if (json) { + prop.putJSON("suggestions_" + c + "_text", s); + } else if (xml) { + prop.putXML("suggestions_" + c + "_text", s); + } else { + prop.putHTML("suggestions_" + c + "_text", s); } + prop.put("suggestions_" + c + "_eol", 0); + c++; + } catch (final ConcurrentModificationException e) { + ConcurrentLog.logException(e); } } } diff --git a/source/net/yacy/data/DidYouMean.java b/source/net/yacy/data/DidYouMean.java index cb0c11031..da281cb13 100644 --- a/source/net/yacy/data/DidYouMean.java +++ b/source/net/yacy/data/DidYouMean.java @@ -1,19 +1,29 @@ package net.yacy.data; +import java.io.IOException; import java.util.Collections; import java.util.Comparator; import java.util.ConcurrentModificationException; +import java.util.Iterator; +import 
java.util.List; +import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.LinkedBlockingQueue; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrException; + import net.yacy.cora.sorting.ClusteredScoreMap; +import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.StringBuilderComparator; import net.yacy.document.LibraryProvider; import net.yacy.search.index.Segment; +import net.yacy.search.schema.CollectionSchema; /** @@ -29,7 +39,7 @@ import net.yacy.search.index.Segment; * the above mentioned four categories. Consumer threads then check the generated word variations against a term index. * Only words contained in the term index are returned by the getSuggestions method.

* @author apfelmaennchen - * @author orbiter (extensions for multi-language support) + * @author orbiter (extensions for multi-language support + multi-word suggestions) */ public class DidYouMean { @@ -79,7 +89,7 @@ public class DidYouMean { private final SortedSet resultSet; private final indexSizeComparator INDEX_SIZE_COMPARATOR; private char[] alphabet; - + private boolean more; /** * @param index a termIndex - most likely retrieved from a switchboard object. @@ -94,6 +104,7 @@ public class DidYouMean { this.guessLib = new LinkedBlockingQueue(); this.createGen = true; this.INDEX_SIZE_COMPARATOR = new indexSizeComparator(); + this.more = segment.connectedRWI() && segment.RWICount() > 0; // with RWIs connected the guessing is super-fast // identify language if (this.word.length() > 0) { @@ -154,8 +165,10 @@ public class DidYouMean { } final long startTime = System.currentTimeMillis(); final long timelimit = startTime + timeout; - if (StringBuilderComparator.CASE_INSENSITIVE_ORDER.indexOf(this.word, ' ') > 0) { - return getSuggestions(StringBuilderComparator.CASE_INSENSITIVE_ORDER.split(this.word, ' '), timeout, preSortSelection, this.segment); + int lastIndexOfSpace = this.word.lastIndexOf(" "); + if (lastIndexOfSpace > 0) { + // recursion over several words + return getSuggestions(this.word.substring(0, lastIndexOfSpace), this.word.substring(lastIndexOfSpace + 1), timeout, preSortSelection, this.segment); } final SortedSet preSorted = getSuggestions(timeout); if (System.currentTimeMillis() > timelimit) { @@ -204,35 +217,75 @@ public class DidYouMean { /** * return a string that is a suggestion list for the list of given words - * @param words + * @param head - the sequence of words before the last space in the sequence + * @param tail - the word after the last space, possibly empty * @param timeout * @param preSortSelection * @return */ - @SuppressWarnings("unchecked") - private static SortedSet getSuggestions(final StringBuilder[] words, final long timeout, 
final int preSortSelection, final Segment segment) { - final SortedSet[] s = new SortedSet[words.length]; - for (int i = 0; i < words.length; i++) { - s[i] = new DidYouMean(segment, words[i]).getSuggestions(timeout / words.length, preSortSelection); - } - // make all permutations + private static SortedSet getSuggestions(final String head, final String tail, final long timeout, final int preSortSelection, final Segment segment) { final SortedSet result = new TreeSet(StringBuilderComparator.CASE_INSENSITIVE_ORDER); - StringBuilder sb; - for (int i = 0; i < words.length; i++) { - if (s[i].isEmpty()) { - continue; - } - sb = new StringBuilder(20); - for (int j = 0; j < words.length; j++) { - if (j > 0) { - sb.append(' '); - } - if (i == j) { - sb.append(s[j].first()); - } else { - sb.append(words[j]); + int count = 20; + final SolrQuery solrQuery = new SolrQuery(); + solrQuery.setParam("defType", "edismax"); + solrQuery.setFacet(false); + solrQuery.setQuery(CollectionSchema.title.getSolrFieldName() + ":\"" + head + "\"^10 OR " + CollectionSchema.text_t.getSolrFieldName() + ":\"" + head + "\""); + if (tail.length() > 0) solrQuery.setFilterQueries(CollectionSchema.text_t.getSolrFieldName() + ":/.*" + head + " " + tail + ".*/"); + solrQuery.setStart(0); + solrQuery.setRows(count); + solrQuery.setHighlight(true); + solrQuery.setHighlightFragsize(head.length() + tail.length() + 80); + solrQuery.setHighlightSimplePre(""); + solrQuery.setHighlightSimplePost(""); + solrQuery.setHighlightSnippets(1); + solrQuery.addHighlightField(CollectionSchema.title.getSolrFieldName()); + solrQuery.addHighlightField(CollectionSchema.text_t.getSolrFieldName()); + solrQuery.setFields(); // no fields wanted! 
only snippets + //List snippets = new ArrayList(); + OrderedScoreMap snippets = new OrderedScoreMap(null); + try { + QueryResponse response = segment.fulltext().getDefaultConnector().getResponseByParams(solrQuery); + Map>> rawsnippets = response.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets + if (rawsnippets != null) { + for (Map> re: rawsnippets.values()) { + for (List sl: re.values()) { + for (String s: sl) { + int sp = s.indexOf(""); + if (sp >= 0) { + s = s.substring(sp + 4); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (c < 'A') s = s.replace(c, ' '); + } + s = s.trim(); + sp = s.indexOf(" "); + if (sp >= 0) s = s.substring(0, sp); + sp = s.indexOf(""); + if (sp >= 0) s = s.substring(0, sp).trim(); + String[] sx = s.split(" "); + StringBuilder sb = new StringBuilder(s.length()); + for (String x: sx) if (x.length() > 1 && sb.length() < 28) sb.append(x).append(' '); else break; + s = sb.toString().trim(); + int score = count; + for (String a: snippets) { + if (a.startsWith(s)) snippets.inc(a, count); + if (s.startsWith(a)) score += count; + } + if (sb.length() > 2) snippets.inc(s, score); + count--; + } + } + } } } + } catch (SolrException e) { + } catch (IOException e) { + } + Iterator si = snippets.keys(false); + while (si.hasNext() && result.size() < 10) { + String s = si.next(); + StringBuilder sb = new StringBuilder(head.length() + s.length() + 1); + sb.append(head).append(' ').append(s); result.add(sb); } return result; @@ -291,8 +344,9 @@ public class DidYouMean { // we take guessLib entries as long as there is any entry in it // to see if this is the case, we must wait for termination of the producer for (final Thread t: producers) { - if (this.timeLimit > System.currentTimeMillis()) try { - t.join(Math.max(0, this.timeLimit - System.currentTimeMillis())); + long wait = this.timeLimit - System.currentTimeMillis(); + if (wait > 0) try { + t.join(wait); } catch (final 
InterruptedException e) {} } @@ -315,8 +369,9 @@ public class DidYouMean { // wait for termination of consumer for (final Consumer c: consumers) { - if (this.timeLimit > System.currentTimeMillis()) try { - c.join(Math.max(0, this.timeLimit - System.currentTimeMillis())); + long wait = this.timeLimit - System.currentTimeMillis(); + if (wait > 0) try { + c.join(wait); } catch (final InterruptedException e) {} } diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index e52d63b2a..3ad3f427f 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -443,7 +443,7 @@ public class Segment { * get a guess about the word count. This is only a guess because it uses the term index if present and this index may be * influenced by index transmission processes in its statistic word distribution. However, it can be a hint for heuristics * which use the word count. Please do NOT use this if the termIndex is not present because it otherwise uses the solr index - * which makes it painfull slow. + * which makes it painfully slow. * @param word * @return the number of references for this word. */