From 2c39b654096da35412e94be7328b8347d4506fbc Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 26 Nov 2013 02:24:47 +0100 Subject: [PATCH] fixes for searches containing stopwords. The fix was done using a reconstruction of the search word set access method to protect that words are deleted from the sets from the outside of the QueryGoal class. --- htroot/yacysearch.java | 6 +- htroot/yacysearchitem.java | 11 ++-- source/net/yacy/kelondro/util/SetTools.java | 29 ++++------ source/net/yacy/peers/RemoteSearch.java | 1 + source/net/yacy/search/query/QueryGoal.java | 55 ++++++++++++++----- source/net/yacy/search/query/QueryParams.java | 12 ++-- source/net/yacy/search/query/SearchEvent.java | 6 +- 7 files changed, 69 insertions(+), 51 deletions(-) diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 54b4cc6e9..adb16d9e5 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -525,10 +525,8 @@ public class yacysearch { // filter out stopwords final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeWords(), Switchboard.stopwords); //find matching stopwords - if ( !filtered.isEmpty() ) { - SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeWords(), filtered); //remove stopwords - } - + qg.removeIncludeWords(filtered); + // if a minus-button was hit, remove a special reference first if ( post != null && post.containsKey("deleteref") ) { try { diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index ffbf6a1ef..d42b3bf7d 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -25,8 +25,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.net.MalformedURLException; -import java.util.List; - +import java.util.Iterator; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; @@ -218,11 +217,9 @@ public class yacysearchitem { 
prop.putHTML("content_publisher", result.publisher()); prop.putHTML("content_creator", result.creator());// author prop.putHTML("content_subject", result.subject()); - final List query = theSearch.query.getQueryGoal().getIncludeStrings(); - final StringBuilder s = new StringBuilder(query.size() * 20); - for (final String t: query) { - s.append('+').append(t); - } + final Iterator query = theSearch.query.getQueryGoal().getIncludeStrings(); + final StringBuilder s = new StringBuilder(theSearch.query.getQueryGoal().getIncludeSize() * 20); + while (query.hasNext()) s.append('+').append(query.next()); final String words = (s.length() > 0) ? s.substring(1) : ""; prop.putHTML("content_words", words); prop.putHTML("content_showParser_words", words); diff --git a/source/net/yacy/kelondro/util/SetTools.java b/source/net/yacy/kelondro/util/SetTools.java index 3306e8372..534752aaf 100644 --- a/source/net/yacy/kelondro/util/SetTools.java +++ b/source/net/yacy/kelondro/util/SetTools.java @@ -214,18 +214,17 @@ public final class SetTools { // start most efficient method if (stepsEnum > stepsTest) { - if (set1.size() < set2.size()) return joinConstructiveByTest(set1, set2); - return joinConstructiveByTest(set2, set1); + if (set1.size() < set2.size()) return joinConstructiveByTest(set1.iterator(), set2); + return joinConstructiveByTest(set2.iterator(), set1); } return joinConstructiveByEnumeration(set1, set2); } - public static SortedSet joinConstructiveByTest(final Collection small, final SortedSet large) { - final Iterator mi = small.iterator(); + public static SortedSet joinConstructiveByTest(final Iterator small, final SortedSet large) { final SortedSet result = new TreeSet(large.comparator()); A o; - while (mi.hasNext()) { - o = mi.next(); + while (small.hasNext()) { + o = small.next(); if (large.contains(o)) result.add(o); } return result; @@ -264,9 +263,9 @@ public final class SetTools { * @param large * @return true if the small set is completely included in the large set 
*/ - public static boolean totalInclusion(final Set small, final Set large) { - for (A o: small) { - if (!large.contains(o)) return false; + public static boolean totalInclusion(final Iterator small, final Set large) { + while (small.hasNext()) { + if (!large.contains(small.next())) return false; } return true; } @@ -305,8 +304,7 @@ public final class SetTools { // start most efficient method if (stepsEnum > stepsTest) { - if (set1.size() < set2.size()) return anymatchByTest(set1, set2); - return anymatchByTest(set2, set1); + return (set1.size() < set2.size()) ? anymatchByTest(set1.iterator(), set2) : anymatchByTest(set2.iterator(), set1); } return anymatchByEnumeration(set1, set2); } @@ -337,12 +335,9 @@ public final class SetTools { return anymatchByEnumeration(set1, set2); } - private static boolean anymatchByTest(final SortedSet small, final SortedSet large) { - final Iterator mi = small.iterator(); - A o; - while (mi.hasNext()) { - o = mi.next(); - if (large.contains(o)) return true; + public static boolean anymatchByTest(final Iterator small, final SortedSet large) { + while (small.hasNext()) { + if (large.contains(small.next())) return true; } return false; } diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index 53685298e..9fbbe2f85 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -273,6 +273,7 @@ public class RemoteSearch extends Thread { final Seed targetPeer, final Blacklist blacklist) { + assert solrQuery != null; // check own peer status if (event.peers.mySeed() == null || event.peers.mySeed().getPublicAddress() == null) { return null; } // prepare seed targets and threads diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index c77c50532..318ba6aef 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -25,8 +25,10 @@ import 
java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Collection; +import java.util.Iterator; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; @@ -39,13 +41,13 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.word.Word; +import net.yacy.kelondro.util.SetTools; import net.yacy.search.index.Segment; import net.yacy.search.schema.CollectionConfiguration; import net.yacy.search.schema.CollectionSchema; public class QueryGoal { - private static char space = ' '; private static char sq = '\''; private static char dq = '"'; @@ -193,8 +195,8 @@ public class QueryGoal { * if possible, use getIncludeWords instead */ public HandleSet getIncludeHashes() { - if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words); - return include_hashes; + if (this.include_hashes == null) this.include_hashes = Word.words2hashesHandles(include_words); + return this.include_hashes; } /** @@ -202,38 +204,56 @@ public class QueryGoal { * if possible, use getExcludeWords instead */ public HandleSet getExcludeHashes() { - if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words); - return exclude_hashes; + if (this.exclude_hashes == null) this.exclude_hashes = Word.words2hashesHandles(exclude_words); + return this.exclude_hashes; + } + + public int getIncludeSize() { + assert this.include_hashes == null || this.include_hashes.size() == this.include_words.size(); // include_hashes is computed lazily by getIncludeHashes() and may still be null here + return this.include_words.size(); } + public int getExcludeSize() { + assert this.exclude_hashes == null || this.exclude_hashes.size() == this.exclude_words.size(); // exclude_hashes is computed lazily by getExcludeHashes() and may still be null here + return this.exclude_words.size(); + } + /** * @return a set of words to be included in the search result */ - public NormalizedWords getIncludeWords() { - return include_words; + public Iterator getIncludeWords() { + 
return this.include_words.iterator(); } /** * @return a set of words to be excluded in the search result */ - public NormalizedWords getExcludeWords() { - return exclude_words; + public Iterator getExcludeWords() { + return this.exclude_words.iterator(); } /** * @return a list of include strings which reproduces the original order of the search words and quotation */ - public ArrayList getIncludeStrings() { - return include_strings; + public Iterator getIncludeStrings() { + return this.include_strings.iterator(); } /** * @return a list of exclude strings which reproduces the original order of the search words and quotation */ - public ArrayList getExcludeStrings() { - return exclude_strings; + public Iterator getExcludeStrings() { + return this.exclude_strings.iterator(); } + public void removeIncludeWords(Set words) { + if (!words.isEmpty()) { + SetTools.excludeDestructiveByTestSmallInLarge(this.include_words, words); //remove the given words (e.g. stopwords) from the include words + SetTools.excludeDestructiveByTestSmallInLarge(this.include_strings, words); //remove the given words (e.g. stopwords) from the include strings + if (include_hashes != null) for (String word: words) this.include_hashes.remove(Word.word2hash(word)); + } + } + /** * the include string may be useful (and better) for highlight/snippet computation * @return the query string containing only the positive literals (includes) and without whitespace characters @@ -251,13 +271,20 @@ public class QueryGoal { return (Segment.catchallString.equals(w)); } + public boolean containsInclude(String word) { + if (word == null || word.length() == 0) return false; + + String t = word.toLowerCase(Locale.ENGLISH); + return this.include_strings.contains(t) || this.include_words.contains(t); + } + public boolean matches(String text) { if (text == null || text.length() == 0) return false; // parse special requests if (isCatchall()) return true; - String t = text.toLowerCase(); + String t = text.toLowerCase(Locale.ENGLISH); for (String i: this.include_strings) if (t.indexOf(i.toLowerCase()) < 0) return 
false; for (String e: this.exclude_strings) if (t.indexOf(e.toLowerCase()) >= 0) return false; return true; diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index dce1bb532..3dbf38eb6 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -320,16 +320,16 @@ public final class QueryParams { private final boolean matchesText(final String text) { boolean ret = false; QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords(Condenser.getWords(text, null).keySet()); - if (!SetTools.anymatch(words, this.queryGoal.getExcludeWords())) { + if (!SetTools.anymatchByTest(this.queryGoal.getExcludeWords(), words)) { ret = SetTools.totalInclusion(this.queryGoal.getIncludeWords(), words); } return ret; } - protected static final boolean anymatch(final String text, final QueryGoal.NormalizedWords keywords) { - if (keywords == null || keywords.isEmpty()) return false; + protected static final boolean anymatch(final String text, final Iterator keywords) { + if (keywords == null || !keywords.hasNext()) return false; final SortedSet textwords = (SortedSet) Condenser.getWords(text, null).keySet(); - return SetTools.anymatch(textwords, keywords); + return SetTools.anymatchByTest(keywords, textwords); } public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) { @@ -343,7 +343,7 @@ public final class QueryParams { if (!getFacets) this.cachedQuery.setFacet(false); return this.cachedQuery; } - if (this.queryGoal.getIncludeWords().size() == 0) return null; + if (this.queryGoal.getIncludeSize() == 0) return null; // construct query final SolrQuery params = getBasicParams(getFacets); @@ -368,7 +368,7 @@ public final class QueryParams { if (!getFacets) this.cachedQuery.setFacet(false); return this.cachedQuery; } - if (this.queryGoal.getIncludeWords().size() == 0) return null; + if (this.queryGoal.getIncludeSize() == 0) 
return null; // construct query final SolrQuery params = getBasicParams(getFacets); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 4ba335e96..0b0af1044 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1079,7 +1079,7 @@ public final class SearchEvent { final String pagetitle = page.dc_title().toLowerCase(); // check exclusion - if (!this.query.getQueryGoal().getExcludeWords().isEmpty() && + if (this.query.getQueryGoal().getExcludeSize() != 0 && ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords())) || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords())) || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) { @@ -1295,7 +1295,7 @@ public final class SearchEvent { // apply query-in-result matching final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps); final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps); - final Iterator shi = this.query.getQueryGoal().getIncludeWords().iterator(); + final Iterator shi = this.query.getQueryGoal().getIncludeWords(); String queryword; while (shi.hasNext()) { queryword = shi.next(); @@ -1641,7 +1641,7 @@ public final class SearchEvent { if ( word.length() > 2 && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off" .indexOf(word) < 0 - && !this.query.getQueryGoal().getIncludeWords().contains(word) + && !this.query.getQueryGoal().containsInclude(word) && lettermatch.matcher(word).matches() && !Switchboard.badwords.contains(word) && !Switchboard.stopwords.contains(word) ) {