Fixes for searches containing stopwords. The fix restructures the

access methods of the search word sets so that words can no longer be
deleted from the sets from outside of the QueryGoal class.
pull/1/head
Michael Peter Christen 11 years ago
parent 5592ea57f0
commit 2c39b65409

@ -525,10 +525,8 @@ public class yacysearch {
// filter out stopwords
final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeWords(), Switchboard.stopwords); //find matching stopwords
if ( !filtered.isEmpty() ) {
SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeWords(), filtered); //remove stopwords
}
qg.removeIncludeWords(filtered);
// if a minus-button was hit, remove a special reference first
if ( post != null && post.containsKey("deleteref") ) {
try {

@ -25,8 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.util.List;
import java.util.Iterator;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
@ -218,11 +217,9 @@ public class yacysearchitem {
prop.putHTML("content_publisher", result.publisher());
prop.putHTML("content_creator", result.creator());// author
prop.putHTML("content_subject", result.subject());
final List<String> query = theSearch.query.getQueryGoal().getIncludeStrings();
final StringBuilder s = new StringBuilder(query.size() * 20);
for (final String t: query) {
s.append('+').append(t);
}
final Iterator<String> query = theSearch.query.getQueryGoal().getIncludeStrings();
final StringBuilder s = new StringBuilder(theSearch.query.getQueryGoal().getIncludeSize() * 20);
while (query.hasNext()) s.append('+').append(query.next());
final String words = (s.length() > 0) ? s.substring(1) : "";
prop.putHTML("content_words", words);
prop.putHTML("content_showParser_words", words);

@ -214,18 +214,17 @@ public final class SetTools {
// start most efficient method
if (stepsEnum > stepsTest) {
if (set1.size() < set2.size()) return joinConstructiveByTest(set1, set2);
return joinConstructiveByTest(set2, set1);
if (set1.size() < set2.size()) return joinConstructiveByTest(set1.iterator(), set2);
return joinConstructiveByTest(set2.iterator(), set1);
}
return joinConstructiveByEnumeration(set1, set2);
}
public static <A> SortedSet<A> joinConstructiveByTest(final Collection<A> small, final SortedSet<A> large) {
final Iterator<A> mi = small.iterator();
public static <A> SortedSet<A> joinConstructiveByTest(final Iterator<A> small, final SortedSet<A> large) {
final SortedSet<A> result = new TreeSet<A>(large.comparator());
A o;
while (mi.hasNext()) {
o = mi.next();
while (small.hasNext()) {
o = small.next();
if (large.contains(o)) result.add(o);
}
return result;
@ -264,9 +263,9 @@ public final class SetTools {
* @param large
* @return true if the small set is completely included in the large set
*/
public static <A> boolean totalInclusion(final Set<A> small, final Set<A> large) {
for (A o: small) {
if (!large.contains(o)) return false;
public static <A> boolean totalInclusion(final Iterator<A> small, final Set<A> large) {
while (small.hasNext()) {
if (!large.contains(small.next())) return false;
}
return true;
}
@ -305,8 +304,7 @@ public final class SetTools {
// start most efficient method
if (stepsEnum > stepsTest) {
if (set1.size() < set2.size()) return anymatchByTest(set1, set2);
return anymatchByTest(set2, set1);
return (set1.size() < set2.size()) ? anymatchByTest(set1.iterator(), set2) : anymatchByTest(set2.iterator(), set1);
}
return anymatchByEnumeration(set1, set2);
}
@ -337,12 +335,9 @@ public final class SetTools {
return anymatchByEnumeration(set1, set2);
}
private static <A> boolean anymatchByTest(final SortedSet<A> small, final SortedSet<A> large) {
final Iterator<A> mi = small.iterator();
A o;
while (mi.hasNext()) {
o = mi.next();
if (large.contains(o)) return true;
public static <A> boolean anymatchByTest(final Iterator<A> small, final SortedSet<A> large) {
while (small.hasNext()) {
if (large.contains(small.next())) return true;
}
return false;
}

@ -273,6 +273,7 @@ public class RemoteSearch extends Thread {
final Seed targetPeer,
final Blacklist blacklist) {
assert solrQuery != null;
// check own peer status
if (event.peers.mySeed() == null || event.peers.mySeed().getPublicAddress() == null) { return null; }
// prepare seed targets and threads

@ -25,8 +25,10 @@ import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
@ -39,13 +41,13 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.document.parser.html.AbstractScraper;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.util.SetTools;
import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
public class QueryGoal {
private static char space = ' ';
private static char sq = '\'';
private static char dq = '"';
@ -193,8 +195,8 @@ public class QueryGoal {
* if possible, use getIncludeWords instead
*/
public HandleSet getIncludeHashes() {
if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words);
return include_hashes;
if (this.include_hashes == null) this.include_hashes = Word.words2hashesHandles(include_words);
return this.include_hashes;
}
/**
@ -202,38 +204,56 @@ public class QueryGoal {
* if possible, use getExcludeWords instead
*/
public HandleSet getExcludeHashes() {
if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words);
return exclude_hashes;
if (this.exclude_hashes == null) this.exclude_hashes = Word.words2hashesHandles(exclude_words);
return this.exclude_hashes;
}
/**
 * @return the number of include words of this query goal
 */
public int getIncludeSize() {
    // include_hashes is computed lazily by getIncludeHashes() and may still be null here;
    // only compare the sizes once the hash cache actually exists.
    assert this.include_hashes == null || this.include_hashes.size() == this.include_words.size();
    return this.include_words.size();
}
/**
 * @return the number of exclude words of this query goal
 */
public int getExcludeSize() {
    // exclude_hashes is computed lazily by getExcludeHashes() and may still be null here;
    // only compare the sizes once the hash cache actually exists.
    assert this.exclude_hashes == null || this.exclude_hashes.size() == this.exclude_words.size();
    return this.exclude_words.size();
}
/**
* @return a set of words to be included in the search result
*/
public NormalizedWords getIncludeWords() {
return include_words;
public Iterator<String> getIncludeWords() {
return this.include_words.iterator();
}
/**
* @return a set of words to be excluded in the search result
*/
public NormalizedWords getExcludeWords() {
return exclude_words;
public Iterator<String> getExcludeWords() {
return this.exclude_words.iterator();
}
/**
* @return a list of include strings which reproduces the original order of the search words and quotation
*/
public ArrayList<String> getIncludeStrings() {
return include_strings;
public Iterator<String> getIncludeStrings() {
return this.include_strings.iterator();
}
/**
* @return a list of exclude strings which reproduces the original order of the search words and quotation
*/
public ArrayList<String> getExcludeStrings() {
return exclude_strings;
public Iterator<String> getExcludeStrings() {
return this.exclude_strings.iterator();
}
/**
 * Remove the given words (e.g. stopwords) from the include side of this query goal.
 * The words are removed from the include word set and the include string list; if the
 * include hash cache was already computed, the corresponding hashes are removed as well
 * so that the cache stays consistent with the word set.
 * @param words the words to remove; a null or empty set leaves the query goal unchanged
 */
public void removeIncludeWords(Set<String> words) {
    if (words == null || words.isEmpty()) return;
    // operate on the include collections (not the exclude ones) to match the hash cleanup below
    SetTools.excludeDestructiveByTestSmallInLarge(this.include_words, words); //remove stopwords
    SetTools.excludeDestructiveByTestSmallInLarge(this.include_strings, words); //remove stopwords
    if (this.include_hashes != null) {
        for (final String word: words) this.include_hashes.remove(Word.word2hash(word));
    }
}
/**
* the include string may be useful (and better) for highlight/snippet computation
* @return the query string containing only the positive literals (includes) and without whitespace characters
@ -251,13 +271,20 @@ public class QueryGoal {
return (Segment.catchallString.equals(w));
}
/**
 * Test if a word belongs to the include side of this query goal.
 * The word is lower-cased using the English locale and then looked up in the
 * include strings and in the include words.
 * @param word the word to look up
 * @return true if the lower-cased word is an include string or an include word;
 *         false otherwise, and always false for a null or empty word
 */
public boolean containsInclude(String word) {
    if (word == null || word.isEmpty()) {
        return false;
    }
    final String lowered = word.toLowerCase(Locale.ENGLISH);
    if (this.include_strings.contains(lowered)) {
        return true;
    }
    return this.include_words.contains(lowered);
}
public boolean matches(String text) {
if (text == null || text.length() == 0) return false;
// parse special requests
if (isCatchall()) return true;
String t = text.toLowerCase();
String t = text.toLowerCase(Locale.ENGLISH);
for (String i: this.include_strings) if (t.indexOf(i.toLowerCase()) < 0) return false;
for (String e: this.exclude_strings) if (t.indexOf(e.toLowerCase()) >= 0) return false;
return true;

@ -320,16 +320,16 @@ public final class QueryParams {
private final boolean matchesText(final String text) {
boolean ret = false;
QueryGoal.NormalizedWords words = new QueryGoal.NormalizedWords(Condenser.getWords(text, null).keySet());
if (!SetTools.anymatch(words, this.queryGoal.getExcludeWords())) {
if (!SetTools.anymatchByTest(this.queryGoal.getExcludeWords(), words)) {
ret = SetTools.totalInclusion(this.queryGoal.getIncludeWords(), words);
}
return ret;
}
protected static final boolean anymatch(final String text, final QueryGoal.NormalizedWords keywords) {
if (keywords == null || keywords.isEmpty()) return false;
protected static final boolean anymatch(final String text, final Iterator<String> keywords) {
if (keywords == null || !keywords.hasNext()) return false;
final SortedSet<String> textwords = (SortedSet<String>) Condenser.getWords(text, null).keySet();
return SetTools.anymatch(textwords, keywords);
return SetTools.anymatchByTest(keywords, textwords);
}
public SolrQuery solrQuery(final ContentDomain cd, final boolean getFacets, final boolean excludeintext_image) {
@ -343,7 +343,7 @@ public final class QueryParams {
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
if (this.queryGoal.getIncludeWords().size() == 0) return null;
if (this.queryGoal.getIncludeSize() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);
@ -368,7 +368,7 @@ public final class QueryParams {
if (!getFacets) this.cachedQuery.setFacet(false);
return this.cachedQuery;
}
if (this.queryGoal.getIncludeWords().size() == 0) return null;
if (this.queryGoal.getIncludeSize() == 0) return null;
// construct query
final SolrQuery params = getBasicParams(getFacets);

@ -1079,7 +1079,7 @@ public final class SearchEvent {
final String pagetitle = page.dc_title().toLowerCase();
// check exclusion
if (!this.query.getQueryGoal().getExcludeWords().isEmpty() &&
if (this.query.getQueryGoal().getExcludeSize() != 0 &&
((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeWords()))
|| (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeWords()))
|| (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeWords())))) {
@ -1295,7 +1295,7 @@ public final class SearchEvent {
// apply query-in-result matching
final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps);
final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps);
final Iterator<String> shi = this.query.getQueryGoal().getIncludeWords().iterator();
final Iterator<String> shi = this.query.getQueryGoal().getIncludeWords();
String queryword;
while (shi.hasNext()) {
queryword = shi.next();
@ -1641,7 +1641,7 @@ public final class SearchEvent {
if ( word.length() > 2
&& "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off"
.indexOf(word) < 0
&& !this.query.getQueryGoal().getIncludeWords().contains(word)
&& !this.query.getQueryGoal().containsInclude(word)
&& lettermatch.matcher(word).matches()
&& !Switchboard.badwords.contains(word)
&& !Switchboard.stopwords.contains(word) ) {

Loading…
Cancel
Save