|
|
|
@ -24,13 +24,17 @@ package net.yacy.search.query;
|
|
|
|
|
import java.io.UnsupportedEncodingException;
|
|
|
|
|
import java.net.URLEncoder;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.Collection;
|
|
|
|
|
import java.util.Locale;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.SortedSet;
|
|
|
|
|
import java.util.TreeSet;
|
|
|
|
|
|
|
|
|
|
import net.yacy.cora.document.WordCache;
|
|
|
|
|
import net.yacy.cora.federate.solr.Ranking;
|
|
|
|
|
import net.yacy.cora.federate.solr.SchemaDeclaration;
|
|
|
|
|
import net.yacy.cora.federate.solr.SolrType;
|
|
|
|
|
import net.yacy.cora.order.NaturalOrder;
|
|
|
|
|
import net.yacy.cora.storage.HandleSet;
|
|
|
|
|
import net.yacy.document.parser.html.AbstractScraper;
|
|
|
|
|
import net.yacy.document.parser.html.CharacterCoding;
|
|
|
|
@ -49,14 +53,43 @@ public class QueryGoal {
|
|
|
|
|
|
|
|
|
|
// the original query string handed to the constructor; the hash-based constructor sets it to null
private String query_original;
|
|
|
|
|
// word hashes; set to null by the string-based constructor and then computed on demand
// from the word sets by getIncludeHashes() / getExcludeHashes()
private HandleSet include_hashes, exclude_hashes;
|
|
|
|
|
// NOTE(review): this ArrayList declaration of include_words/exclude_words clashes with the
// NormalizedWords declaration right below; it looks like the pre-change line of the diff —
// confirm and keep only one of the two.
private final ArrayList<String> include_words, exclude_words;
|
|
|
|
|
// lower-cased words to be included in / excluded from the search result
private final NormalizedWords include_words, exclude_words;
|
|
|
|
|
// include/exclude strings that reproduce the original order of the search words and quotation
private final ArrayList<String> include_strings, exclude_strings;
|
|
|
|
|
|
|
|
|
|
public static class NormalizedWords extends TreeSet<String> {
|
|
|
|
|
|
|
|
|
|
private static final long serialVersionUID = -3050851079671868007L;
|
|
|
|
|
|
|
|
|
|
public NormalizedWords() {
|
|
|
|
|
super(NaturalOrder.naturalComparator);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public NormalizedWords(String[] rawWords) {
|
|
|
|
|
super(NaturalOrder.naturalComparator);
|
|
|
|
|
for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public NormalizedWords(Collection<String> rawWords) {
|
|
|
|
|
super(NaturalOrder.naturalComparator);
|
|
|
|
|
for (String word: rawWords) super.add(word.toLowerCase(Locale.ENGLISH));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public boolean add(String word) {
|
|
|
|
|
return super.add(word.toLowerCase(Locale.ENGLISH));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public boolean contains(Object word) {
|
|
|
|
|
if (!(word instanceof String)) return false;
|
|
|
|
|
return super.contains(((String) word).toLowerCase(Locale.ENGLISH));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes) {
|
|
|
|
|
this.query_original = null;
|
|
|
|
|
this.include_words = new ArrayList<String>();
|
|
|
|
|
this.exclude_words = new ArrayList<String>();
|
|
|
|
|
this.include_words = new NormalizedWords();
|
|
|
|
|
this.exclude_words = new NormalizedWords();
|
|
|
|
|
this.include_strings = new ArrayList<String>();
|
|
|
|
|
this.exclude_strings = new ArrayList<String>();
|
|
|
|
|
this.include_hashes = include_hashes;
|
|
|
|
@ -67,8 +100,8 @@ public class QueryGoal {
|
|
|
|
|
assert query_original != null;
|
|
|
|
|
assert query_words != null;
|
|
|
|
|
this.query_original = query_original;
|
|
|
|
|
this.include_words = new ArrayList<String>();
|
|
|
|
|
this.exclude_words = new ArrayList<String>();
|
|
|
|
|
this.include_words = new NormalizedWords();
|
|
|
|
|
this.exclude_words = new NormalizedWords();
|
|
|
|
|
this.include_strings = new ArrayList<String>();
|
|
|
|
|
this.exclude_strings = new ArrayList<String>();
|
|
|
|
|
|
|
|
|
@ -88,8 +121,8 @@ public class QueryGoal {
|
|
|
|
|
for (String s: this.include_strings) parseQuery(s, this.include_words, this.include_words);
|
|
|
|
|
for (String s: this.exclude_strings) parseQuery(s, this.exclude_words, this.exclude_words);
|
|
|
|
|
|
|
|
|
|
WordCache.learn(this.include_strings);
|
|
|
|
|
WordCache.learn(this.exclude_strings);
|
|
|
|
|
WordCache.learn(this.include_words);
|
|
|
|
|
WordCache.learn(this.exclude_words);
|
|
|
|
|
|
|
|
|
|
this.include_hashes = null;
|
|
|
|
|
this.exclude_hashes = null;
|
|
|
|
@ -107,7 +140,7 @@ public class QueryGoal {
|
|
|
|
|
* sq = '\''
|
|
|
|
|
* dq = '"'
|
|
|
|
|
*/
|
|
|
|
|
private static void parseQuery(String s, ArrayList<String> include_string, ArrayList<String> exclude_string) {
|
|
|
|
|
private static void parseQuery(String s, Collection<String> include_string, Collection<String> exclude_string) {
|
|
|
|
|
while (s.length() > 0) {
|
|
|
|
|
// parse query
|
|
|
|
|
int p = 0;
|
|
|
|
@ -155,15 +188,51 @@ public class QueryGoal {
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return a set of hashes of words to be included in the search result.
|
|
|
|
|
* if possible, use getIncludeWords instead
|
|
|
|
|
*/
|
|
|
|
|
public HandleSet getIncludeHashes() {
|
|
|
|
|
if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words);
|
|
|
|
|
return include_hashes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return a set of hashes of words to be excluded in the search result
|
|
|
|
|
* if possible, use getExcludeWords instead
|
|
|
|
|
*/
|
|
|
|
|
public HandleSet getExcludeHashes() {
|
|
|
|
|
if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words);
|
|
|
|
|
return exclude_hashes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return a set of words to be included in the search result
|
|
|
|
|
*/
|
|
|
|
|
public NormalizedWords getIncludeWords() {
|
|
|
|
|
return include_words;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return a set of words to be excluded in the search result
|
|
|
|
|
*/
|
|
|
|
|
public NormalizedWords getExcludeWords() {
|
|
|
|
|
return exclude_words;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return a list of include strings which reproduces the original order of the search words and quotation
|
|
|
|
|
*/
|
|
|
|
|
public ArrayList<String> getIncludeStrings() {
|
|
|
|
|
return include_strings;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return a list of exclude strings which reproduces the original order of the search words and quotation
|
|
|
|
|
*/
|
|
|
|
|
public ArrayList<String> getExcludeStrings() {
|
|
|
|
|
return exclude_strings;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* the include string may be useful (and better) for highlight/snippet computation
|
|
|
|
@ -175,14 +244,6 @@ public class QueryGoal {
|
|
|
|
|
for (String s: this.include_strings) sb.append(s).append(' ');
|
|
|
|
|
return sb.toString().substring(0, sb.length() - 1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public ArrayList<String> getIncludeStrings() {
|
|
|
|
|
return include_strings;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public ArrayList<String> getExcludeStrings() {
|
|
|
|
|
return exclude_strings;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public boolean isCatchall() {
|
|
|
|
|
if (include_strings.size() != 1 || exclude_strings.size() != 0) return false;
|
|
|
|
@ -205,6 +266,10 @@ public class QueryGoal {
|
|
|
|
|
public void filterOut(final SortedSet<String> blueList) {
|
|
|
|
|
// filter out words that appear in this set
|
|
|
|
|
// this is applied to the queryHashes
|
|
|
|
|
for (String word: blueList) {
|
|
|
|
|
this.include_words.remove(word);
|
|
|
|
|
this.include_strings.remove(word);
|
|
|
|
|
}
|
|
|
|
|
final HandleSet blues = Word.words2hashesHandles(blueList);
|
|
|
|
|
for (final byte[] b: blues) this.include_hashes.remove(b);
|
|
|
|
|
}
|
|
|
|
|