added exclusion search

(you can now exclude words from the search results by prefixing them with '-')

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3540 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent e4734a8b6b
commit 6e7340ef52

@ -205,11 +205,11 @@ public class DetailedSearch {
wdist = 1;
}
if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords);
}
boolean authenticated = sb.adminAuthenticated(header) >= 2;
@ -221,7 +221,7 @@ public class DetailedSearch {
return prop;
}
final String delHash = post.get("deleteref", "");
sb.wordIndex.removeWordReferences(query, delHash);
sb.wordIndex.removeWordReferences(query[0], delHash);
}
// prepare search order
@ -239,7 +239,7 @@ public class DetailedSearch {
}
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask,
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query[0], query[1], wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20, plasmaSearchQuery.catchall_constraint);
plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString());

@ -46,13 +46,13 @@ public class snippet {
if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
Set queryHashes = plasmaCondenser.words2hashes(query);
final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring);
Set queryHashes = plasmaCondenser.words2hashes(query[0]);
// filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords);
}
// find snippet

@ -196,11 +196,11 @@ public class yacysearch {
serverObjects prop = new serverObjects();
if (post.get("cat", "href").equals("href")) {
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring);
final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords);
final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords);
kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords);
}
// if a minus-button was hit, remove a special reference first
@ -212,7 +212,7 @@ public class yacysearch {
// delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash
sb.wordIndex.removeWordReferences(query, delHash);
sb.wordIndex.removeWordReferences(query[0], delHash);
// make new news message with negative voting
HashMap map = new HashMap();
@ -255,7 +255,8 @@ public class yacysearch {
// do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(
query,
query[0],
query[1],
maxDistance,
prefermask,
contentdomCode,

@ -397,9 +397,12 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
indexURLEntry page;
Long preranking;
Object[] preorderEntry;
indexURLEntry.Components comp;
String pagetitle, pageurl, pageauthor, exclw;
Iterator excli;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try {
while (preorder.hasNext()) {
ordering: while (preorder.hasNext()) {
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
preorderEntry = preorder.next();
entry = (indexRWIEntry) preorderEntry[0];
@ -407,12 +410,26 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
preranking = (Long) preorderEntry[1];
// find the url entry
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) {
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
comp = page.comp();
pagetitle = comp.title().toLowerCase();
pageurl = comp.url().toString().toLowerCase();
pageauthor = comp.author().toLowerCase();
// check exclusion
excli = query.excludeWords.iterator();
while (excli.hasNext()) {
exclw = (String) excli.next();
if ((pagetitle.indexOf(exclw) >= 0) ||
(pageurl.indexOf(exclw) >= 0) ||
(pageauthor.indexOf(exclw) >= 0)) continue ordering;
}
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(page.comp().title().startsWith("Index of")))) {
log.logFine("filtered out " + page.comp().url().toString());
(!(comp.title().startsWith("Index of")))) {
log.logFine("filtered out " + comp.url().toString());
// filter out bad results
Iterator wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash());
@ -454,7 +471,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (rcLocal == null) return;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis());
if (preorder.filteredCount()> query.wantedResults) preorder.remove(true, true);
if (preorder.filteredCount() > query.wantedResults) preorder.remove(true, true);
// start url-fetch
indexRWIEntry entry;

@ -127,20 +127,19 @@ public final class plasmaSearchPreOrder {
indexRWIEntry iEntry;
String hashpart;
boolean isWordRootURL;
String querywords = query.words("");
while (i.hasNext()) {
if (pageAcc.size() <= query.wantedResults) break;
entry = (Map.Entry) i.next();
iEntry = (indexRWIEntry) entry.getValue();
hashpart = iEntry.urlHash().substring(6);
isWordRootURL = plasmaURL.isWordRootURL(iEntry.urlHash(), query.words(""));
if ((!(isWordRootURL)) &&
(((rootDomExt) && (rootDoms.contains(hashpart))) ||
((doubleDom) && (doubleDoms.contains(hashpart))))) {
i.remove();
if (pageAcc.size() <= query.wantedResults) return;
isWordRootURL = plasmaURL.isWordRootURL(iEntry.urlHash(), querywords);
if (isWordRootURL) {
rootDoms.add(hashpart);
} else {
if (isWordRootURL) {
rootDoms.add(hashpart);
if (((rootDomExt) && (rootDoms.contains(hashpart))) ||
((doubleDom) && (doubleDoms.contains(hashpart)))) {
i.remove();
}
}
doubleDoms.add(hashpart);

@ -71,6 +71,7 @@ public final class plasmaSearchQuery {
public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______");
public Set queryWords, queryHashes;
public Set excludeWords;
public int wantedResults;
public String prefer;
public int contentdom;
@ -82,15 +83,16 @@ public final class plasmaSearchQuery {
public int maxDistance;
public kelondroBitfield constraint;
public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int contentdom,
public plasmaSearchQuery(Set queryWords, Set excludeWords, int maxDistance, String prefer, int contentdom,
int wantedResults, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint) {
this.queryWords = queryWords;
this.queryHashes = plasmaCondenser.words2hashes(queryWords);
this.excludeWords = excludeWords;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.contentdom = contentdom;
this.queryHashes = plasmaCondenser.words2hashes(queryWords);
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
this.urlMask = urlMask;
@ -104,6 +106,7 @@ public final class plasmaSearchQuery {
int wantedResults, long maximumTime, String urlMask,
kelondroBitfield constraint) {
this.queryWords = null;
this.excludeWords = null;
this.maxDistance = maxDistance;
this.prefer = prefer;
this.contentdom = contentdom;
@ -142,24 +145,37 @@ public final class plasmaSearchQuery {
return new String(sb);
}
public static TreeSet cleanQuery(String words) {
public static TreeSet[] cleanQuery(String words) {
// returns two sets: a query set and an exclude set
if ((words == null) || (words.length() == 0)) return new TreeSet[]{new TreeSet(), new TreeSet()};
// convert Umlaute
words = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(words.toCharArray())).toString();
// remove funny symbols
final String seps = "' .,:/-&";
final String seps = "'.,:/&";
words = words.toLowerCase().trim();
int c;
for (int i = 0; i < seps.length(); i++) {
if ((c = words.indexOf(seps.charAt(i))) >= 0) { words = words.substring(0, c) + (((c + 1) < words.length()) ? (" " + words.substring(c + 1)) : ""); }
while ((c = words.indexOf(seps.charAt(i))) >= 0) { words = words.substring(0, c) + (((c + 1) < words.length()) ? (" " + words.substring(c + 1)) : ""); }
}
// the string is clean now, but we must generate a set out of it
final TreeSet query = new TreeSet(kelondroNaturalOrder.naturalOrder);
if (words.length() == 0) return query; // split returns always one element
final TreeSet exclude = new TreeSet(kelondroNaturalOrder.naturalOrder);
final String[] a = words.split(" ");
for (int i = 0; i < a.length; i++) { query.add(a[i]); }
return query;
for (int i = 0; i < a.length; i++) {
if (a[i].startsWith("-")) {
exclude.add(a[i].substring(1));
} else {
while ((c = a[i].indexOf('-')) >= 0) {
query.add(a[i].substring(0, c));
a[i] = a[i].substring(c + 1);
}
if (a[i].length() > 0) query.add(a[i]);
}
}
return new TreeSet[]{query, exclude};
}
public int size() {

Loading…
Cancel
Save