added exclusion search

(you can now exclude words from the search results by prefixing them with '-', e.g. "apple -pie")

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3540 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent e4734a8b6b
commit 6e7340ef52

@ -205,11 +205,11 @@ public class DetailedSearch {
wdist = 1; wdist = 1;
} }
if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {} if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {}
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring); final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords // filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) { if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords); kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords);
} }
boolean authenticated = sb.adminAuthenticated(header) >= 2; boolean authenticated = sb.adminAuthenticated(header) >= 2;
@ -221,7 +221,7 @@ public class DetailedSearch {
return prop; return prop;
} }
final String delHash = post.get("deleteref", ""); final String delHash = post.get("deleteref", "");
sb.wordIndex.removeWordReferences(query, delHash); sb.wordIndex.removeWordReferences(query[0], delHash);
} }
// prepare search order // prepare search order
@ -239,7 +239,7 @@ public class DetailedSearch {
} }
// do the search // do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask, plasmaSearchQuery thisSearch = new plasmaSearchQuery(query[0], query[1], wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask,
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
"", 20, plasmaSearchQuery.catchall_constraint); "", 20, plasmaSearchQuery.catchall_constraint);
plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString()); plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString());

@ -46,13 +46,13 @@ public class snippet {
if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) { if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim(); querystring = querystring.substring(1, querystring.length() - 1).trim();
} }
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring); final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring);
Set queryHashes = plasmaCondenser.words2hashes(query); Set queryHashes = plasmaCondenser.words2hashes(query[0]);
// filter out stopwords // filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) { if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords); kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords);
} }
// find snippet // find snippet

@ -196,11 +196,11 @@ public class yacysearch {
serverObjects prop = new serverObjects(); serverObjects prop = new serverObjects();
if (post.get("cat", "href").equals("href")) { if (post.get("cat", "href").equals("href")) {
final TreeSet query = plasmaSearchQuery.cleanQuery(querystring); final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring);
// filter out stopwords // filter out stopwords
final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords);
if (filtered.size() > 0) { if (filtered.size() > 0) {
kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords); kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords);
} }
// if a minus-button was hit, remove a special reference first // if a minus-button was hit, remove a special reference first
@ -212,7 +212,7 @@ public class yacysearch {
// delete the index entry locally // delete the index entry locally
final String delHash = post.get("deleteref", ""); // urlhash final String delHash = post.get("deleteref", ""); // urlhash
sb.wordIndex.removeWordReferences(query, delHash); sb.wordIndex.removeWordReferences(query[0], delHash);
// make new news message with negative voting // make new news message with negative voting
HashMap map = new HashMap(); HashMap map = new HashMap();
@ -255,7 +255,8 @@ public class yacysearch {
// do the search // do the search
plasmaSearchQuery thisSearch = new plasmaSearchQuery( plasmaSearchQuery thisSearch = new plasmaSearchQuery(
query, query[0],
query[1],
maxDistance, maxDistance,
prefermask, prefermask,
contentdomCode, contentdomCode,

@ -397,9 +397,12 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
indexURLEntry page; indexURLEntry page;
Long preranking; Long preranking;
Object[] preorderEntry; Object[] preorderEntry;
indexURLEntry.Components comp;
String pagetitle, pageurl, pageauthor, exclw;
Iterator excli;
int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT);
try { try {
while (preorder.hasNext()) { ordering: while (preorder.hasNext()) {
if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break;
preorderEntry = preorder.next(); preorderEntry = preorder.next();
entry = (indexRWIEntry) preorderEntry[0]; entry = (indexRWIEntry) preorderEntry[0];
@ -407,12 +410,26 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
preranking = (Long) preorderEntry[1]; preranking = (Long) preorderEntry[1];
// find the url entry // find the url entry
page = urlStore.load(entry.urlHash(), entry); page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) { if (page != null) {
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && comp = page.comp();
pagetitle = comp.title().toLowerCase();
pageurl = comp.url().toString().toLowerCase();
pageauthor = comp.author().toLowerCase();
// check exclusion
excli = query.excludeWords.iterator();
while (excli.hasNext()) {
exclw = (String) excli.next();
if ((pagetitle.indexOf(exclw) >= 0) ||
(pageurl.indexOf(exclw) >= 0) ||
(pageauthor.indexOf(exclw) >= 0)) continue ordering;
}
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) && (query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(page.comp().title().startsWith("Index of")))) { (!(comp.title().startsWith("Index of")))) {
log.logFine("filtered out " + page.comp().url().toString()); log.logFine("filtered out " + comp.url().toString());
// filter out bad results // filter out bad results
Iterator wi = query.queryHashes.iterator(); Iterator wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash()); while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash());
@ -454,7 +471,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (rcLocal == null) return; if (rcLocal == null) return;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis()); plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis());
if (preorder.filteredCount()> query.wantedResults) preorder.remove(true, true); if (preorder.filteredCount() > query.wantedResults) preorder.remove(true, true);
// start url-fetch // start url-fetch
indexRWIEntry entry; indexRWIEntry entry;

@ -127,20 +127,19 @@ public final class plasmaSearchPreOrder {
indexRWIEntry iEntry; indexRWIEntry iEntry;
String hashpart; String hashpart;
boolean isWordRootURL; boolean isWordRootURL;
String querywords = query.words("");
while (i.hasNext()) { while (i.hasNext()) {
if (pageAcc.size() <= query.wantedResults) break; if (pageAcc.size() <= query.wantedResults) break;
entry = (Map.Entry) i.next(); entry = (Map.Entry) i.next();
iEntry = (indexRWIEntry) entry.getValue(); iEntry = (indexRWIEntry) entry.getValue();
hashpart = iEntry.urlHash().substring(6); hashpart = iEntry.urlHash().substring(6);
isWordRootURL = plasmaURL.isWordRootURL(iEntry.urlHash(), query.words("")); isWordRootURL = plasmaURL.isWordRootURL(iEntry.urlHash(), querywords);
if ((!(isWordRootURL)) && if (isWordRootURL) {
(((rootDomExt) && (rootDoms.contains(hashpart))) || rootDoms.add(hashpart);
((doubleDom) && (doubleDoms.contains(hashpart))))) {
i.remove();
if (pageAcc.size() <= query.wantedResults) return;
} else { } else {
if (isWordRootURL) { if (((rootDomExt) && (rootDoms.contains(hashpart))) ||
rootDoms.add(hashpart); ((doubleDom) && (doubleDoms.contains(hashpart)))) {
i.remove();
} }
} }
doubleDoms.add(hashpart); doubleDoms.add(hashpart);

@ -71,6 +71,7 @@ public final class plasmaSearchQuery {
public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______"); public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______");
public Set queryWords, queryHashes; public Set queryWords, queryHashes;
public Set excludeWords;
public int wantedResults; public int wantedResults;
public String prefer; public String prefer;
public int contentdom; public int contentdom;
@ -82,15 +83,16 @@ public final class plasmaSearchQuery {
public int maxDistance; public int maxDistance;
public kelondroBitfield constraint; public kelondroBitfield constraint;
public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int contentdom, public plasmaSearchQuery(Set queryWords, Set excludeWords, int maxDistance, String prefer, int contentdom,
int wantedResults, long maximumTime, String urlMask, int wantedResults, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets, int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint) { kelondroBitfield constraint) {
this.queryWords = queryWords; this.queryWords = queryWords;
this.queryHashes = plasmaCondenser.words2hashes(queryWords);
this.excludeWords = excludeWords;
this.maxDistance = maxDistance; this.maxDistance = maxDistance;
this.prefer = prefer; this.prefer = prefer;
this.contentdom = contentdom; this.contentdom = contentdom;
this.queryHashes = plasmaCondenser.words2hashes(queryWords);
this.wantedResults = wantedResults; this.wantedResults = wantedResults;
this.maximumTime = maximumTime; this.maximumTime = maximumTime;
this.urlMask = urlMask; this.urlMask = urlMask;
@ -104,6 +106,7 @@ public final class plasmaSearchQuery {
int wantedResults, long maximumTime, String urlMask, int wantedResults, long maximumTime, String urlMask,
kelondroBitfield constraint) { kelondroBitfield constraint) {
this.queryWords = null; this.queryWords = null;
this.excludeWords = null;
this.maxDistance = maxDistance; this.maxDistance = maxDistance;
this.prefer = prefer; this.prefer = prefer;
this.contentdom = contentdom; this.contentdom = contentdom;
@ -142,24 +145,37 @@ public final class plasmaSearchQuery {
return new String(sb); return new String(sb);
} }
public static TreeSet cleanQuery(String words) { public static TreeSet[] cleanQuery(String words) {
// returns two sets: a query set and a exclude set
if ((words == null) || (words.length() == 0)) return new TreeSet[]{new TreeSet(), new TreeSet()};
// convert Umlaute // convert Umlaute
words = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(words.toCharArray())).toString(); words = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(words.toCharArray())).toString();
// remove funny symbols // remove funny symbols
final String seps = "' .,:/-&"; final String seps = "'.,:/&";
words = words.toLowerCase().trim(); words = words.toLowerCase().trim();
int c; int c;
for (int i = 0; i < seps.length(); i++) { for (int i = 0; i < seps.length(); i++) {
if ((c = words.indexOf(seps.charAt(i))) >= 0) { words = words.substring(0, c) + (((c + 1) < words.length()) ? (" " + words.substring(c + 1)) : ""); } while ((c = words.indexOf(seps.charAt(i))) >= 0) { words = words.substring(0, c) + (((c + 1) < words.length()) ? (" " + words.substring(c + 1)) : ""); }
} }
// the string is clean now, but we must generate a set out of it // the string is clean now, but we must generate a set out of it
final TreeSet query = new TreeSet(kelondroNaturalOrder.naturalOrder); final TreeSet query = new TreeSet(kelondroNaturalOrder.naturalOrder);
if (words.length() == 0) return query; // split returns always one element final TreeSet exclude = new TreeSet(kelondroNaturalOrder.naturalOrder);
final String[] a = words.split(" "); final String[] a = words.split(" ");
for (int i = 0; i < a.length; i++) { query.add(a[i]); } for (int i = 0; i < a.length; i++) {
return query; if (a[i].startsWith("-")) {
exclude.add(a[i].substring(1));
} else {
while ((c = a[i].indexOf('-')) >= 0) {
query.add(a[i].substring(0, c));
a[i] = a[i].substring(c + 1);
}
if (a[i].length() > 0) query.add(a[i]);
}
}
return new TreeSet[]{query, exclude};
} }
public int size() { public int size() {

Loading…
Cancel
Save