diff --git a/htroot/DetailedSearch.java b/htroot/DetailedSearch.java index faac9f2c5..ae362e0ab 100644 --- a/htroot/DetailedSearch.java +++ b/htroot/DetailedSearch.java @@ -205,11 +205,11 @@ public class DetailedSearch { wdist = 1; } if (sb.facilityDB != null) try { sb.facilityDB.update("zeitgeist", querystring, post); } catch (Exception e) {} - final TreeSet query = plasmaSearchQuery.cleanQuery(querystring); + final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring); // filter out stopwords - final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); + final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords); if (filtered.size() > 0) { - kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords); + kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords); } boolean authenticated = sb.adminAuthenticated(header) >= 2; @@ -221,7 +221,7 @@ public class DetailedSearch { return prop; } final String delHash = post.get("deleteref", ""); - sb.wordIndex.removeWordReferences(query, delHash); + sb.wordIndex.removeWordReferences(query[0], delHash); } // prepare search order @@ -239,7 +239,7 @@ public class DetailedSearch { } // do the search - plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask, + plasmaSearchQuery thisSearch = new plasmaSearchQuery(query[0], query[1], wdist, "", plasmaSearchQuery.CONTENTDOM_TEXT, count, searchtime, urlmask, ((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL, "", 20, plasmaSearchQuery.catchall_constraint); plasmaSearchRankingProfile localRanking = new plasmaSearchRankingProfile("local", post.toString()); diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java index a70858d05..c8836afaf 100644 --- a/htroot/xml/snippet.java +++ b/htroot/xml/snippet.java @@ -46,13 +46,13 @@ public class snippet { if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) { querystring = querystring.substring(1, querystring.length() - 1).trim(); } - final TreeSet query = plasmaSearchQuery.cleanQuery(querystring); - Set queryHashes = plasmaCondenser.words2hashes(query); + final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring); + Set queryHashes = plasmaCondenser.words2hashes(query[0]); // filter out stopwords - final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); + final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords); if (filtered.size() > 0) { - kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords); + kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords); } // find snippet diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index acabfa35c..a21b86c32 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -196,11 +196,11 @@ public class yacysearch { serverObjects prop = new serverObjects(); if (post.get("cat", "href").equals("href")) { - final TreeSet query = plasmaSearchQuery.cleanQuery(querystring); + final TreeSet[] query = plasmaSearchQuery.cleanQuery(querystring); // filter out stopwords - final TreeSet filtered = kelondroMSetTools.joinConstructive(query, plasmaSwitchboard.stopwords); + final TreeSet filtered = kelondroMSetTools.joinConstructive(query[0], plasmaSwitchboard.stopwords); if (filtered.size() > 0) { - kelondroMSetTools.excludeDestructive(query, plasmaSwitchboard.stopwords); + kelondroMSetTools.excludeDestructive(query[0], plasmaSwitchboard.stopwords); } // if a minus-button was hit, remove a special reference first @@ -212,7 +212,7 @@ public class yacysearch { // delete the index entry locally final String delHash = post.get("deleteref", ""); // urlhash - sb.wordIndex.removeWordReferences(query, delHash); + sb.wordIndex.removeWordReferences(query[0], delHash); // make new news message with negative voting HashMap map = new HashMap(); @@ -255,7 +255,8 @@ public class yacysearch { // do the search plasmaSearchQuery thisSearch = new plasmaSearchQuery( - query, + query[0], + query[1], maxDistance, prefermask, contentdomCode, diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 6b54fa357..6037af54d 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -397,9 +397,12 @@ public final class plasmaSearchEvent extends Thread implements Runnable { indexURLEntry page; Long preranking; Object[] preorderEntry; + indexURLEntry.Components comp; + String pagetitle, pageurl, pageauthor, exclw; + Iterator excli; int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); try { - while (preorder.hasNext()) { + ordering: while (preorder.hasNext()) { if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; preorderEntry = preorder.next(); entry = (indexRWIEntry) preorderEntry[0]; @@ -407,12 +410,26 @@ public final class plasmaSearchEvent extends Thread implements Runnable { preranking = (Long) preorderEntry[1]; // find the url entry page = urlStore.load(entry.urlHash(), entry); - // add a result if (page != null) { - if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && + comp = page.comp(); + pagetitle = comp.title().toLowerCase(); + pageurl = comp.url().toString().toLowerCase(); + pageauthor = comp.author().toLowerCase(); + + // check exclusion + excli = query.excludeWords.iterator(); + while (excli.hasNext()) { + exclw = (String) excli.next(); + if ((pagetitle.indexOf(exclw) >= 0) || + (pageurl.indexOf(exclw) >= 0) || + (pageauthor.indexOf(exclw) >= 0)) continue ordering; + } + + // check constraints + if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (query.constraint.get(plasmaCondenser.flag_cat_indexof)) && - (!(page.comp().title().startsWith("Index of")))) { - log.logFine("filtered out " + page.comp().url().toString()); + (!(comp.title().startsWith("Index of")))) { + log.logFine("filtered out " + comp.url().toString()); // filter out bad results Iterator wi = query.queryHashes.iterator(); while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash()); @@ -454,7 +471,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { if (rcLocal == null) return; plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis()); - if (preorder.filteredCount()> query.wantedResults) preorder.remove(true, true); + if (preorder.filteredCount() > query.wantedResults) preorder.remove(true, true); // start url-fetch indexRWIEntry entry; diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java index 7e5323216..441a38489 100644 --- a/source/de/anomic/plasma/plasmaSearchPreOrder.java +++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java @@ -127,20 +127,19 @@ public final class plasmaSearchPreOrder { indexRWIEntry iEntry; String hashpart; boolean isWordRootURL; + String querywords = query.words(""); while (i.hasNext()) { if (pageAcc.size() <= query.wantedResults) break; entry = (Map.Entry) i.next(); iEntry = (indexRWIEntry) entry.getValue(); hashpart = iEntry.urlHash().substring(6); - isWordRootURL = plasmaURL.isWordRootURL(iEntry.urlHash(), query.words("")); - if ((!(isWordRootURL)) && - (((rootDomExt) && (rootDoms.contains(hashpart))) || - ((doubleDom) && (doubleDoms.contains(hashpart))))) { - i.remove(); - if (pageAcc.size() <= query.wantedResults) return; + isWordRootURL = plasmaURL.isWordRootURL(iEntry.urlHash(), querywords); + if (isWordRootURL) { + rootDoms.add(hashpart); } else { - if (isWordRootURL) { - rootDoms.add(hashpart); + if (((rootDomExt) && (rootDoms.contains(hashpart))) || + ((doubleDom) && (doubleDoms.contains(hashpart)))) { + i.remove(); } } doubleDoms.add(hashpart); diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index a92895811..e8153feea 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -71,6 +71,7 @@ public final class plasmaSearchQuery { public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______"); public Set queryWords, queryHashes; + public Set excludeWords; public int wantedResults; public String prefer; public int contentdom; @@ -82,15 +83,16 @@ public final class plasmaSearchQuery { public int maxDistance; public kelondroBitfield constraint; - public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int contentdom, + public plasmaSearchQuery(Set queryWords, Set excludeWords, int maxDistance, String prefer, int contentdom, int wantedResults, long maximumTime, String urlMask, int domType, String domGroupName, int domMaxTargets, kelondroBitfield constraint) { this.queryWords = queryWords; + this.queryHashes = plasmaCondenser.words2hashes(queryWords); + this.excludeWords = excludeWords; this.maxDistance = maxDistance; this.prefer = prefer; this.contentdom = contentdom; - this.queryHashes = plasmaCondenser.words2hashes(queryWords); this.wantedResults = wantedResults; this.maximumTime = maximumTime; this.urlMask = urlMask; @@ -104,6 +106,7 @@ public final class plasmaSearchQuery { int wantedResults, long maximumTime, String urlMask, kelondroBitfield constraint) { this.queryWords = null; + this.excludeWords = null; this.maxDistance = maxDistance; this.prefer = prefer; this.contentdom = contentdom; @@ -142,24 +145,37 @@ public final class plasmaSearchQuery { return new String(sb); } - public static TreeSet cleanQuery(String words) { + public static TreeSet[] cleanQuery(String words) { + // returns two sets: a query set and a exclude set + if ((words == null) || (words.length() == 0)) return new TreeSet[]{new TreeSet(), new TreeSet()}; + // convert Umlaute words = htmlFilterAbstractScraper.convertUmlaute(new serverCharBuffer(words.toCharArray())).toString(); // remove funny symbols - final String seps = "' .,:/-&"; + final String seps = "'.,:/&"; words = words.toLowerCase().trim(); int c; for (int i = 0; i < seps.length(); i++) { - if ((c = words.indexOf(seps.charAt(i))) >= 0) { words = words.substring(0, c) + (((c + 1) < words.length()) ? (" " + words.substring(c + 1)) : ""); } + while ((c = words.indexOf(seps.charAt(i))) >= 0) { words = words.substring(0, c) + (((c + 1) < words.length()) ? (" " + words.substring(c + 1)) : ""); } } // the string is clean now, but we must generate a set out of it final TreeSet query = new TreeSet(kelondroNaturalOrder.naturalOrder); - if (words.length() == 0) return query; // split returns always one element + final TreeSet exclude = new TreeSet(kelondroNaturalOrder.naturalOrder); final String[] a = words.split(" "); - for (int i = 0; i < a.length; i++) { query.add(a[i]); } - return query; + for (int i = 0; i < a.length; i++) { + if (a[i].startsWith("-")) { + exclude.add(a[i].substring(1)); + } else { + while ((c = a[i].indexOf('-')) >= 0) { + query.add(a[i].substring(0, c)); + a[i] = a[i].substring(c + 1); + } + if (a[i].length() > 0) query.add(a[i]); + } + } + return new TreeSet[]{query, exclude}; } public int size() {