From e2f8f263e8885233c53f0755a0bb12602d0ef8a2 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 1 Feb 2012 18:13:31 +0100 Subject: [PATCH] changed storage of search words: keep order --- htroot/api/timeline.java | 25 +++--- htroot/yacysearch.java | 6 +- htroot/yacysearchitem.java | 4 +- source/net/yacy/kelondro/data/word/Word.java | 9 ++- source/net/yacy/kelondro/util/SetTools.java | 78 +++++++++---------- source/net/yacy/search/query/QueryParams.java | 27 ++++--- 6 files changed, 74 insertions(+), 75 deletions(-) diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index 43d59bc3b..bb269dcbb 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -9,7 +9,7 @@ // $LastChangedBy: orbiter $ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -24,9 +24,9 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import java.util.Collection; import java.util.Date; import java.util.Iterator; -import java.util.TreeSet; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.protocol.RequestHeader; @@ -43,7 +43,6 @@ import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; import net.yacy.search.index.Segments; import net.yacy.search.query.QueryParams; - import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -52,18 +51,18 @@ public final class timeline { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; - + final serverObjects prop = new serverObjects(); if ((post == null) || (env == null)) return prop; final boolean authenticated = sb.adminAuthenticated(header) >= 2; - + Segment segment = null; if (post.containsKey("segment") && authenticated) { segment = sb.indexSegments.segment(post.get("segment")); } else { segment = sb.indexSegments.segment(Segments.Process.PUBLIC); } - + final String querystring = post.get("query", ""); // a string of word hashes that shall be searched and combined final int count = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); @@ -75,22 +74,22 @@ public final class timeline { language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent); if (language == null) language = "en"; } - final TreeSet[] query = QueryParams.cleanQuery(querystring); // converts also umlaute + final Collection[] query = QueryParams.cleanQuery(querystring); // converts also umlaute HandleSet q = Word.words2hashesHandles(query[0]); - + // tell all threads to do nothing for a specific time sb.intermissionAllThreads(3000); // prepare search final long timestamp = System.currentTimeMillis(); - + // prepare an abstract result int indexabstractContainercount = 0; int joincount = 0; // retrieve index containers //yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links"); - + // get the index container with the result vector TermSearch search = null; try { @@ -99,7 +98,7 @@ public final class timeline { Log.logException(e); } ReferenceContainer index = search.joined(); - + Iterator i = index.entries(); WordReference entry; int c = 0; @@ -117,14 +116,14 @@ public final class timeline { c++; } prop.put("event", c); - + // log Network.log.logInfo("EXIT TIMELINE SEARCH: " + QueryParams.anonymizedQueryHashes(q) + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + indexabstractContainercount + " index abstracts, " + (System.currentTimeMillis() - timestamp) + " milliseconds"); - + return prop; } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index f7b659cf6..6f1a7835a 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -602,14 +602,14 @@ public class yacysearch { (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", ""); // the query - final TreeSet[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute + final Collection[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? query.length - 1 : Integer.MAX_VALUE; // filter out stopwords - final SortedSet filtered = SetTools.joinConstructive(query[0], Switchboard.stopwords); + final SortedSet filtered = SetTools.joinConstructiveByTest(query[0], Switchboard.stopwords); if ( !filtered.isEmpty() ) { - SetTools.excludeDestructive(query[0], Switchboard.stopwords); + SetTools.excludeDestructiveByTestSmallInLarge(query[0], Switchboard.stopwords); } // if a minus-button was hit, remove a special reference first diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 089556392..d6a5dce32 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -25,8 +25,8 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.net.MalformedURLException; +import java.util.Collection; import java.util.List; -import java.util.Set; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; @@ -165,7 +165,7 @@ public class yacysearchitem { prop.putHTML("content_publisher", result.publisher()); prop.putHTML("content_creator", result.creator());// author prop.putHTML("content_subject", result.subject()); - final Set[] query = theQuery.queryWords(); + final Collection[] query = theQuery.queryWords(); final StringBuilder s = new StringBuilder(query[0].size() * 20); for (final String t: query[0]) { s.append('+').append(t); diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index b4986d0dd..714bbbd6b 100644 --- a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -26,6 +26,7 @@ package net.yacy.kelondro.data.word; +import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.Locale; @@ -118,11 +119,11 @@ public class Word { private final static byte lowByte = Base64Order.alpha_enhanced[0]; private final static byte highByte = Base64Order.alpha_enhanced[Base64Order.alpha_enhanced.length - 1]; - + public static boolean isPrivate(byte[] hash) { return hash[0] == highByte && hash[1] == highByte && hash[2] == highByte && hash[3] == highByte && hash[4] == highByte; } - + // create a word hash public static final byte[] word2hash(final String word) { final String wordlc = word.toLowerCase(Locale.ENGLISH); @@ -148,7 +149,7 @@ public class Word { public final static byte PRIVATE_TYPE_COPY = 'C'; // used for a private local copy of the index public final static byte PRIVATE_TYPE_PHONETIC = 'K'; // used for ColognePhonetics - + public static final byte[] hash2private(final byte[] hash, byte privateType) { byte[] p = new byte[commonHashLength]; p[0] = highByte; p[1] = highByte; p[2] = highByte; ; p[3] = highByte; ; p[4] = highByte; p[5] = privateType; @@ -156,7 +157,7 @@ public class Word { return p; } - public static final HandleSet words2hashesHandles(final Set words) { + public static final HandleSet words2hashesHandles(final Collection words) { final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size()); for (final String word: words) try { diff --git a/source/net/yacy/kelondro/util/SetTools.java b/source/net/yacy/kelondro/util/SetTools.java index a08dc2e0c..6f34a9b74 100644 --- a/source/net/yacy/kelondro/util/SetTools.java +++ b/source/net/yacy/kelondro/util/SetTools.java @@ -49,12 +49,12 @@ import net.yacy.kelondro.logging.Log; public final class SetTools { - + //public static Comparator fastStringComparator = fastStringComparator(true); // ------------------------------------------------------------------------------------------------ // helper methods - + public static int log2a(int x) { // this computes 1 + log2 // it is the number of bits in x, not the logarithm by 2 @@ -72,10 +72,10 @@ public final class SetTools { // - join by pairwise enumeration // - join by iterative tests (where we distinguish left-right and right-left tests) - + public static SortedMap joinConstructive(final Collection> maps, final boolean concatStrings) { // this joins all TreeMap(s) contained in maps - + // first order entities by their size final SortedMap> orderMap = new TreeMap>(); SortedMap singleMap; @@ -84,18 +84,18 @@ public final class SetTools { while (i.hasNext()) { // get next entity: singleMap = i.next(); - + // check result if ((singleMap == null) || (singleMap.isEmpty())) return new TreeMap(); - + // store result in order of result size orderMap.put(Long.valueOf(singleMap.size() * 1000 + count), singleMap); count++; } - + // check if there is any result if (orderMap.isEmpty()) return new TreeMap(); - + // we now must pairwise build up a conjunction of these maps Long k = orderMap.firstKey(); // the smallest, which means, the one with the least entries SortedMap mapA, mapB, joinResult = orderMap.remove(k); @@ -114,7 +114,7 @@ public final class SetTools { if (joinResult.isEmpty()) return new TreeMap(); return joinResult; } - + public static SortedMap joinConstructive(final SortedMap map1, final SortedMap map2, final boolean concatStrings) { // comparators must be equal if ((map1 == null) || (map2 == null)) return null; @@ -134,7 +134,7 @@ public final class SetTools { } return joinConstructiveByEnumeration(map1, map2, concatStrings); } - + @SuppressWarnings("unchecked") private static SortedMap joinConstructiveByTest(final SortedMap small, final SortedMap large, final boolean concatStrings) { final SortedMap result = new TreeMap(large.comparator()); @@ -198,7 +198,7 @@ public final class SetTools { } return result; } - + // now the same for set-set public static SortedSet joinConstructive(final SortedSet set1, final SortedSet set2) { // comparators must be equal @@ -220,9 +220,9 @@ public final class SetTools { return joinConstructiveByEnumeration(set1, set2); } - private static SortedSet joinConstructiveByTest(final SortedSet small, final SortedSet large) { + public static SortedSet joinConstructiveByTest(final Collection small, final SortedSet large) { final Iterator mi = small.iterator(); - final SortedSet result = new TreeSet(small.comparator()); + final SortedSet result = new TreeSet(large.comparator()); A o; while (mi.hasNext()) { o = mi.next(); @@ -256,7 +256,7 @@ public final class SetTools { } return result; } - + /** * test if one set is totally included in another set * @param @@ -269,8 +269,8 @@ public final class SetTools { if (!large.contains(o)) return false; } return true; - } - + } + /** * test if one set is totally included in another set * @param small @@ -282,8 +282,8 @@ public final class SetTools { if (!large.has(handle)) return false; } return true; - } - + } + /** * test if the intersection of two sets is not empty * @param @@ -379,7 +379,7 @@ public final class SetTools { } return false; } - + private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) { // implement pairwise enumeration final Comparator comp = set1.comparator(); @@ -402,7 +402,7 @@ public final class SetTools { } return false; } - + // ------------------------------------------------------------------------------------------------ // exclude @@ -416,7 +416,7 @@ public final class SetTools { return excludeConstructiveByTestMapInSet(map, set); // return excludeConstructiveByEnumeration(map, set); } - + private static TreeMap excludeConstructiveByTestMapInSet(final TreeMap map, final Set set) { final TreeMap result = new TreeMap(map.comparator()); A o; @@ -427,7 +427,7 @@ public final class SetTools { return result; } */ - + public static void excludeDestructive(final Map map, final Set set) { // comparators must be equal if (map == null) return; @@ -440,40 +440,40 @@ public final class SetTools { else excludeDestructiveByTestSetInMap(map, set); } - + private static void excludeDestructiveByTestMapInSet(final Map map, final Set set) { final Iterator mi = map.keySet().iterator(); while (mi.hasNext()) if (set.contains(mi.next())) mi.remove(); } - + private static void excludeDestructiveByTestSetInMap(final Map map, final Set set) { final Iterator si = set.iterator(); while (si.hasNext()) map.remove(si.next()); } - + // and the same again with set-set public static void excludeDestructive(final Set set1, final Set set2) { if (set1 == null) return; if (set2 == null) return; assert !(set1 instanceof SortedSet && set2 instanceof SortedSet) || ((SortedSet) set1).comparator() == ((SortedSet) set2).comparator(); if (set1.isEmpty() || set2.isEmpty()) return; - + if (set1.size() < set2.size()) excludeDestructiveByTestSmallInLarge(set1, set2); else excludeDestructiveByTestLargeInSmall(set1, set2); } - - private static void excludeDestructiveByTestSmallInLarge(final Set small, final Set large) { + + public static void excludeDestructiveByTestSmallInLarge(final Collection small, final Set large) { final Iterator mi = small.iterator(); while (mi.hasNext()) if (large.contains(mi.next())) mi.remove(); } - - private static void excludeDestructiveByTestLargeInSmall(final Set large, final Set small) { + + public static void excludeDestructiveByTestLargeInSmall(final Set large, final Collection small) { final Iterator si = small.iterator(); while (si.hasNext()) large.remove(si.next()); } - + // ------------------------------------------------------------------------------------------------ public static SortedMap loadMap(final String filename, final String sep) { @@ -488,13 +488,13 @@ public final class SetTools { if ((line.length() > 0 && line.charAt(0) != '#') && ((pos = line.indexOf(sep)) > 0)) map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim()); } - } catch (final IOException e) { + } catch (final IOException e) { } finally { if (br != null) try { br.close(); } catch (final Exception e) {} } return map; } - + public static SortedMap> loadMapMultiValsPerKey(final String filename, final String sep) { final SortedMap> map = new TreeMap>(); BufferedReader br = null; @@ -511,17 +511,17 @@ public final class SetTools { map.get(key).add(value); } } - } catch (final IOException e) { + } catch (final IOException e) { } finally { if (br != null) try { br.close(); } catch (final Exception e) {} } return map; } - + public static SortedSet loadList(final File file, final Comparator c) { final SortedSet list = new TreeSet(c); if (!(file.exists())) return list; - + BufferedReader br = null; try { br = new BufferedReader(new InputStreamReader(new FileInputStream(file))); @@ -531,7 +531,7 @@ public final class SetTools { if (line.length() > 0 && line.charAt(0) != '#') list.add(line.trim().toLowerCase()); } br.close(); - } catch (final IOException e) { + } catch (final IOException e) { } finally { if (br != null) try{br.close();}catch(final Exception e){} } @@ -547,7 +547,7 @@ public final class SetTools { } return sb.toString(); } - + public static String setToString(final Set set, final char separator) { final Iterator i = set.iterator(); final StringBuilder sb = new StringBuilder(set.size() * 7); @@ -560,7 +560,7 @@ public final class SetTools { // ------------------------------------------------------------------------------------------------ - + public static void main(final String[] args) { final SortedMap m = new TreeMap(); final SortedMap s = new TreeMap(); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 2e759cd70..052f70062 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -35,7 +35,6 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.SortedSet; -import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; @@ -56,7 +55,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Bitfield; -import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.SetTools; import net.yacy.peers.Seed; import net.yacy.search.index.Segment; @@ -162,7 +160,7 @@ public final class QueryParams { } } else { this.queryString = queryString; - final TreeSet[] cq = cleanQuery(queryString); + final Collection[] cq = cleanQuery(queryString); this.queryHashes = Word.words2hashesHandles(cq[0]); this.excludeHashes = Word.words2hashesHandles(cq[1]); this.fullqueryHashes = Word.words2hashesHandles(cq[2]); @@ -378,11 +376,11 @@ public final class QueryParams { private static String seps = "'.,/&_"; static {seps += '"';} @SuppressWarnings("unchecked") - public static TreeSet[] cleanQuery(String querystring) { + public static Collection[] cleanQuery(String querystring) { // returns three sets: a query set, a exclude set and a full query set - final TreeSet query = new TreeSet(NaturalOrder.naturalComparator); - final TreeSet exclude = new TreeSet(NaturalOrder.naturalComparator); - final TreeSet fullquery = new TreeSet(NaturalOrder.naturalComparator); + final Collection query = new ArrayList(); + final Collection exclude = new ArrayList(); + final Collection fullquery = new ArrayList(); if ((querystring != null) && (!querystring.isEmpty())) { @@ -401,22 +399,23 @@ public final class QueryParams { final String[] queries = querystring.split(" "); for (String quer : queries) { if (quer.startsWith("-")) { - exclude.add(quer.substring(1)); + String x = quer.substring(1); + if (!exclude.contains(x)) exclude.add(x); } else { while ((c = quer.indexOf('-')) >= 0) { s = quer.substring(0, c); l = s.length(); - if (l >= Condenser.wordminsize) {query.add(s);} - if (l > 0) {fullquery.add(s);} + if (l >= Condenser.wordminsize && !query.contains(s)) {query.add(s);} + if (l > 0 && !fullquery.contains(s)) {fullquery.add(s);} quer = quer.substring(c + 1); } l = quer.length(); - if (l >= Condenser.wordminsize) {query.add(quer);} - if (l > 0) {fullquery.add(quer);} + if (l >= Condenser.wordminsize && !query.contains(quer)) {query.add(quer);} + if (l > 0 && !fullquery.contains(quer)) {fullquery.add(quer);} } } } - return new TreeSet[]{query, exclude, fullquery}; + return new Collection[]{query, exclude, fullquery}; } public String queryString(final boolean encodeHTML) { @@ -438,7 +437,7 @@ public final class QueryParams { } } - public TreeSet[] queryWords() { + public Collection[] queryWords() { return cleanQuery(this.queryString); }