changed storage of search words: keep order

pull/1/head
Michael Peter Christen 13 years ago
parent ed39ef2890
commit e2f8f263e8

@ -9,7 +9,7 @@
// $LastChangedBy: orbiter $
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -24,9 +24,9 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.TreeSet;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.RequestHeader;
@ -43,7 +43,6 @@ import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
import net.yacy.search.query.QueryParams;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -52,18 +51,18 @@ public final class timeline {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
if ((post == null) || (env == null)) return prop;
final boolean authenticated = sb.adminAuthenticated(header) >= 2;
Segment segment = null;
if (post.containsKey("segment") && authenticated) {
segment = sb.indexSegments.segment(post.get("segment"));
} else {
segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
}
final String querystring = post.get("query", ""); // a string of word hashes that shall be searched and combined
final int count = Math.min((authenticated) ? 1000 : 10, post.getInt("maximumRecords", 1000)); // SRU syntax
final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE);
@ -75,22 +74,22 @@ public final class timeline {
language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent);
if (language == null) language = "en";
}
final TreeSet<String>[] query = QueryParams.cleanQuery(querystring); // converts also umlaute
final Collection<String>[] query = QueryParams.cleanQuery(querystring); // converts also umlaute
HandleSet q = Word.words2hashesHandles(query[0]);
// tell all threads to do nothing for a specific time
sb.intermissionAllThreads(3000);
// prepare search
final long timestamp = System.currentTimeMillis();
// prepare an abstract result
int indexabstractContainercount = 0;
int joincount = 0;
// retrieve index containers
//yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links");
// get the index container with the result vector
TermSearch<WordReference> search = null;
try {
@ -99,7 +98,7 @@ public final class timeline {
Log.logException(e);
}
ReferenceContainer<WordReference> index = search.joined();
Iterator<WordReference> i = index.entries();
WordReference entry;
int c = 0;
@ -117,14 +116,14 @@ public final class timeline {
c++;
}
prop.put("event", c);
// log
Network.log.logInfo("EXIT TIMELINE SEARCH: " +
QueryParams.anonymizedQueryHashes(q) + " - " + joincount + " links found, " +
prop.get("linkcount", "?") + " links selected, " +
indexabstractContainercount + " index abstracts, " +
(System.currentTimeMillis() - timestamp) + " milliseconds");
return prop;
}

@ -602,14 +602,14 @@ public class yacysearch {
(post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "");
// the query
final TreeSet<String>[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute
final Collection<String>[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute
final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? query.length - 1 : Integer.MAX_VALUE;
// filter out stopwords
final SortedSet<String> filtered = SetTools.joinConstructive(query[0], Switchboard.stopwords);
final SortedSet<String> filtered = SetTools.joinConstructiveByTest(query[0], Switchboard.stopwords);
if ( !filtered.isEmpty() ) {
SetTools.excludeDestructive(query[0], Switchboard.stopwords);
SetTools.excludeDestructiveByTestSmallInLarge(query[0], Switchboard.stopwords);
}
// if a minus-button was hit, remove a special reference first

@ -25,8 +25,8 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.net.MalformedURLException;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
@ -165,7 +165,7 @@ public class yacysearchitem {
prop.putHTML("content_publisher", result.publisher());
prop.putHTML("content_creator", result.creator());// author
prop.putHTML("content_subject", result.subject());
final Set<String>[] query = theQuery.queryWords();
final Collection<String>[] query = theQuery.queryWords();
final StringBuilder s = new StringBuilder(query[0].size() * 20);
for (final String t: query[0]) {
s.append('+').append(t);

@ -26,6 +26,7 @@
package net.yacy.kelondro.data.word;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
@ -118,11 +119,11 @@ public class Word {
private final static byte lowByte = Base64Order.alpha_enhanced[0];
private final static byte highByte = Base64Order.alpha_enhanced[Base64Order.alpha_enhanced.length - 1];
/**
 * Tests whether a hash carries the private marker.
 * A hash counts as private when each of its first five bytes equals {@code highByte}.
 *
 * @param hash the hash to inspect; positions 0..4 are read in ascending order
 * @return {@code true} if all five leading bytes equal {@code highByte}, {@code false} at the first mismatch
 */
public static boolean isPrivate(byte[] hash) {
    // stop at the first byte that differs from the marker byte
    for (int pos = 0; pos < 5; pos++) {
        if (hash[pos] != highByte) {
            return false;
        }
    }
    return true;
}
// create a word hash
public static final byte[] word2hash(final String word) {
final String wordlc = word.toLowerCase(Locale.ENGLISH);
@ -148,7 +149,7 @@ public class Word {
public final static byte PRIVATE_TYPE_COPY = 'C'; // used for a private local copy of the index
public final static byte PRIVATE_TYPE_PHONETIC = 'K'; // used for ColognePhonetics
public static final byte[] hash2private(final byte[] hash, byte privateType) {
byte[] p = new byte[commonHashLength];
p[0] = highByte; p[1] = highByte; p[2] = highByte; ; p[3] = highByte; ; p[4] = highByte; p[5] = privateType;
@ -156,7 +157,7 @@ public class Word {
return p;
}
public static final HandleSet words2hashesHandles(final Set<String> words) {
public static final HandleSet words2hashesHandles(final Collection<String> words) {
final HandleSet hashes = new HandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size());
for (final String word: words)
try {

@ -49,12 +49,12 @@ import net.yacy.kelondro.logging.Log;
public final class SetTools {
//public static Comparator fastStringComparator = fastStringComparator(true);
// ------------------------------------------------------------------------------------------------
// helper methods
public static int log2a(int x) {
// this computes 1 + log2
// it is the number of bits in x, not the logarithm by 2
@ -72,10 +72,10 @@ public final class SetTools {
// - join by pairwise enumeration
// - join by iterative tests (where we distinguish left-right and right-left tests)
public static <A, B> SortedMap<A, B> joinConstructive(final Collection<SortedMap<A, B>> maps, final boolean concatStrings) {
// this joins all TreeMap(s) contained in maps
// first order entities by their size
final SortedMap<Long, SortedMap<A, B>> orderMap = new TreeMap<Long, SortedMap<A, B>>();
SortedMap<A, B> singleMap;
@ -84,18 +84,18 @@ public final class SetTools {
while (i.hasNext()) {
// get next entity:
singleMap = i.next();
// check result
if ((singleMap == null) || (singleMap.isEmpty())) return new TreeMap<A, B>();
// store result in order of result size
orderMap.put(Long.valueOf(singleMap.size() * 1000 + count), singleMap);
count++;
}
// check if there is any result
if (orderMap.isEmpty()) return new TreeMap<A, B>();
// we now must pairwise build up a conjunction of these maps
Long k = orderMap.firstKey(); // the smallest, which means, the one with the least entries
SortedMap<A, B> mapA, mapB, joinResult = orderMap.remove(k);
@ -114,7 +114,7 @@ public final class SetTools {
if (joinResult.isEmpty()) return new TreeMap<A, B>();
return joinResult;
}
public static <A, B> SortedMap<A, B> joinConstructive(final SortedMap<A, B> map1, final SortedMap<A, B> map2, final boolean concatStrings) {
// comparators must be equal
if ((map1 == null) || (map2 == null)) return null;
@ -134,7 +134,7 @@ public final class SetTools {
}
return joinConstructiveByEnumeration(map1, map2, concatStrings);
}
@SuppressWarnings("unchecked")
private static <A, B> SortedMap<A, B> joinConstructiveByTest(final SortedMap<A, B> small, final SortedMap<A, B> large, final boolean concatStrings) {
final SortedMap<A, B> result = new TreeMap<A, B>(large.comparator());
@ -198,7 +198,7 @@ public final class SetTools {
}
return result;
}
// now the same for set-set
public static <A> SortedSet<A> joinConstructive(final SortedSet<A> set1, final SortedSet<A> set2) {
// comparators must be equal
@ -220,9 +220,9 @@ public final class SetTools {
return joinConstructiveByEnumeration(set1, set2);
}
private static <A> SortedSet<A> joinConstructiveByTest(final SortedSet<A> small, final SortedSet<A> large) {
public static <A> SortedSet<A> joinConstructiveByTest(final Collection<A> small, final SortedSet<A> large) {
final Iterator<A> mi = small.iterator();
final SortedSet<A> result = new TreeSet<A>(small.comparator());
final SortedSet<A> result = new TreeSet<A>(large.comparator());
A o;
while (mi.hasNext()) {
o = mi.next();
@ -256,7 +256,7 @@ public final class SetTools {
}
return result;
}
/**
* test if one set is totally included in another set
* @param <A>
@ -269,8 +269,8 @@ public final class SetTools {
if (!large.contains(o)) return false;
}
return true;
}
}
/**
* test if one set is totally included in another set
* @param small
@ -282,8 +282,8 @@ public final class SetTools {
if (!large.has(handle)) return false;
}
return true;
}
}
/**
* test if the intersection of two sets is not empty
* @param <A>
@ -379,7 +379,7 @@ public final class SetTools {
}
return false;
}
private static boolean anymatchByEnumeration(final HandleSet set1, final HandleSet set2) {
// implement pairwise enumeration
final Comparator<byte[]> comp = set1.comparator();
@ -402,7 +402,7 @@ public final class SetTools {
}
return false;
}
// ------------------------------------------------------------------------------------------------
// exclude
@ -416,7 +416,7 @@ public final class SetTools {
return excludeConstructiveByTestMapInSet(map, set);
// return excludeConstructiveByEnumeration(map, set);
}
private static <A, B> TreeMap<A, B> excludeConstructiveByTestMapInSet(final TreeMap<A, B> map, final Set<A> set) {
final TreeMap<A, B> result = new TreeMap<A, B>(map.comparator());
A o;
@ -427,7 +427,7 @@ public final class SetTools {
return result;
}
*/
public static <A, B> void excludeDestructive(final Map<A, B> map, final Set<A> set) {
// comparators must be equal
if (map == null) return;
@ -440,40 +440,40 @@ public final class SetTools {
else
excludeDestructiveByTestSetInMap(map, set);
}
/**
 * Removes from the map every entry whose key is contained in the set.
 * The keys of the map are enumerated and each one is tested against the set;
 * the map is modified in place, the set is only queried.
 *
 * @param map the map to shrink
 * @param set the keys that must not remain in the map
 */
private static <A, B> void excludeDestructiveByTestMapInSet(final Map<A, B> map, final Set<A> set) {
    for (final Iterator<A> keys = map.keySet().iterator(); keys.hasNext();) {
        final A key = keys.next();
        if (set.contains(key)) {
            // removal through the key-set iterator also removes the map entry
            keys.remove();
        }
    }
}
/**
 * Removes from the map every entry whose key is contained in the set.
 * The set is enumerated and each of its elements is removed from the map;
 * the map is modified in place, the set is only read.
 *
 * @param map the map to shrink
 * @param set the keys that must not remain in the map
 */
private static <A, B> void excludeDestructiveByTestSetInMap(final Map<A, B> map, final Set<A> set) {
    for (final A key : set) {
        map.remove(key);
    }
}
// and the same again with set-set
/**
 * Removes from {@code set1} all elements that are contained in {@code set2}.
 * {@code set1} is modified in place. Nothing happens if either argument is
 * {@code null} or empty. The smaller of the two sets is the one that gets enumerated.
 *
 * @param set1 the set to shrink
 * @param set2 the elements that must not remain in {@code set1}
 */
public static <A> void excludeDestructive(final Set<A> set1, final Set<A> set2) {
    if (set1 == null || set2 == null) {
        return;
    }
    // when both sets are sorted they must share the very same comparator instance
    assert !(set1 instanceof SortedSet<?>) || !(set2 instanceof SortedSet<?>) || ((SortedSet<A>) set1).comparator() == ((SortedSet<A>) set2).comparator();
    if (set1.isEmpty() || set2.isEmpty()) {
        return;
    }
    if (set1.size() >= set2.size()) {
        // set2 is not larger: enumerate set2 and remove each element from set1
        excludeDestructiveByTestLargeInSmall(set1, set2);
    } else {
        // set1 is smaller: enumerate set1 and test each element against set2
        excludeDestructiveByTestSmallInLarge(set1, set2);
    }
}
// Removes from 'small' every element that is also contained in 'large'.
// 'small' is enumerated and matches are deleted through Iterator.remove(), so
// 'small' is modified in place while 'large' is only queried with contains().
// NOTE(review): two signature lines follow; presumably these are the pre- and
// post-change versions of the same declaration as rendered by the diff view - confirm.
private static <A> void excludeDestructiveByTestSmallInLarge(final Set<A> small, final Set<A> large) {
public static <A> void excludeDestructiveByTestSmallInLarge(final Collection<A> small, final Set<A> large) {
final Iterator<A> mi = small.iterator();
// test each element of 'small' against 'large' and drop it on a hit
while (mi.hasNext()) if (large.contains(mi.next())) mi.remove();
}
// Removes from 'large' every element that the enumeration of 'small' yields.
// 'large' is modified in place through remove(); 'small' is only iterated.
// NOTE(review): two signature lines follow; presumably these are the pre- and
// post-change versions of the same declaration as rendered by the diff view - confirm.
private static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Set<A> small) {
public static <A> void excludeDestructiveByTestLargeInSmall(final Set<A> large, final Collection<A> small) {
final Iterator<A> si = small.iterator();
// remove() is a no-op for elements that 'large' does not contain
while (si.hasNext()) large.remove(si.next());
}
// ------------------------------------------------------------------------------------------------
public static SortedMap<String, String> loadMap(final String filename, final String sep) {
@ -488,13 +488,13 @@ public final class SetTools {
if ((line.length() > 0 && line.charAt(0) != '#') && ((pos = line.indexOf(sep)) > 0))
map.put(line.substring(0, pos).trim().toLowerCase(), line.substring(pos + sep.length()).trim());
}
} catch (final IOException e) {
} catch (final IOException e) {
} finally {
if (br != null) try { br.close(); } catch (final Exception e) {}
}
return map;
}
public static SortedMap<String, List<String>> loadMapMultiValsPerKey(final String filename, final String sep) {
final SortedMap<String, List<String>> map = new TreeMap<String, List<String>>();
BufferedReader br = null;
@ -511,17 +511,17 @@ public final class SetTools {
map.get(key).add(value);
}
}
} catch (final IOException e) {
} catch (final IOException e) {
} finally {
if (br != null) try { br.close(); } catch (final Exception e) {}
}
return map;
}
public static SortedSet<String> loadList(final File file, final Comparator<String> c) {
final SortedSet<String> list = new TreeSet<String>(c);
if (!(file.exists())) return list;
BufferedReader br = null;
try {
br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
@ -531,7 +531,7 @@ public final class SetTools {
if (line.length() > 0 && line.charAt(0) != '#') list.add(line.trim().toLowerCase());
}
br.close();
} catch (final IOException e) {
} catch (final IOException e) {
} finally {
if (br != null) try{br.close();}catch(final Exception e){}
}
@ -547,7 +547,7 @@ public final class SetTools {
}
return sb.toString();
}
public static String setToString(final Set<String> set, final char separator) {
final Iterator<String> i = set.iterator();
final StringBuilder sb = new StringBuilder(set.size() * 7);
@ -560,7 +560,7 @@ public final class SetTools {
// ------------------------------------------------------------------------------------------------
public static void main(final String[] args) {
final SortedMap<String, String> m = new TreeMap<String, String>();
final SortedMap<String, String> s = new TreeMap<String, String>();

@ -35,7 +35,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
@ -56,7 +55,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.Seed;
import net.yacy.search.index.Segment;
@ -162,7 +160,7 @@ public final class QueryParams {
}
} else {
this.queryString = queryString;
final TreeSet<String>[] cq = cleanQuery(queryString);
final Collection<String>[] cq = cleanQuery(queryString);
this.queryHashes = Word.words2hashesHandles(cq[0]);
this.excludeHashes = Word.words2hashesHandles(cq[1]);
this.fullqueryHashes = Word.words2hashesHandles(cq[2]);
@ -378,11 +376,11 @@ public final class QueryParams {
private static String seps = "'.,/&_"; static {seps += '"';}
@SuppressWarnings("unchecked")
public static TreeSet<String>[] cleanQuery(String querystring) {
public static Collection<String>[] cleanQuery(String querystring) {
// returns three sets: a query set, a exclude set and a full query set
final TreeSet<String> query = new TreeSet<String>(NaturalOrder.naturalComparator);
final TreeSet<String> exclude = new TreeSet<String>(NaturalOrder.naturalComparator);
final TreeSet<String> fullquery = new TreeSet<String>(NaturalOrder.naturalComparator);
final Collection<String> query = new ArrayList<String>();
final Collection<String> exclude = new ArrayList<String>();
final Collection<String> fullquery = new ArrayList<String>();
if ((querystring != null) && (!querystring.isEmpty())) {
@ -401,22 +399,23 @@ public final class QueryParams {
final String[] queries = querystring.split(" ");
for (String quer : queries) {
if (quer.startsWith("-")) {
exclude.add(quer.substring(1));
String x = quer.substring(1);
if (!exclude.contains(x)) exclude.add(x);
} else {
while ((c = quer.indexOf('-')) >= 0) {
s = quer.substring(0, c);
l = s.length();
if (l >= Condenser.wordminsize) {query.add(s);}
if (l > 0) {fullquery.add(s);}
if (l >= Condenser.wordminsize && !query.contains(s)) {query.add(s);}
if (l > 0 && !fullquery.contains(s)) {fullquery.add(s);}
quer = quer.substring(c + 1);
}
l = quer.length();
if (l >= Condenser.wordminsize) {query.add(quer);}
if (l > 0) {fullquery.add(quer);}
if (l >= Condenser.wordminsize && !query.contains(quer)) {query.add(quer);}
if (l > 0 && !fullquery.contains(quer)) {fullquery.add(quer);}
}
}
}
return new TreeSet[]{query, exclude, fullquery};
return new Collection[]{query, exclude, fullquery};
}
public String queryString(final boolean encodeHTML) {
@ -438,7 +437,7 @@ public final class QueryParams {
}
}
// Returns the word collections of this query by passing the stored query string
// to cleanQuery(); the parsing is repeated on every call.
// NOTE(review): two signature lines follow; presumably these are the pre- and
// post-change versions of the same declaration as rendered by the diff view - confirm.
public TreeSet<String>[] queryWords() {
public Collection<String>[] queryWords() {
return cleanQuery(this.queryString);
}

Loading…
Cancel
Save