From 604c37927f1c9ea0e48cf0f5e188e04a0d410c25 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 8 Sep 2009 13:48:17 +0000 Subject: [PATCH] used comparator for did-you-mean that uses index sizes for comparisment, but: - limit comparisment to only the first 10 elements that had been sorted before without IO - added a size cache to index computation because the size is computed at least twice in set comparator git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6306 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacysearch.java | 2 +- source/de/anomic/data/DidYouMean.java | 70 +++++++++++++------ source/de/anomic/data/OpenGeoDB.java | 2 +- source/de/anomic/kelondro/text/IndexCell.java | 60 ++++++++++++---- 4 files changed, 97 insertions(+), 37 deletions(-) diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 24d2deeb9..b31b895da 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -488,7 +488,7 @@ public class yacysearch { prop.put("meanCount", meanMax); if (meanMax > 0) { DidYouMean didYouMean = new DidYouMean(sb.indexSegment.termIndex()); - Iterator meanIt = didYouMean.getSuggestion(querystring).iterator(); + Iterator meanIt = didYouMean.getSuggestions(querystring, 300, 10).iterator(); int meanCount = 0; String suggestion; while(meanCount index; protected String word; @@ -46,11 +47,8 @@ public class DidYouMean { * @param index a termIndex - most likely retrieved from a switchboard object. * @param sort true/false - sorts the resulting TreeSet by index.count(); Warning: this causes heavy i/o. */ - public DidYouMean(final IndexCell index, boolean sort) { - if(sort) - this.resultSet = Collections.synchronizedSortedSet(new TreeSet(new indexSizeComparator())); - else - this.resultSet = Collections.synchronizedSortedSet(new TreeSet(wlComp)); + public DidYouMean(final IndexCell index) { + this.resultSet = Collections.synchronizedSortedSet(new TreeSet(wlComp)); this.word = ""; this.wordLen = 0; this.index = index; @@ -60,20 +58,50 @@ public class DidYouMean { } /** - * @param index a termIndex - most likely retrieved from a switchboard object. + * get a single suggestion + * @param word + * @param timeout + * @return */ - public DidYouMean(final IndexCell index) { - this(index, false); + public String getSuggestion(final String word, long timeout) { + Set s = getSuggestions(word, timeout); + if (s == null || s.size() == 0) return null; + return s.iterator().next(); } /** - * This method triggers the producer and consumer threads of DidYouMean. - *

Note: the default timeout is 500ms - * @param word a String with a single word - * @return a Set<String> with word variations contained in index. + * get a single suggestion with additional sort + * @param word + * @param timeout + * @return + */ + public String getSuggestion(final String word, long timeout, int preSortSelection) { + Set s = getSuggestions(word, timeout, preSortSelection); + if (s == null || s.size() == 0) return null; + return s.iterator().next(); + } + + /** + * get suggestions for a given word. The result is first ordered using a term size ordering, + * and a subset of the result is sorted again with a IO-intensive order based on the index size + * @param word + * @param timeout + * @param preSortSelection the number of words that participate in the IO-intensive sort + * @return */ - public Set getSuggestion(final String word) { - return getSuggestion(word, 500); + public Set getSuggestions(final String word, long timeout, int preSortSelection) { + long startTime = System.currentTimeMillis(); + Set preSorted = getSuggestions(word, timeout); + long timelimit = 2 * System.currentTimeMillis() - startTime + timeout; + if (System.currentTimeMillis() > timelimit) return preSorted; + Set countSorted = Collections.synchronizedSortedSet(new TreeSet(new indexSizeComparator())); + for (String s: preSorted) { + if (System.currentTimeMillis() > timelimit) break; + if (preSortSelection <= 0) break; + countSorted.add(s); + preSortSelection--; + } + return countSorted; } /** @@ -82,7 +110,7 @@ public class DidYouMean { * @param timeout execution time in ms. * @return a Set<String> with word variations contained in term index. */ - public Set getSuggestion(final String word, long timeout) { + public Set getSuggestions(final String word, long timeout) { long startTime = System.currentTimeMillis(); this.timeLimit = startTime + timeout; this.word = word.toLowerCase(); @@ -251,7 +279,7 @@ public class DidYouMean { public int compare(final String o1, final String o2) { final int i1 = index.count(Word.word2hash(o1)); final int i2 = index.count(Word.word2hash(o2)); - if (i1 == i2) return o1.compareTo(o2); + if (i1 == i2) return wlComp.compare(o1, o2); return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result } } diff --git a/source/de/anomic/data/OpenGeoDB.java b/source/de/anomic/data/OpenGeoDB.java index 92d764a46..9f62c3de4 100644 --- a/source/de/anomic/data/OpenGeoDB.java +++ b/source/de/anomic/data/OpenGeoDB.java @@ -191,7 +191,7 @@ public class OpenGeoDB { /** * read the dictionary and construct a set of recommendations to a given string * @param s input value that is used to match recommendations - * @return a set that contains all words that start or end with the input value + * @return a set that contains all words that start with the input value */ public Set recommend(String s) { Set a = new HashSet(); diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index cc1c90165..04f2924cc 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -32,10 +32,12 @@ import java.util.HashSet; import java.util.Set; import de.anomic.kelondro.index.Row; +import de.anomic.kelondro.index.SimpleARC; import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.MergeIterator; import de.anomic.kelondro.order.Order; +import de.anomic.kelondro.util.ByteArray; import de.anomic.kelondro.util.MemoryControl; import de.anomic.server.serverProfiling; @@ -62,7 +64,7 @@ public final class IndexCell extends AbstractBu private long lastCleanup; private final long targetFileSize, maxFileSize; private final int writeBufferSize; - + private final SimpleARC countCache; public IndexCell( final File cellPath, @@ -86,6 +88,7 @@ public final class IndexCell extends AbstractBu this.targetFileSize = targetFileSize; this.maxFileSize = maxFileSize; this.writeBufferSize = writeBufferSize; + this.countCache = new SimpleARC(1000); //cleanCache(); } @@ -123,20 +126,41 @@ public final class IndexCell extends AbstractBu return this.array.has(termHash); } + /** + * count number of references for a given term + * this method may cause strong IO load if called too frequently, because it is + * necessary to read the corresponding reference containers from the files and + * count the resulting index object. + * To reduce the load for processes that frequently need access to the same + * term objects, a ARC cache is here to reduce IO load. + */ public int count(byte[] termHash) { - ReferenceContainer c0 = this.ram.get(termHash, null); - ReferenceContainer c1; - try { - c1 = this.array.get(termHash); - } catch (IOException e) { - c1 = null; - } - if (c1 == null) { - if (c0 == null) return 0; - return c0.size(); + + // check if value is in cache + ByteArray ba = new ByteArray(termHash); + Integer countCache = this.countCache.get(ba); + int countFile; + if (countCache == null) { + // read fresh values from file + ReferenceContainer c1; + try { + c1 = this.array.get(termHash); + } catch (IOException e) { + c1 = null; + } + countFile = (c1 == null) ? 0 : c1.size(); + + // store to cache + this.countCache.put(ba, countFile); + } else { + // value was in ram + countFile = countCache.intValue(); } - if (c0 == null) return c1.size(); - return c1.size() + c0.size(); + + // count from container in ram + ReferenceContainer countRam = this.ram.get(termHash, null); + + return (countRam == null) ? countFile : countFile + countRam.size(); } /** @@ -161,7 +185,10 @@ public final class IndexCell extends AbstractBu */ public ReferenceContainer delete(byte[] termHash) throws IOException { ReferenceContainer c1 = this.array.get(termHash); - if (c1 != null) this.array.delete(termHash); + if (c1 != null) { + this.array.delete(termHash); + this.countCache.remove(new ByteArray(termHash)); + } ReferenceContainer c0 = this.ram.delete(termHash); cleanCache(); if (c1 == null) return c0; @@ -179,12 +206,14 @@ public final class IndexCell extends AbstractBu public int remove(byte[] termHash, Set urlHashes) throws IOException { int removed = this.ram.remove(termHash, urlHashes); int reduced = this.array.replace(termHash, new RemoveRewriter(urlHashes)); + this.countCache.remove(new ByteArray(termHash)); return removed + (reduced / this.array.rowdef().objectsize); } public boolean remove(byte[] termHash, String urlHash) throws IOException { boolean removed = this.ram.remove(termHash, urlHash); int reduced = this.array.replace(termHash, new RemoveRewriter(urlHash)); + this.countCache.remove(new ByteArray(termHash)); return removed || (reduced > 0); } @@ -245,6 +274,7 @@ public final class IndexCell extends AbstractBu public synchronized void clear() throws IOException { this.ram.clear(); this.array.clear(); + this.countCache.clear(); } /** @@ -257,6 +287,7 @@ public final class IndexCell extends AbstractBu // close all this.ram.close(); this.array.close(); + this.countCache.clear(); } public int size() { @@ -292,6 +323,7 @@ public final class IndexCell extends AbstractBu */ private void cleanCache() { + this.countCache.clear(); // dump the cache if necessary if (this.ram.size() >= this.maxRamEntries || (this.ram.size() > 3000 && !MemoryControl.request(80L * 1024L * 1024L, false))) synchronized (this) {