use a comparator for did-you-mean that compares suggestions by their index sizes, but:

- limit the comparison to only the first 10 elements, which have already been sorted without IO
- added a size cache to the index count computation because the size is computed at least twice in the set comparator


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6306 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent a58d9cae7d
commit 604c37927f

@ -488,7 +488,7 @@ public class yacysearch {
prop.put("meanCount", meanMax);
if (meanMax > 0) {
DidYouMean didYouMean = new DidYouMean(sb.indexSegment.termIndex());
Iterator<String> meanIt = didYouMean.getSuggestion(querystring).iterator();
Iterator<String> meanIt = didYouMean.getSuggestions(querystring, 300, 10).iterator();
int meanCount = 0;
String suggestion;
while(meanCount<meanMax && meanIt.hasNext()) {

@ -27,11 +27,12 @@ import de.anomic.yacy.logging.Log;
*/
public class DidYouMean {
protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};
private static final String poisonString = "\n";
public static final int availableCPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator wlComp = new wordLengthComparator();
protected static final char[] alphabet = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};
private static final String poisonString = "\n";
public static final int availableCPU = Runtime.getRuntime().availableProcessors();
protected static final wordLengthComparator wlComp = new wordLengthComparator();
protected final IndexCell<WordReference> index;
protected String word;
@ -46,11 +47,8 @@ public class DidYouMean {
* @param index a termIndex - most likely retrieved from a switchboard object.
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/
public DidYouMean(final IndexCell<WordReference> index, boolean sort) {
if(sort)
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
else
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(wlComp));
public DidYouMean(final IndexCell<WordReference> index) {
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(wlComp));
this.word = "";
this.wordLen = 0;
this.index = index;
@ -60,20 +58,50 @@ public class DidYouMean {
}
/**
* @param index a termIndex - most likely retrieved from a switchboard object.
* get a single suggestion
* @param word
* @param timeout
* @return
*/
public DidYouMean(final IndexCell<WordReference> index) {
this(index, false);
public String getSuggestion(final String word, long timeout) {
    // Collect all suggestions through the two-argument getSuggestions and keep only the first one.
    final Set<String> candidates = getSuggestions(word, timeout);
    final boolean nothingFound = (candidates == null) || candidates.isEmpty();
    // null signals "no suggestion"; otherwise the first element in the set's iteration order wins
    return nothingFound ? null : candidates.iterator().next();
}
/**
* This method triggers the producer and consumer threads of DidYouMean.
* <p/><b>Note:</b> the default timeout is 500ms
* @param word a String with a single word
* @return a Set&lt;String&gt; with word variations contained in index.
* get a single suggestion with additional sort
* @param word
* @param timeout
* @return
*/
public String getSuggestion(final String word, long timeout, int preSortSelection) {
    // Delegate to the three-argument getSuggestions, which applies the additional sort.
    final Set<String> ranked = getSuggestions(word, timeout, preSortSelection);
    if (ranked == null) {
        return null;
    }
    if (ranked.isEmpty()) {
        // no candidate available
        return null;
    }
    // the best candidate is the first element in the set's iteration order
    return ranked.iterator().next();
}
/**
* get suggestions for a given word. The result is first ordered using a term size ordering,
* and a subset of the result is sorted again with an IO-intensive order based on the index size
* @param word
* @param timeout
* @param preSortSelection the number of words that participate in the IO-intensive sort
* @return
*/
public Set<String> getSuggestion(final String word) {
return getSuggestion(word, 500);
public Set<String> getSuggestions(final String word, long timeout, int preSortSelection) {
    final long begin = System.currentTimeMillis();

    // phase 1: collect the candidates in the order produced by getSuggestions(word, timeout)
    final Set<String> lengthOrdered = getSuggestions(word, timeout);

    // deadline for phase 2: equals now + (time already spent in phase 1) + timeout
    final long deadline = 2 * System.currentTimeMillis() - begin + timeout;
    if (System.currentTimeMillis() > deadline) {
        return lengthOrdered;
    }

    // phase 2: re-sort at most preSortSelection leading candidates with the index-size comparator;
    // every add() invokes that comparator
    final Set<String> sizeOrdered = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
    int remaining = preSortSelection;
    for (final String candidate : lengthOrdered) {
        // stop when the deadline has passed or the selection quota is used up
        if (System.currentTimeMillis() > deadline || remaining <= 0) {
            break;
        }
        sizeOrdered.add(candidate);
        remaining--;
    }
    return sizeOrdered;
}
/**
@ -82,7 +110,7 @@ public class DidYouMean {
* @param timeout execution time in ms.
* @return a Set&lt;String&gt; with word variations contained in term index.
*/
public Set<String> getSuggestion(final String word, long timeout) {
public Set<String> getSuggestions(final String word, long timeout) {
long startTime = System.currentTimeMillis();
this.timeLimit = startTime + timeout;
this.word = word.toLowerCase();
@ -251,7 +279,7 @@ public class DidYouMean {
public int compare(final String o1, final String o2) {
final int i1 = index.count(Word.word2hash(o1));
final int i2 = index.count(Word.word2hash(o2));
if (i1 == i2) return o1.compareTo(o2);
if (i1 == i2) return wlComp.compare(o1, o2);
return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
}
}

@ -191,7 +191,7 @@ public class OpenGeoDB {
/**
* read the dictionary and construct a set of recommendations to a given string
* @param s input value that is used to match recommendations
* @return a set that contains all words that start or end with the input value
* @return a set that contains all words that start with the input value
*/
public Set<String> recommend(String s) {
Set<String> a = new HashSet<String>();

@ -32,10 +32,12 @@ import java.util.HashSet;
import java.util.Set;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.SimpleARC;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.util.ByteArray;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.server.serverProfiling;
@ -62,7 +64,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
private long lastCleanup;
private final long targetFileSize, maxFileSize;
private final int writeBufferSize;
private final SimpleARC<ByteArray, Integer> countCache;
public IndexCell(
final File cellPath,
@ -86,6 +88,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
this.targetFileSize = targetFileSize;
this.maxFileSize = maxFileSize;
this.writeBufferSize = writeBufferSize;
this.countCache = new SimpleARC<ByteArray, Integer>(1000);
//cleanCache();
}
@ -123,20 +126,41 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
return this.array.has(termHash);
}
/**
* count number of references for a given term
* this method may cause strong IO load if called too frequently, because it is
* necessary to read the corresponding reference containers from the files and
* count the resulting index object.
* To reduce the load for processes that frequently need access to the same
* term objects, an ARC cache is used here to reduce IO load.
*/
public int count(byte[] termHash) {
ReferenceContainer<ReferenceType> c0 = this.ram.get(termHash, null);
ReferenceContainer<ReferenceType> c1;
try {
c1 = this.array.get(termHash);
} catch (IOException e) {
c1 = null;
}
if (c1 == null) {
if (c0 == null) return 0;
return c0.size();
// check if value is in cache
ByteArray ba = new ByteArray(termHash);
Integer countCache = this.countCache.get(ba);
int countFile;
if (countCache == null) {
// read fresh values from file
ReferenceContainer<ReferenceType> c1;
try {
c1 = this.array.get(termHash);
} catch (IOException e) {
c1 = null;
}
countFile = (c1 == null) ? 0 : c1.size();
// store to cache
this.countCache.put(ba, countFile);
} else {
// value was in ram
countFile = countCache.intValue();
}
if (c0 == null) return c1.size();
return c1.size() + c0.size();
// count from container in ram
ReferenceContainer<ReferenceType> countRam = this.ram.get(termHash, null);
return (countRam == null) ? countFile : countFile + countRam.size();
}
/**
@ -161,7 +185,10 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
*/
public ReferenceContainer<ReferenceType> delete(byte[] termHash) throws IOException {
ReferenceContainer<ReferenceType> c1 = this.array.get(termHash);
if (c1 != null) this.array.delete(termHash);
if (c1 != null) {
this.array.delete(termHash);
this.countCache.remove(new ByteArray(termHash));
}
ReferenceContainer<ReferenceType> c0 = this.ram.delete(termHash);
cleanCache();
if (c1 == null) return c0;
@ -179,12 +206,14 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
public int remove(byte[] termHash, Set<String> urlHashes) throws IOException {
    // remove the given url references from the RAM part first
    final int ramRemoved = this.ram.remove(termHash, urlHashes);
    // then rewrite the file-backed container without those references
    final RemoveRewriter<ReferenceType> rewriter = new RemoveRewriter<ReferenceType>(urlHashes);
    final int fileReduced = this.array.replace(termHash, rewriter);
    // the cached count for this term may now be stale, so drop it
    this.countCache.remove(new ByteArray(termHash));
    // NOTE(review): fileReduced is presumably a size delta that is converted into
    // an entry count by dividing by the row's objectsize - confirm against replace()
    final int fileRemoved = fileReduced / this.array.rowdef().objectsize;
    return ramRemoved + fileRemoved;
}
public boolean remove(byte[] termHash, String urlHash) throws IOException {
    // try the RAM part first, then rewrite the file-backed container without the reference
    final boolean ramHit = this.ram.remove(termHash, urlHash);
    final RemoveRewriter<ReferenceType> rewriter = new RemoveRewriter<ReferenceType>(urlHash);
    final int fileReduced = this.array.replace(termHash, rewriter);
    // the cached count for this term may now be stale, so drop it
    this.countCache.remove(new ByteArray(termHash));
    // report success if either part was changed
    if (ramHit) {
        return true;
    }
    return fileReduced > 0;
}
@ -245,6 +274,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
/**
 * Clears this index cell: the RAM part, the file-backed array and the count cache.
 * The count cache is cleared last so that no cached count survives once the
 * underlying containers have been cleared.
 * @throws IOException declared for the underlying clear operations
 */
public synchronized void clear() throws IOException {
this.ram.clear();
this.array.clear();
// cached counts refer to the containers cleared above and are therefore dropped as well
this.countCache.clear();
}
/**
@ -257,6 +287,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
// close all
this.ram.close();
this.array.close();
this.countCache.clear();
}
public int size() {
@ -292,6 +323,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
*/
private void cleanCache() {
this.countCache.clear();
// dump the cache if necessary
if (this.ram.size() >= this.maxRamEntries || (this.ram.size() > 3000 && !MemoryControl.request(80L * 1024L * 1024L, false))) synchronized (this) {

Loading…
Cancel
Save