used a comparator for did-you-mean that uses index sizes for comparison, but:

- limit the comparison to only the first 10 elements, which have been sorted beforehand without IO
- added a size cache to the index count computation because the size is computed at least twice in the set comparator


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6306 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent a58d9cae7d
commit 604c37927f

@ -488,7 +488,7 @@ public class yacysearch {
prop.put("meanCount", meanMax); prop.put("meanCount", meanMax);
if (meanMax > 0) { if (meanMax > 0) {
DidYouMean didYouMean = new DidYouMean(sb.indexSegment.termIndex()); DidYouMean didYouMean = new DidYouMean(sb.indexSegment.termIndex());
Iterator<String> meanIt = didYouMean.getSuggestion(querystring).iterator(); Iterator<String> meanIt = didYouMean.getSuggestions(querystring, 300, 10).iterator();
int meanCount = 0; int meanCount = 0;
String suggestion; String suggestion;
while(meanCount<meanMax && meanIt.hasNext()) { while(meanCount<meanMax && meanIt.hasNext()) {

@ -27,11 +27,12 @@ import de.anomic.yacy.logging.Log;
*/ */
public class DidYouMean { public class DidYouMean {
protected static final char[] alphabet = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p', protected static final char[] alphabet = {
'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p',
'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'}; 'q','r','s','t','u','v','w','x','y','z','\u00e4','\u00f6','\u00fc','\u00df'};
private static final String poisonString = "\n"; private static final String poisonString = "\n";
public static final int availableCPU = Runtime.getRuntime().availableProcessors(); public static final int availableCPU = Runtime.getRuntime().availableProcessors();
private static final wordLengthComparator wlComp = new wordLengthComparator(); protected static final wordLengthComparator wlComp = new wordLengthComparator();
protected final IndexCell<WordReference> index; protected final IndexCell<WordReference> index;
protected String word; protected String word;
@ -46,10 +47,7 @@ public class DidYouMean {
* @param index a termIndex - most likely retrieved from a switchboard object. * @param index a termIndex - most likely retrieved from a switchboard object.
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o. * @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/ */
public DidYouMean(final IndexCell<WordReference> index, boolean sort) { public DidYouMean(final IndexCell<WordReference> index) {
if(sort)
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
else
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(wlComp)); this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(wlComp));
this.word = ""; this.word = "";
this.wordLen = 0; this.wordLen = 0;
@ -60,20 +58,50 @@ public class DidYouMean {
} }
/** /**
* @param index a termIndex - most likely retrieved from a switchboard object. * get a single suggestion
* @param word
* @param timeout
* @return
*/ */
public DidYouMean(final IndexCell<WordReference> index) { public String getSuggestion(final String word, long timeout) {
this(index, false); Set<String> s = getSuggestions(word, timeout);
if (s == null || s.size() == 0) return null;
return s.iterator().next();
} }
/** /**
* This method triggers the producer and consumer threads of DidYouMean. * get a single suggestion with additional sort
* <p/><b>Note:</b> the default timeout is 500ms * @param word
* @param word a String with a single word * @param timeout
* @return a Set&lt;String&gt; with word variations contained in index. * @return
*/ */
public Set<String> getSuggestion(final String word) { public String getSuggestion(final String word, long timeout, int preSortSelection) {
return getSuggestion(word, 500); Set<String> s = getSuggestions(word, timeout, preSortSelection);
if (s == null || s.size() == 0) return null;
return s.iterator().next();
}
/**
 * Get suggestions for a given word in two stages: the candidates are first collected by
 * {@link #getSuggestions(String, long)}, then up to {@code preSortSelection} of them, taken
 * in the iteration order of that first result, are inserted into a new sorted set that is
 * ordered by {@code indexSizeComparator}. That comparator calls {@code index.count(...)}
 * for both words on every comparison, which is the IO-intensive part.
 * <p>
 * Candidates beyond the first {@code preSortSelection} are not part of the re-sorted result.
 *
 * @param word the word to find suggestions for
 * @param timeout time budget in milliseconds that is passed to the first stage and is also
 *        used to compute the deadline for the second stage
 * @param preSortSelection the number of words that participate in the IO-intensive sort
 * @return the index-size-ordered subset, or the unmodified first-stage result if the
 *         deadline has already passed before the second stage starts
 */
public Set<String> getSuggestions(final String word, long timeout, int preSortSelection) {
long startTime = System.currentTimeMillis();
// first stage: collect the candidates without the count-based ordering
Set<String> preSorted = getSuggestions(word, timeout);
// deadline for the second stage: now + (time the first stage took) + timeout
// NOTE(review): for a non-negative timeout this deadline is never earlier than "now",
// so the check on the next line can hardly trigger - confirm the intended time budget.
long timelimit = 2 * System.currentTimeMillis() - startTime + timeout;
if (System.currentTimeMillis() > timelimit) return preSorted;
Set<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
for (String s: preSorted) {
// stop when the deadline is exceeded or the selection quota is used up
if (System.currentTimeMillis() > timelimit) break;
if (preSortSelection <= 0) break;
// every insertion triggers comparator calls, i.e. index.count() lookups
countSorted.add(s);
preSortSelection--;
}
return countSorted;
} }
/** /**
@ -82,7 +110,7 @@ public class DidYouMean {
* @param timeout execution time in ms. * @param timeout execution time in ms.
* @return a Set&lt;String&gt; with word variations contained in term index. * @return a Set&lt;String&gt; with word variations contained in term index.
*/ */
public Set<String> getSuggestion(final String word, long timeout) { public Set<String> getSuggestions(final String word, long timeout) {
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
this.timeLimit = startTime + timeout; this.timeLimit = startTime + timeout;
this.word = word.toLowerCase(); this.word = word.toLowerCase();
@ -251,7 +279,7 @@ public class DidYouMean {
public int compare(final String o1, final String o2) { public int compare(final String o1, final String o2) {
final int i1 = index.count(Word.word2hash(o1)); final int i1 = index.count(Word.word2hash(o1));
final int i2 = index.count(Word.word2hash(o2)); final int i2 = index.count(Word.word2hash(o2));
if (i1 == i2) return o1.compareTo(o2); if (i1 == i2) return wlComp.compare(o1, o2);
return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
} }
} }

@ -191,7 +191,7 @@ public class OpenGeoDB {
/** /**
* read the dictionary and construct a set of recommendations to a given string * read the dictionary and construct a set of recommendations to a given string
* @param s input value that is used to match recommendations * @param s input value that is used to match recommendations
* @return a set that contains all words that start or end with the input value * @return a set that contains all words that start with the input value
*/ */
public Set<String> recommend(String s) { public Set<String> recommend(String s) {
Set<String> a = new HashSet<String>(); Set<String> a = new HashSet<String>();

@ -32,10 +32,12 @@ import java.util.HashSet;
import java.util.Set; import java.util.Set;
import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.SimpleARC;
import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.MergeIterator; import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order; import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.util.ByteArray;
import de.anomic.kelondro.util.MemoryControl; import de.anomic.kelondro.util.MemoryControl;
import de.anomic.server.serverProfiling; import de.anomic.server.serverProfiling;
@ -62,7 +64,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
private long lastCleanup; private long lastCleanup;
private final long targetFileSize, maxFileSize; private final long targetFileSize, maxFileSize;
private final int writeBufferSize; private final int writeBufferSize;
private final SimpleARC<ByteArray, Integer> countCache;
public IndexCell( public IndexCell(
final File cellPath, final File cellPath,
@ -86,6 +88,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
this.targetFileSize = targetFileSize; this.targetFileSize = targetFileSize;
this.maxFileSize = maxFileSize; this.maxFileSize = maxFileSize;
this.writeBufferSize = writeBufferSize; this.writeBufferSize = writeBufferSize;
this.countCache = new SimpleARC<ByteArray, Integer>(1000);
//cleanCache(); //cleanCache();
} }
@ -123,20 +126,41 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
return this.array.has(termHash); return this.array.has(termHash);
} }
/**
* count number of references for a given term
* this method may cause strong IO load if called too frequently, because it is
* necessary to read the corresponding reference containers from the files and
* count the resulting index object.
* To reduce the load for processes that frequently need access to the same
* term objects, an ARC cache keeps the counts that were read from the files.
*/
public int count(byte[] termHash) { public int count(byte[] termHash) {
ReferenceContainer<ReferenceType> c0 = this.ram.get(termHash, null);
// check if value is in cache
ByteArray ba = new ByteArray(termHash);
Integer countCache = this.countCache.get(ba);
int countFile;
if (countCache == null) {
// read fresh values from file
ReferenceContainer<ReferenceType> c1; ReferenceContainer<ReferenceType> c1;
try { try {
c1 = this.array.get(termHash); c1 = this.array.get(termHash);
} catch (IOException e) { } catch (IOException e) {
c1 = null; c1 = null;
} }
if (c1 == null) { countFile = (c1 == null) ? 0 : c1.size();
if (c0 == null) return 0;
return c0.size(); // store to cache
this.countCache.put(ba, countFile);
} else {
// value was in cache
countFile = countCache.intValue();
} }
if (c0 == null) return c1.size();
return c1.size() + c0.size(); // count from container in ram
ReferenceContainer<ReferenceType> countRam = this.ram.get(termHash, null);
return (countRam == null) ? countFile : countFile + countRam.size();
} }
/** /**
@ -161,7 +185,10 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
*/ */
public ReferenceContainer<ReferenceType> delete(byte[] termHash) throws IOException { public ReferenceContainer<ReferenceType> delete(byte[] termHash) throws IOException {
ReferenceContainer<ReferenceType> c1 = this.array.get(termHash); ReferenceContainer<ReferenceType> c1 = this.array.get(termHash);
if (c1 != null) this.array.delete(termHash); if (c1 != null) {
this.array.delete(termHash);
this.countCache.remove(new ByteArray(termHash));
}
ReferenceContainer<ReferenceType> c0 = this.ram.delete(termHash); ReferenceContainer<ReferenceType> c0 = this.ram.delete(termHash);
cleanCache(); cleanCache();
if (c1 == null) return c0; if (c1 == null) return c0;
@ -179,12 +206,14 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
public int remove(byte[] termHash, Set<String> urlHashes) throws IOException { public int remove(byte[] termHash, Set<String> urlHashes) throws IOException {
int removed = this.ram.remove(termHash, urlHashes); int removed = this.ram.remove(termHash, urlHashes);
int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashes)); int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHashes));
this.countCache.remove(new ByteArray(termHash));
return removed + (reduced / this.array.rowdef().objectsize); return removed + (reduced / this.array.rowdef().objectsize);
} }
public boolean remove(byte[] termHash, String urlHash) throws IOException { public boolean remove(byte[] termHash, String urlHash) throws IOException {
boolean removed = this.ram.remove(termHash, urlHash); boolean removed = this.ram.remove(termHash, urlHash);
int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHash)); int reduced = this.array.replace(termHash, new RemoveRewriter<ReferenceType>(urlHash));
this.countCache.remove(new ByteArray(termHash));
return removed || (reduced > 0); return removed || (reduced > 0);
} }
@ -245,6 +274,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
public synchronized void clear() throws IOException { public synchronized void clear() throws IOException {
this.ram.clear(); this.ram.clear();
this.array.clear(); this.array.clear();
this.countCache.clear();
} }
/** /**
@ -257,6 +287,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
// close all // close all
this.ram.close(); this.ram.close();
this.array.close(); this.array.close();
this.countCache.clear();
} }
public int size() { public int size() {
@ -292,6 +323,7 @@ public final class IndexCell<ReferenceType extends Reference> extends AbstractBu
*/ */
private void cleanCache() { private void cleanCache() {
this.countCache.clear();
// dump the cache if necessary // dump the cache if necessary
if (this.ram.size() >= this.maxRamEntries || (this.ram.size() > 3000 && !MemoryControl.request(80L * 1024L * 1024L, false))) synchronized (this) { if (this.ram.size() >= this.maxRamEntries || (this.ram.size() > 3000 && !MemoryControl.request(80L * 1024L * 1024L, false))) synchronized (this) {

Loading…
Cancel
Save