Added a word counter statistic to the Condenser; the did-you-mean feature uses it to calculate the best matches for given search words.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7258 6c8d7289-2bf4-0310-a012-ef5d649a1542
15 years ago · 58e74282af
parent 2a0eb09e08
commit 58e74282af
14 changed files with 118 additions and 63 deletions
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@ -47,6 +47,7 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;

 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
+import de.anomic.data.LibraryProvider;
 import de.anomic.http.client.Cache;
 import de.anomic.search.Segment;
 import de.anomic.search.Segments;
@ -277,9 +278,9 @@ public class ViewFile {
                    // Search word highlighting
                    for (StringBuilder s: sentences) {
                        sentence = s.toString();
-                        Enumeration<StringBuilder> tokens = Condenser.wordTokenizer(sentence, "UTF-8");
+                        Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
                        while (tokens.hasMoreElements()) {
-                            token = tokens.nextElement().toString();
+                            token = tokens.nextElement();
                            if (token.length() > 0) {
                                prop.put("viewMode_words_" + i + "_nr", i + 1);
                                prop.put("viewMode_words_" + i + "_word", token);
--- a/source/de/anomic/data/DidYouMean.java
+++ b/source/de/anomic/data/DidYouMean.java
@ -63,6 +63,7 @@ public class DidYouMean {
    private long timeLimit;
    private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
    private final SortedSet<String> resultSet;
+    private final indexSizeComparator INDEX_SIZE_COMPARATOR;
    
 	
    /**
@ -70,13 +71,14 @@ public class DidYouMean {
     * @param sort true/false -  sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
     */
    public DidYouMean(final IndexCell<WordReference> index, String word0) {
-        this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(WORD_LENGTH_COMPARATOR));
+        this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
        this.word = word0.toLowerCase();
        this.wordLen = word.length();
        this.index = index;
        this.guessGen = new LinkedBlockingQueue<String>();
        this.guessLib = new LinkedBlockingQueue<String>();
        this.createGen = true;
+        this.INDEX_SIZE_COMPARATOR = new indexSizeComparator();
        
        // identify language
        if (this.word.length() == 0) {
@ -134,7 +136,7 @@ public class DidYouMean {
            if (scored.size() >= 2 * preSortSelection) break;
            scored.inc(s, index.count(Word.word2hash(s)));
        }
-        SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
+        SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
        int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
        while (scored.size() > 0 && countSorted.size() < preSortSelection) {
            String s = scored.getMaxKey();
@ -351,9 +353,9 @@ public class DidYouMean {
                } catch (InterruptedException e) {}
            }
 	}
-	
+    
    /**
-     * indexSizeComparator is used by DidYouMean to order terms by index.count()<p/>
+     * indexSizeComparator is used by DidYouMean to order terms by index.count()
     * <b>Warning:</b> this causes heavy i/o
     */
    private class indexSizeComparator implements Comparator<String> {
@ -363,11 +365,11 @@ public class DidYouMean {
            final int i2 = index.count(Word.word2hash(o2));
            if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2);
            return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
-    	}    	
+        }       
    }
    
    /**
-     * wordLengthComparator is used by DidYouMean to order terms by the term length<p/>
+     * wordLengthComparator is used by DidYouMean to order terms by the term length
     * This is the default order if the indexSizeComparator is not used
     */
    private static class wordLengthComparator implements Comparator<String> {
@ -376,11 +378,30 @@ public class DidYouMean {
            final int i1 = o1.length();
            final int i2 = o2.length();
            if (i1 == i2) return o1.compareTo(o2);
-            return (i1 > i2) ? 1 : -1; // '>' is correct, because the shortest word shall be first
+            return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first
        }
        
    }

+    /**
+     * headMatchingComparator sorts words that start with the given head (compared in lower case) before all other words; within each group the secondaryComparator decides the order
+     */
+    private static class headMatchingComparator implements Comparator<String> {
+        private final String head; // lower-cased prefix that preferred words must start with
+        private final Comparator<String> secondaryComparator; // order used when both or neither word start with head
+        public headMatchingComparator(String head, Comparator<String> secondaryComparator) {
+            this.head = head.toLowerCase(); // normalize once here; compare() lower-cases each candidate word
+            this.secondaryComparator = secondaryComparator;
+        }
+        
+        public int compare(final String o1, final String o2) {
+            boolean o1m = o1.toLowerCase().startsWith(head);
+            boolean o2m = o2.toLowerCase().startsWith(head);
+            if ((o1m && o2m) || (!o1m && !o2m)) return secondaryComparator.compare(o1, o2); // same group: delegate to the secondary order
+            return o1m ? -1 : 1; // exactly one word starts with head: that word is ordered first
+        }
+    }
+    
 }


--- a/source/de/anomic/data/DidYouMeanLibrary.java
+++ b/source/de/anomic/data/DidYouMeanLibrary.java
@ -33,12 +33,14 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
+import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeSet;
 import java.util.zip.GZIPInputStream;

-import net.yacy.cora.storage.DynamicScore;
+import net.yacy.cora.storage.IntScore;
 import net.yacy.cora.storage.ScoreMap;
 import net.yacy.kelondro.logging.Log;

@ -50,8 +52,8 @@ public class DidYouMeanLibrary {
    
    // common word cache
    private static final int commonWordsMaxSize = 100000; // maximum size of common word cache
-    private static final int commonWordsMinLength = 4;    // words must have that length at minimum
-    private DynamicScore<String> commonWords = new ScoreMap<String>();    
+    private static final int commonWordsMinLength = 5;    // words must have that length at minimum
+    private ScoreMap<String> commonWords = new ScoreMap<String>(String.CASE_INSENSITIVE_ORDER);    
    
    // dictionaries
    private final File dictionaryPath;
@ -76,10 +78,9 @@ public class DidYouMeanLibrary {
     */
    public void learn(String word) {
        if (word == null) return;
-        word = word.trim().toLowerCase();
        if (word.length() < commonWordsMinLength) return;
        commonWords.inc(word);
-        if (commonWords.size() >= commonWordsMaxSize) {
+        if (commonWords.size() > commonWordsMaxSize) {
            commonWords.shrinkToMaxSize(commonWordsMaxSize / 2);
        }
    }
@ -140,6 +141,12 @@ public class DidYouMeanLibrary {
        for (final String r: t) {
            if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break;
        }
+        SortedMap<String, IntScore> u = this.commonWords.tailMap(string);
+        String vv;
+        for (final Map.Entry<String, IntScore> v: u.entrySet()) {
+            vv = v.getKey();
+            if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break;
+        }
        string = reverse(string);
        t = this.tcid.tailSet(string);
        for (final String r: t) {
--- a/source/de/anomic/search/DocumentIndex.java
+++ b/source/de/anomic/search/DocumentIndex.java
@ -35,6 +35,8 @@ import java.util.Date;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;

+import de.anomic.data.LibraryProvider;
+
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.TextParser;
@ -133,7 +135,7 @@ public class DocumentIndex extends Segment {
            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
        }
        Document document = Document.mergeDocuments(url, null, documents);
-        final Condenser condenser = new Condenser(document, true, true);
+        final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
        return super.storeDocument(
                url,
                null,
--- a/source/de/anomic/search/MediaSnippet.java
+++ b/source/de/anomic/search/MediaSnippet.java
@ -206,7 +206,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
    private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
        // remove all hashes that appear in the sentence
        if (sentence == null) return queryhashes;
-        final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
+        final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
        final Iterator<byte[]> j = queryhashes.iterator();
        byte[] hash;
        Integer pos;
--- a/source/de/anomic/search/QueryParams.java
+++ b/source/de/anomic/search/QueryParams.java
@ -294,7 +294,7 @@ public final class QueryParams {
     */
    public final boolean matchesText(final String text) {
        boolean ret = false;
-        final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
+        final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
        if (!SetTools.anymatch(wordhashes, this.excludeHashes)) {
            ret = SetTools.totalInclusion(this.queryHashes, wordhashes);
        }
@ -304,7 +304,7 @@ public final class QueryParams {
    protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
    	// returns true if any of the word hashes in keyhashes appear in the String text
    	// to do this, all words in the string must be recognized and transcoded to word hashes
-    	final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
+    	final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
    	return SetTools.anymatch(wordhashes, keyhashes);
    }
    
--- a/source/de/anomic/search/ResultEntry.java
+++ b/source/de/anomic/search/ResultEntry.java
@ -89,7 +89,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
                            ("yacyshare " +
                             filename.replace('?', ' ') +
                             " " +
-                             urlcomps.dc_title())).keySet()),
+                             urlcomps.dc_title()), null).keySet()),
                             urlentry.hash());
                } catch (IOException e) {
                    Log.logException(e);
--- a/source/de/anomic/search/Segment.java
+++ b/source/de/anomic/search/Segment.java
@ -424,7 +424,7 @@ public class Segment {
            // get the word set
            Set<String> words = null;
            try {
-                words = new Condenser(document, true, true).words().keySet();
+                words = new Condenser(document, true, true, null).words().keySet();
            } catch (final UnsupportedEncodingException e) {
                Log.logException(e);
            }
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@ -1855,7 +1855,7 @@ public final class Switchboard extends serverSwitch {
        for (int i = 0; i < in.documents.length; i++) {
            // strip out words and generate statistics
            try {
-                condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia());
+                condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);
    
                // update image result list statistics
                // its good to do this concurrently here, because it needs a DNS lookup
@ -2035,7 +2035,7 @@ public final class Switchboard extends serverSwitch {
                Document[] documents = response.parse();
                if (documents != null) for (Document document: documents) {
                    if (document.indexingDenied()) throw new Parser.Failure("indexing is denied", url);
-                    Condenser condenser = new Condenser(document, true, true);
+                    Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
                    ResultImages.registerImages(url, document, true);
                    webStructure.generateCitationReference(url, document, condenser, response.lastModified());
                    storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName);
--- a/source/de/anomic/search/TextSnippet.java
+++ b/source/de/anomic/search/TextSnippet.java
@ -417,7 +417,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
    }
    
    private static boolean containsAllHashes(final String sentence, final HandleSet queryhashes) {
-        final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence);
+        final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence, null);
        for (byte[] b: queryhashes) {
            if (!(m.containsKey(b))) return false;
        }
--- a/source/net/yacy/cora/storage/ScoreMap.java
+++ b/source/net/yacy/cora/storage/ScoreMap.java
@ -28,6 +28,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;

@ -184,6 +185,14 @@ public class ScoreMap<E> implements DynamicScore<E> {
        return score.intValue();
    }
    
+    public SortedMap<E, IntScore> tailMap(E obj) { // sorted view of the entries whose keys are greater than or equal to obj
+        if (this.map instanceof TreeMap) { // only a TreeMap-backed score map can provide a tail view
+            return ((TreeMap<E, IntScore>) this.map).tailMap(obj);
+        }
+        throw new UnsupportedOperationException("map must have comparator"); // reached whenever the backing map is not a TreeMap
+    }
+    
+    
    public int getMaxScore() {
        if (map.isEmpty()) return -1;
        int maxScore = Integer.MIN_VALUE;
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -42,6 +42,8 @@ import java.util.Properties;
 import java.util.TreeMap;
 import java.util.TreeSet;

+import de.anomic.data.DidYouMeanLibrary;
+
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.language.Identificator;
 import net.yacy.document.parser.html.ContentScraper;
@ -55,7 +57,7 @@ import net.yacy.kelondro.util.SetTools;


 public final class Condenser {
-
+    
    // this is the page analysis class
    public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
    public final static int wordminsize = 2;
@ -108,7 +110,8 @@ public final class Condenser {
    public Condenser(
            final Document document,
            final boolean indexText,
-            final boolean indexMedia
+            final boolean indexMedia,
+            final DidYouMeanLibrary meaningLib
            ) throws UnsupportedEncodingException {
        // if addMedia == true, then all the media links are also parsed and added to the words
        // added media words are flagged with the appropriate media flag
@ -126,7 +129,7 @@ public final class Condenser {
        
        Map.Entry<MultiProtocolURI, String> entry;
        if (indexText) {
-            createCondensement(document.getText());        
+            createCondensement(document.getText(), meaningLib);        
            // the phrase counter:
            // phrase   0 are words taken from the URL
            // phrase   1 is the MainTitle
@ -140,15 +143,15 @@ public final class Condenser {
            // phrase  99 is taken from the media Link url and anchor description
            // phrase 100 and above are lines from the text
      
-            insertTextToWords(document.dc_title(),       1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_creator(),     4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_publisher(),   5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_subject(' '),  6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
+            insertTextToWords(document.dc_title(),       1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_creator(),     4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_publisher(),   5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_subject(' '),  6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
            // missing: tags!
            final String[] titles = document.getSectionTitles();
            for (int i = 0; i < titles.length; i++) {
-                insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true);
+                insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true, meaningLib);
            }
            
            // anchors: for text indexing we add only the anchor description
@ -173,7 +176,7 @@ public final class Condenser {
        }
        
        // add the URL components to the word list
-        insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false);
+        insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false, meaningLib);

        if (indexMedia) {
            // add anchor descriptions: here, we also add the url components
@ -181,24 +184,24 @@ public final class Condenser {
            Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true, meaningLib);
            }

            // video
            i = document.getVideolinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true, meaningLib);
            }

            // applications
            i = document.getApplinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true, meaningLib);
            }

            // images
@ -206,8 +209,8 @@ public final class Condenser {
            ImageEntry ientry;
            while (j.hasNext()) {
                ientry = j.next();
-                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false);
-                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true);
+                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
            }
        
            // finally check all words for missing flag entry
@ -225,12 +228,18 @@ public final class Condenser {
        }
    }
    
-    private void insertTextToWords(final String text, final int phrase, final int flagpos, final Bitfield flagstemplate, boolean useForLanguageIdentification) {
+    private void insertTextToWords(
+            final String text,
+            final int phrase,
+            final int flagpos,
+            final Bitfield flagstemplate,
+            boolean useForLanguageIdentification,
+            DidYouMeanLibrary meaningLib) {
        String word;
        Word wprop;
        sievedWordsEnum wordenum;
        try {
-            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")));
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")), meaningLib);
        } catch (final UnsupportedEncodingException e) {
            return;
        }
@ -250,11 +259,11 @@ public final class Condenser {
        }
    }

-    public Condenser(final InputStream text) throws UnsupportedEncodingException {
+    public Condenser(final InputStream text, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
        this.languageIdentificator = null; // we don't need that here
        // analysis = new Properties();
        words = new TreeMap<String, Word>();
-        createCondensement(text);
+        createCondensement(text, meaningLib);
    }
    
    public int excludeWords(final TreeSet<String> stopwords) {
@ -274,7 +283,7 @@ public final class Condenser {
        return this.languageIdentificator.getLanguage();
    }

-    private void createCondensement(final InputStream is) throws UnsupportedEncodingException {
+    private void createCondensement(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
        final HashSet<String> currsentwords = new HashSet<String>();
        StringBuilder sentence = new StringBuilder(100);
        String word = "";
@ -293,7 +302,7 @@ public final class Condenser {
        final HashMap<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
        
        // read source
-        final sievedWordsEnum wordenum = new sievedWordsEnum(is);
+        final sievedWordsEnum wordenum = new sievedWordsEnum(is, meaningLib);
        while (wordenum.hasMoreElements()) {
            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
            if (languageIdentificator != null) languageIdentificator.add(word);
@ -467,11 +476,11 @@ public final class Condenser {
     * @param sentence the sentence to be tokenized
     * @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering
     */
-    public static TreeMap<byte[], Integer> hashSentence(final String sentence) {
+    public static TreeMap<byte[], Integer> hashSentence(final String sentence, DidYouMeanLibrary meaningLib) {
        final TreeMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
-        final Enumeration<StringBuilder> words = wordTokenizer(sentence, "UTF-8");
+        final Enumeration<String> words = wordTokenizer(sentence, "UTF-8", meaningLib);
        int pos = 0;
-        StringBuilder word;
+        String word;
        byte[] hash;
        Integer oldpos;
        while (words.hasMoreElements()) {
@ -487,23 +496,25 @@ public final class Condenser {
        return map;
    }
    
-    public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) {
+    public static Enumeration<String> wordTokenizer(final String s, final String charset, DidYouMeanLibrary meaningLib) {
        try {
-            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
+            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)), meaningLib);
        } catch (final Exception e) {
            return null;
        }
    }
 	
-    public static class sievedWordsEnum implements Enumeration<StringBuilder> {
+    public static class sievedWordsEnum implements Enumeration<String> {
        // this enumeration removes all words that contain either wrong characters or are too short
        
        StringBuilder buffer = null;
        unsievedWordsEnum e;
+        DidYouMeanLibrary meaningLib;

-        public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
-            e = new unsievedWordsEnum(is);
-            buffer = nextElement0();
+        public sievedWordsEnum(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
+            this.e = new unsievedWordsEnum(is);
+            this.buffer = nextElement0();
+            this.meaningLib = meaningLib;
        }

        public void pre(final boolean x) {
@ -527,9 +538,11 @@ public final class Condenser {
            return buffer != null;
        }

-        public StringBuilder nextElement() {
-            final StringBuilder r = buffer;
+        public String nextElement() {
+            final String r = (buffer == null) ? null : buffer.toString();
            buffer = nextElement0();
+            // put word to words statistics cache
+            if (meaningLib != null) meaningLib.learn(r);
            return r;
        }

@ -710,7 +723,7 @@ public final class Condenser {
        return s;
    }

-    public static Map<String, Word> getWords(final String text) {
+    public static Map<String, Word> getWords(final String text, DidYouMeanLibrary meaningLib) {
        // returns a word/indexWord relation map
        if (text == null) return null;
        ByteArrayInputStream buffer;
@ -720,7 +733,7 @@ public final class Condenser {
 			buffer = new ByteArrayInputStream(text.getBytes());
 		}
        try {
-            return new Condenser(buffer).words();
+            return new Condenser(buffer, meaningLib).words();
        } catch (final UnsupportedEncodingException e) {
            return null;
        }
--- a/source/net/yacy/document/SnippetExtractor.java
+++ b/source/net/yacy/document/SnippetExtractor.java
@ -45,7 +45,7 @@ public class SnippetExtractor {
        int linenumber = 0;
        int fullmatchcounter = 0;
        lookup: for (StringBuilder sentence: sentences) {
-            hs = Condenser.hashSentence(sentence.toString());
+            hs = Condenser.hashSentence(sentence.toString(), null);
            positions = new TreeSet<Integer>();
            for (byte[] word: queryhashes) {
                pos = hs.get(word);
@ -124,7 +124,7 @@ public class SnippetExtractor {
            byte[] hash;
            
            // find all hashes that appear in the sentence
-            final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
+            final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
            final Iterator<byte[]> j = queryhashes.iterator();
            Integer pos;
            int p, minpos = sentence.length(), maxpos = -1;
--- a/source/net/yacy/document/parser/torrentParser.java
+++ b/source/net/yacy/document/parser/torrentParser.java
@ -28,6 +28,8 @@ import java.io.UnsupportedEncodingException;
 import java.util.List;
 import java.util.Map;

+import de.anomic.data.LibraryProvider;
+
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Condenser;
@ -109,7 +111,7 @@ public class torrentParser extends AbstractParser implements Parser {
            byte[] b = FileUtils.read(new File(args[0]));
            torrentParser parser = new torrentParser();
            Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", new ByteArrayInputStream(b));
-            Condenser c = new Condenser(d[0], true, true);
+            Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib);
            Map<String, Word> w = c.words();
            for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
        } catch (IOException e) {