From 58e74282af70f74d293272d1a71cfe0710105361 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Mon, 18 Oct 2010 11:35:09 +0000
Subject: [PATCH] added a word counter statistic in condenser which is used by
 the did-you-mean to calculate best matches for given search words.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7258 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/ViewFile.java                          |  5 +-
 source/de/anomic/data/DidYouMean.java         | 35 ++++++--
 source/de/anomic/data/DidYouMeanLibrary.java  | 17 ++--
 source/de/anomic/search/DocumentIndex.java    |  4 +-
 source/de/anomic/search/MediaSnippet.java     |  2 +-
 source/de/anomic/search/QueryParams.java      |  4 +-
 source/de/anomic/search/ResultEntry.java      |  2 +-
 source/de/anomic/search/Segment.java          |  2 +-
 source/de/anomic/search/Switchboard.java      |  4 +-
 source/de/anomic/search/TextSnippet.java      |  2 +-
 source/net/yacy/cora/storage/ScoreMap.java    |  9 ++
 source/net/yacy/document/Condenser.java       | 87 +++++++++++--------
 .../net/yacy/document/SnippetExtractor.java   |  4 +-
 .../yacy/document/parser/torrentParser.java   |  4 +-
 14 files changed, 118 insertions(+), 63 deletions(-)
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index df999f1bc..791ddd83c 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -47,6 +47,7 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
 
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
+import de.anomic.data.LibraryProvider;
 import de.anomic.http.client.Cache;
 import de.anomic.search.Segment;
 import de.anomic.search.Segments;
@@ -277,9 +278,9 @@ public class ViewFile {
                     // Search word highlighting
                     for (StringBuilder s: sentences) {
                         sentence = s.toString();
-                        Enumeration<StringBuilder> tokens = Condenser.wordTokenizer(sentence, "UTF-8");
+                        Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
                         while (tokens.hasMoreElements()) {
-                            token = tokens.nextElement().toString();
+                            token = tokens.nextElement();
                             if (token.length() > 0) {
                                 prop.put("viewMode_words_" + i + "_nr", i + 1);
                                 prop.put("viewMode_words_" + i + "_word", token);
diff --git a/source/de/anomic/data/DidYouMean.java b/source/de/anomic/data/DidYouMean.java
index 89949f109..d6081a56c 100644
--- a/source/de/anomic/data/DidYouMean.java
+++ b/source/de/anomic/data/DidYouMean.java
@@ -63,6 +63,7 @@ public class DidYouMean {
     private long timeLimit;
     private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
     private final SortedSet<String> resultSet;
+    private final indexSizeComparator INDEX_SIZE_COMPARATOR;
     
 	
     /**
@@ -70,13 +71,14 @@ public class DidYouMean {
      * @param sort true/false -  sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
      */
     public DidYouMean(final IndexCell<WordReference> index, String word0) {
-        this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(WORD_LENGTH_COMPARATOR));
+        this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
         this.word = word0.toLowerCase();
         this.wordLen = word.length();
         this.index = index;
         this.guessGen = new LinkedBlockingQueue<String>();
         this.guessLib = new LinkedBlockingQueue<String>();
         this.createGen = true;
+        this.INDEX_SIZE_COMPARATOR = new indexSizeComparator();
         
         // identify language
         if (this.word.length() == 0) {
@@ -134,7 +136,7 @@ public class DidYouMean {
             if (scored.size() >= 2 * preSortSelection) break;
             scored.inc(s, index.count(Word.word2hash(s)));
         }
-        SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
+        SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
         int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
         while (scored.size() > 0 && countSorted.size() < preSortSelection) {
             String s = scored.getMaxKey();
@@ -351,9 +353,9 @@ public class DidYouMean {
                 } catch (InterruptedException e) {}
             }
 	}
-	
+    
     /**
-     * indexSizeComparator is used by DidYouMean to order terms by index.count()<p/>
+     * indexSizeComparator is used by DidYouMean to order terms by index.count().
      * <b>Warning:</b> this causes heavy i/o
      */
     private class indexSizeComparator implements Comparator<String> {
@@ -363,11 +365,11 @@ public class DidYouMean {
             final int i2 = index.count(Word.word2hash(o2));
             if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2);
             return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
-    	}    	
+        }       
     }
     
     /**
-     * wordLengthComparator is used by DidYouMean to order terms by the term length<p/>
+     * wordLengthComparator is used by DidYouMean to order terms by the term length.
      * This is the default order if the indexSizeComparator is not used
      */
     private static class wordLengthComparator implements Comparator<String> {
@@ -376,11 +378,30 @@ public class DidYouMean {
             final int i1 = o1.length();
             final int i2 = o2.length();
             if (i1 == i2) return o1.compareTo(o2);
-            return (i1 > i2) ? 1 : -1; // '>' is correct, because the shortest word shall be first
+            return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first
         }
         
     }
 
+    /**
+     * Comparator that sorts words starting with the given head (ignoring case) before all other words;
+     * within each of the two groups the order is delegated to the secondary comparator.
+     */
+    private static class headMatchingComparator implements Comparator<String> {
+        private final String head; // prefix to look for, stored lower-cased
+        private final Comparator<String> secondaryComparator; // decides the order inside each group
+        public headMatchingComparator(String head, Comparator<String> secondaryComparator) {
+            this.head = head.toLowerCase(java.util.Locale.ENGLISH); // fixed locale: a Turkish default locale maps 'I' to a dotless i
+            this.secondaryComparator = secondaryComparator;
+        }
+        public int compare(final String o1, final String o2) {
+            final boolean o1m = o1.toLowerCase(java.util.Locale.ENGLISH).startsWith(this.head);
+            final boolean o2m = o2.toLowerCase(java.util.Locale.ENGLISH).startsWith(this.head);
+            if (o1m == o2m) return this.secondaryComparator.compare(o1, o2); // same group: secondary order decides
+            return o1m ? -1 : 1; // the word that starts with the head goes first
+        }
+    }
+    
 }
 
 
diff --git a/source/de/anomic/data/DidYouMeanLibrary.java b/source/de/anomic/data/DidYouMeanLibrary.java
index cca665499..16d1f905a 100644
--- a/source/de/anomic/data/DidYouMeanLibrary.java
+++ b/source/de/anomic/data/DidYouMeanLibrary.java
@@ -33,12 +33,14 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
+import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeSet;
 import java.util.zip.GZIPInputStream;
 
-import net.yacy.cora.storage.DynamicScore;
+import net.yacy.cora.storage.IntScore;
 import net.yacy.cora.storage.ScoreMap;
 import net.yacy.kelondro.logging.Log;
 
@@ -50,8 +52,8 @@ public class DidYouMeanLibrary {
     
     // common word cache
     private static final int commonWordsMaxSize = 100000; // maximum size of common word cache
-    private static final int commonWordsMinLength = 4;    // words must have that length at minimum
-    private DynamicScore<String> commonWords = new ScoreMap<String>();    
+    private static final int commonWordsMinLength = 5;    // words must have that length at minimum
+    private ScoreMap<String> commonWords = new ScoreMap<String>(String.CASE_INSENSITIVE_ORDER);    
     
     // dictionaries
     private final File dictionaryPath;
@@ -76,10 +78,9 @@ public class DidYouMeanLibrary {
      */
     public void learn(String word) {
         if (word == null) return;
-        word = word.trim().toLowerCase();
         if (word.length() < commonWordsMinLength) return;
         commonWords.inc(word);
-        if (commonWords.size() >= commonWordsMaxSize) {
+        if (commonWords.size() > commonWordsMaxSize) {
             commonWords.shrinkToMaxSize(commonWordsMaxSize / 2);
         }
     }
@@ -140,6 +141,12 @@ public class DidYouMeanLibrary {
         for (final String r: t) {
             if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break;
         }
+        final SortedMap<String, IntScore> u = this.commonWords.tailMap(string); // ordered ignoring case; keys keep the case they were learned in
+        for (final Map.Entry<String, IntScore> v: u.entrySet()) {
+            final String vv = v.getKey();
+            if (!vv.regionMatches(true, 0, string, 0, string.length())) break; // prefix test ignores case like the map order; past the prefix range
+            if (vv.length() > string.length()) ret.add(vv); // the search word itself is no recommendation, but longer words may follow it
+        }
         string = reverse(string);
         t = this.tcid.tailSet(string);
         for (final String r: t) {
diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java
index 0888addd5..a1c16f1f9 100644
--- a/source/de/anomic/search/DocumentIndex.java
+++ b/source/de/anomic/search/DocumentIndex.java
@@ -35,6 +35,8 @@ import java.util.Date;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 
+import de.anomic.data.LibraryProvider;
+
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.TextParser;
@@ -133,7 +135,7 @@ public class DocumentIndex extends Segment {
             throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
         }
         Document document = Document.mergeDocuments(url, null, documents);
-        final Condenser condenser = new Condenser(document, true, true);
+        final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
         return super.storeDocument(
                 url,
                 null,
diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java
index a4240320a..07c27249c 100644
--- a/source/de/anomic/search/MediaSnippet.java
+++ b/source/de/anomic/search/MediaSnippet.java
@@ -206,7 +206,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
     private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
         // remove all hashes that appear in the sentence
         if (sentence == null) return queryhashes;
-        final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
+        final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
         final Iterator<byte[]> j = queryhashes.iterator();
         byte[] hash;
         Integer pos;
diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java
index c209e8fe6..79abf7753 100644
--- a/source/de/anomic/search/QueryParams.java
+++ b/source/de/anomic/search/QueryParams.java
@@ -294,7 +294,7 @@ public final class QueryParams {
      */
     public final boolean matchesText(final String text) {
         boolean ret = false;
-        final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
+        final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
         if (!SetTools.anymatch(wordhashes, this.excludeHashes)) {
             ret = SetTools.totalInclusion(this.queryHashes, wordhashes);
         }
@@ -304,7 +304,7 @@ public final class QueryParams {
     protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
     	// returns true if any of the word hashes in keyhashes appear in the String text
     	// to do this, all words in the string must be recognized and transcoded to word hashes
-    	final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
+    	final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
     	return SetTools.anymatch(wordhashes, keyhashes);
     }
     
diff --git a/source/de/anomic/search/ResultEntry.java b/source/de/anomic/search/ResultEntry.java
index 96983aa33..6f6776828 100644
--- a/source/de/anomic/search/ResultEntry.java
+++ b/source/de/anomic/search/ResultEntry.java
@@ -89,7 +89,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
                             ("yacyshare " +
                              filename.replace('?', ' ') +
                              " " +
-                             urlcomps.dc_title())).keySet()),
+                             urlcomps.dc_title()), null).keySet()),
                              urlentry.hash());
                 } catch (IOException e) {
                     Log.logException(e);
diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java
index 5e6409379..06f859127 100644
--- a/source/de/anomic/search/Segment.java
+++ b/source/de/anomic/search/Segment.java
@@ -424,7 +424,7 @@ public class Segment {
             // get the word set
             Set<String> words = null;
             try {
-                words = new Condenser(document, true, true).words().keySet();
+                words = new Condenser(document, true, true, null).words().keySet();
             } catch (final UnsupportedEncodingException e) {
                 Log.logException(e);
             }
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index a8f5f01c8..cb3e827b2 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1855,7 +1855,7 @@ public final class Switchboard extends serverSwitch {
         for (int i = 0; i < in.documents.length; i++) {
             // strip out words and generate statistics
             try {
-                condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia());
+                condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);
     
                 // update image result list statistics
                 // its good to do this concurrently here, because it needs a DNS lookup
@@ -2035,7 +2035,7 @@ public final class Switchboard extends serverSwitch {
                 Document[] documents = response.parse();
                 if (documents != null) for (Document document: documents) {
                     if (document.indexingDenied()) throw new Parser.Failure("indexing is denied", url);
-                    Condenser condenser = new Condenser(document, true, true);
+                    Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
                     ResultImages.registerImages(url, document, true);
                     webStructure.generateCitationReference(url, document, condenser, response.lastModified());
                     storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName);
diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java
index 7827da6af..150326cbe 100644
--- a/source/de/anomic/search/TextSnippet.java
+++ b/source/de/anomic/search/TextSnippet.java
@@ -417,7 +417,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
     }
     
     private static boolean containsAllHashes(final String sentence, final HandleSet queryhashes) {
-        final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence);
+        final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence, null);
         for (byte[] b: queryhashes) {
             if (!(m.containsKey(b))) return false;
         }
diff --git a/source/net/yacy/cora/storage/ScoreMap.java b/source/net/yacy/cora/storage/ScoreMap.java
index 77878073f..7475a05c5 100644
--- a/source/net/yacy/cora/storage/ScoreMap.java
+++ b/source/net/yacy/cora/storage/ScoreMap.java
@@ -28,6 +28,7 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
@@ -184,6 +185,14 @@ public class ScoreMap<E> implements DynamicScore<E> {
         return score.intValue();
     }
     
+    /** Returns the part of the score map whose keys are greater than or equal to obj; works only on a TreeMap backing map. */
+    public SortedMap<E, IntScore> tailMap(E obj) {
+        if (!(this.map instanceof TreeMap)) {
+            throw new UnsupportedOperationException("map must have comparator"); // no key order to cut a tail from
+        }
+        return ((TreeMap<E, IntScore>) this.map).tailMap(obj);
+    }
+    
     public int getMaxScore() {
         if (map.isEmpty()) return -1;
         int maxScore = Integer.MIN_VALUE;
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index 99ffa60e6..ef786c133 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -42,6 +42,8 @@ import java.util.Properties;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
+import de.anomic.data.DidYouMeanLibrary;
+
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.language.Identificator;
 import net.yacy.document.parser.html.ContentScraper;
@@ -55,7 +57,7 @@ import net.yacy.kelondro.util.SetTools;
 
 
 public final class Condenser {
-
+    
     // this is the page analysis class
     public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
     public final static int wordminsize = 2;
@@ -108,7 +110,8 @@ public final class Condenser {
     public Condenser(
             final Document document,
             final boolean indexText,
-            final boolean indexMedia
+            final boolean indexMedia,
+            final DidYouMeanLibrary meaningLib
             ) throws UnsupportedEncodingException {
         // if addMedia == true, then all the media links are also parsed and added to the words
         // added media words are flagged with the appropriate media flag
@@ -126,7 +129,7 @@ public final class Condenser {
         
         Map.Entry<MultiProtocolURI, String> entry;
         if (indexText) {
-            createCondensement(document.getText());        
+            createCondensement(document.getText(), meaningLib);        
             // the phrase counter:
             // phrase   0 are words taken from the URL
             // phrase   1 is the MainTitle
@@ -140,15 +143,15 @@ public final class Condenser {
             // phrase  99 is taken from the media Link url and anchor description
             // phrase 100 and above are lines from the text
       
-            insertTextToWords(document.dc_title(),       1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_creator(),     4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_publisher(),   5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
-            insertTextToWords(document.dc_subject(' '),  6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
+            insertTextToWords(document.dc_title(),       1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_creator(),     4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_publisher(),   5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
+            insertTextToWords(document.dc_subject(' '),  6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
             // missing: tags!
             final String[] titles = document.getSectionTitles();
             for (int i = 0; i < titles.length; i++) {
-                insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true);
+                insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true, meaningLib);
             }
             
             // anchors: for text indexing we add only the anchor description
@@ -173,7 +176,7 @@ public final class Condenser {
         }
         
         // add the URL components to the word list
-        insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false);
+        insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false, meaningLib);
 
         if (indexMedia) {
             // add anchor descriptions: here, we also add the url components
@@ -181,24 +184,24 @@ public final class Condenser {
             Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
             while (i.hasNext()) {
                 entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true, meaningLib);
             }
 
             // video
             i = document.getVideolinks().entrySet().iterator();
             while (i.hasNext()) {
                 entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true, meaningLib);
             }
 
             // applications
             i = document.getApplinks().entrySet().iterator();
             while (i.hasNext()) {
                 entry = i.next();
-                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false);
-                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true);
+                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true, meaningLib);
             }
 
             // images
@@ -206,8 +209,8 @@ public final class Condenser {
             ImageEntry ientry;
             while (j.hasNext()) {
                 ientry = j.next();
-                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false);
-                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true);
+                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
+                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
             }
         
             // finally check all words for missing flag entry
@@ -225,12 +228,18 @@ public final class Condenser {
         }
     }
     
-    private void insertTextToWords(final String text, final int phrase, final int flagpos, final Bitfield flagstemplate, boolean useForLanguageIdentification) {
+    private void insertTextToWords(
+            final String text,
+            final int phrase,
+            final int flagpos,
+            final Bitfield flagstemplate,
+            boolean useForLanguageIdentification,
+            DidYouMeanLibrary meaningLib) {
         String word;
         Word wprop;
         sievedWordsEnum wordenum;
         try {
-            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")));
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")), meaningLib);
         } catch (final UnsupportedEncodingException e) {
             return;
         }
@@ -250,11 +259,11 @@ public final class Condenser {
         }
     }
 
-    public Condenser(final InputStream text) throws UnsupportedEncodingException {
+    public Condenser(final InputStream text, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
         this.languageIdentificator = null; // we don't need that here
         // analysis = new Properties();
         words = new TreeMap<String, Word>();
-        createCondensement(text);
+        createCondensement(text, meaningLib);
     }
     
     public int excludeWords(final TreeSet<String> stopwords) {
@@ -274,7 +283,7 @@ public final class Condenser {
         return this.languageIdentificator.getLanguage();
     }
 
-    private void createCondensement(final InputStream is) throws UnsupportedEncodingException {
+    private void createCondensement(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
         final HashSet<String> currsentwords = new HashSet<String>();
         StringBuilder sentence = new StringBuilder(100);
         String word = "";
@@ -293,7 +302,7 @@ public final class Condenser {
         final HashMap<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
         
         // read source
-        final sievedWordsEnum wordenum = new sievedWordsEnum(is);
+        final sievedWordsEnum wordenum = new sievedWordsEnum(is, meaningLib);
         while (wordenum.hasMoreElements()) {
             word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
             if (languageIdentificator != null) languageIdentificator.add(word);
@@ -467,11 +476,11 @@ public final class Condenser {
      * @param sentence the sentence to be tokenized
      * @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering
      */
-    public static TreeMap<byte[], Integer> hashSentence(final String sentence) {
+    public static TreeMap<byte[], Integer> hashSentence(final String sentence, DidYouMeanLibrary meaningLib) {
         final TreeMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
-        final Enumeration<StringBuilder> words = wordTokenizer(sentence, "UTF-8");
+        final Enumeration<String> words = wordTokenizer(sentence, "UTF-8", meaningLib);
         int pos = 0;
-        StringBuilder word;
+        String word;
         byte[] hash;
         Integer oldpos;
         while (words.hasMoreElements()) {
@@ -487,23 +496,25 @@ public final class Condenser {
         return map;
     }
     
-    public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) {
+    public static Enumeration<String> wordTokenizer(final String s, final String charset, DidYouMeanLibrary meaningLib) {
         try {
-            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
+            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)), meaningLib);
         } catch (final Exception e) {
             return null;
         }
     }
 	
-    public static class sievedWordsEnum implements Enumeration<StringBuilder> {
+    public static class sievedWordsEnum implements Enumeration<String> {
         // this enumeration removes all words that contain either wrong characters or are too short
         
         StringBuilder buffer = null;
         unsievedWordsEnum e;
+        DidYouMeanLibrary meaningLib;
 
-        public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
-            e = new unsievedWordsEnum(is);
-            buffer = nextElement0();
+        public sievedWordsEnum(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
+            this.e = new unsievedWordsEnum(is);
+            this.buffer = nextElement0();
+            this.meaningLib = meaningLib;
         }
 
         public void pre(final boolean x) {
@@ -527,9 +538,11 @@ public final class Condenser {
             return buffer != null;
         }
 
-        public StringBuilder nextElement() {
-            final StringBuilder r = buffer;
+        public String nextElement() {
+            final String r = (buffer == null) ? null : buffer.toString(); // the word to hand out; null if nothing is buffered
             buffer = nextElement0();
+            // report the handed-out word to the word statistics library, if one was supplied (r may be null here)
+            if (meaningLib != null) meaningLib.learn(r);
             return r;
         }
 
@@ -710,7 +723,7 @@ public final class Condenser {
         return s;
     }
 
-    public static Map<String, Word> getWords(final String text) {
+    public static Map<String, Word> getWords(final String text, DidYouMeanLibrary meaningLib) {
         // returns a word/indexWord relation map
         if (text == null) return null;
         ByteArrayInputStream buffer;
@@ -720,7 +733,7 @@ public final class Condenser {
 			buffer = new ByteArrayInputStream(text.getBytes());
 		}
         try {
-            return new Condenser(buffer).words();
+            return new Condenser(buffer, meaningLib).words();
         } catch (final UnsupportedEncodingException e) {
             return null;
         }
diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java
index 031e3133c..ada609e58 100644
--- a/source/net/yacy/document/SnippetExtractor.java
+++ b/source/net/yacy/document/SnippetExtractor.java
@@ -45,7 +45,7 @@ public class SnippetExtractor {
         int linenumber = 0;
         int fullmatchcounter = 0;
         lookup: for (StringBuilder sentence: sentences) {
-            hs = Condenser.hashSentence(sentence.toString());
+            hs = Condenser.hashSentence(sentence.toString(), null);
             positions = new TreeSet<Integer>();
             for (byte[] word: queryhashes) {
                 pos = hs.get(word);
@@ -124,7 +124,7 @@ public class SnippetExtractor {
             byte[] hash;
             
             // find all hashes that appear in the sentence
-            final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
+            final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
             final Iterator<byte[]> j = queryhashes.iterator();
             Integer pos;
             int p, minpos = sentence.length(), maxpos = -1;
diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java
index 55d55648e..810b73e20 100644
--- a/source/net/yacy/document/parser/torrentParser.java
+++ b/source/net/yacy/document/parser/torrentParser.java
@@ -28,6 +28,8 @@ import java.io.UnsupportedEncodingException;
 import java.util.List;
 import java.util.Map;
 
+import de.anomic.data.LibraryProvider;
+
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Condenser;
@@ -109,7 +111,7 @@ public class torrentParser extends AbstractParser implements Parser {
             byte[] b = FileUtils.read(new File(args[0]));
             torrentParser parser = new torrentParser();
             Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", new ByteArrayInputStream(b));
-            Condenser c = new Condenser(d[0], true, true);
+            Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib);
             Map<String, Word> w = c.words();
             for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
         } catch (IOException e) {