From 58e74282af70f74d293272d1a71cfe0710105361 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 18 Oct 2010 11:35:09 +0000 Subject: [PATCH] added a word counter statistic in condenser which is used by the did-you-mean to calculate best matches for given search words. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7258 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 5 +- source/de/anomic/data/DidYouMean.java | 35 ++++++-- source/de/anomic/data/DidYouMeanLibrary.java | 17 ++-- source/de/anomic/search/DocumentIndex.java | 4 +- source/de/anomic/search/MediaSnippet.java | 2 +- source/de/anomic/search/QueryParams.java | 4 +- source/de/anomic/search/ResultEntry.java | 2 +- source/de/anomic/search/Segment.java | 2 +- source/de/anomic/search/Switchboard.java | 4 +- source/de/anomic/search/TextSnippet.java | 2 +- source/net/yacy/cora/storage/ScoreMap.java | 9 ++ source/net/yacy/document/Condenser.java | 87 +++++++++++-------- .../net/yacy/document/SnippetExtractor.java | 4 +- .../yacy/document/parser/torrentParser.java | 4 +- 14 files changed, 118 insertions(+), 63 deletions(-) diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index df999f1bc..791ddd83c 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -47,6 +47,7 @@ import net.yacy.kelondro.data.meta.URIMetadataRow; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; +import de.anomic.data.LibraryProvider; import de.anomic.http.client.Cache; import de.anomic.search.Segment; import de.anomic.search.Segments; @@ -277,9 +278,9 @@ public class ViewFile { // Search word highlighting for (StringBuilder s: sentences) { sentence = s.toString(); - Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8"); + Enumeration tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib); while (tokens.hasMoreElements()) { - token = tokens.nextElement().toString(); + token = tokens.nextElement(); if (token.length() > 0) { prop.put("viewMode_words_" + i + "_nr", i + 1); prop.put("viewMode_words_" + i + "_word", token); diff --git a/source/de/anomic/data/DidYouMean.java b/source/de/anomic/data/DidYouMean.java index 89949f109..d6081a56c 100644 --- a/source/de/anomic/data/DidYouMean.java +++ b/source/de/anomic/data/DidYouMean.java @@ -63,6 +63,7 @@ public class DidYouMean { private long timeLimit; private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written private final SortedSet resultSet; + private final indexSizeComparator INDEX_SIZE_COMPARATOR; /** @@ -70,13 +71,14 @@ public class DidYouMean { * @param sort true/false - sorts the resulting TreeSet by index.count(); Warning: this causes heavy i/o. */ public DidYouMean(final IndexCell index, String word0) { - this.resultSet = Collections.synchronizedSortedSet(new TreeSet(WORD_LENGTH_COMPARATOR)); + this.resultSet = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR))); this.word = word0.toLowerCase(); this.wordLen = word.length(); this.index = index; this.guessGen = new LinkedBlockingQueue(); this.guessLib = new LinkedBlockingQueue(); this.createGen = true; + this.INDEX_SIZE_COMPARATOR = new indexSizeComparator(); // identify language if (this.word.length() == 0) { @@ -134,7 +136,7 @@ public class DidYouMean { if (scored.size() >= 2 * preSortSelection) break; scored.inc(s, index.count(Word.word2hash(s))); } - SortedSet countSorted = Collections.synchronizedSortedSet(new TreeSet(new indexSizeComparator())); + SortedSet countSorted = Collections.synchronizedSortedSet(new TreeSet(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR))); int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this while (scored.size() > 0 && countSorted.size() < preSortSelection) { String s = scored.getMaxKey(); @@ -351,9 +353,9 @@ public class DidYouMean { } catch (InterruptedException e) {} } } - + /** - * indexSizeComparator is used by DidYouMean to order terms by index.count()

+ * indexSizeComparator is used by DidYouMean to order terms by index.count() * Warning: this causes heavy i/o */ private class indexSizeComparator implements Comparator { @@ -363,11 +365,11 @@ public class DidYouMean { final int i2 = index.count(Word.word2hash(o2)); if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2); return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result - } + } } /** - * wordLengthComparator is used by DidYouMean to order terms by the term length

+ * wordLengthComparator is used by DidYouMean to order terms by the term length * This is the default order if the indexSizeComparator is not used */ private static class wordLengthComparator implements Comparator { @@ -376,11 +378,30 @@ public class DidYouMean { final int i1 = o1.length(); final int i2 = o2.length(); if (i1 == i2) return o1.compareTo(o2); - return (i1 > i2) ? 1 : -1; // '>' is correct, because the shortest word shall be first + return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first } } + /** + * headMatchingComparator is used to sort results in such a way that words that match with the given words are sorted first + */ + private static class headMatchingComparator implements Comparator { + private final String head; + private final Comparator secondaryComparator; + public headMatchingComparator(String head, Comparator secondaryComparator) { + this.head = head.toLowerCase(); + this.secondaryComparator = secondaryComparator; + } + + public int compare(final String o1, final String o2) { + boolean o1m = o1.toLowerCase().startsWith(head); + boolean o2m = o2.toLowerCase().startsWith(head); + if ((o1m && o2m) || (!o1m && !o2m)) return secondaryComparator.compare(o1, o2); + return o1m ? -1 : 1; + } + } + } diff --git a/source/de/anomic/data/DidYouMeanLibrary.java b/source/de/anomic/data/DidYouMeanLibrary.java index cca665499..16d1f905a 100644 --- a/source/de/anomic/data/DidYouMeanLibrary.java +++ b/source/de/anomic/data/DidYouMeanLibrary.java @@ -33,12 +33,14 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashSet; +import java.util.Map; import java.util.Set; +import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeSet; import java.util.zip.GZIPInputStream; -import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.IntScore; import net.yacy.cora.storage.ScoreMap; import net.yacy.kelondro.logging.Log; @@ -50,8 +52,8 @@ public class DidYouMeanLibrary { // common word cache private static final int commonWordsMaxSize = 100000; // maximum size of common word cache - private static final int commonWordsMinLength = 4; // words must have that length at minimum - private DynamicScore commonWords = new ScoreMap(); + private static final int commonWordsMinLength = 5; // words must have that length at minimum + private ScoreMap commonWords = new ScoreMap(String.CASE_INSENSITIVE_ORDER); // dictionaries private final File dictionaryPath; @@ -76,10 +78,9 @@ public class DidYouMeanLibrary { */ public void learn(String word) { if (word == null) return; - word = word.trim().toLowerCase(); if (word.length() < commonWordsMinLength) return; commonWords.inc(word); - if (commonWords.size() >= commonWordsMaxSize) { + if (commonWords.size() > commonWordsMaxSize) { commonWords.shrinkToMaxSize(commonWordsMaxSize / 2); } } @@ -140,6 +141,12 @@ public class DidYouMeanLibrary { for (final String r: t) { if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break; } + SortedMap u = this.commonWords.tailMap(string); + String vv; + for (final Map.Entry v: u.entrySet()) { + vv = v.getKey(); + if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break; + } string = reverse(string); t = this.tcid.tailSet(string); for (final String r: t) { diff --git a/source/de/anomic/search/DocumentIndex.java b/source/de/anomic/search/DocumentIndex.java index 0888addd5..a1c16f1f9 100644 --- a/source/de/anomic/search/DocumentIndex.java +++ b/source/de/anomic/search/DocumentIndex.java @@ -35,6 +35,8 @@ import java.util.Date; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; +import de.anomic.data.LibraryProvider; + import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.TextParser; @@ -133,7 +135,7 @@ public class DocumentIndex extends Segment { throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); } Document document = Document.mergeDocuments(url, null, documents); - final Condenser condenser = new Condenser(document, true, true); + final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib); return super.storeDocument( url, null, diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index a4240320a..07c27249c 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -206,7 +206,7 @@ public class MediaSnippet implements Comparable, Comparator hs = Condenser.hashSentence(sentence); + final TreeMap hs = Condenser.hashSentence(sentence, null); final Iterator j = queryhashes.iterator(); byte[] hash; Integer pos; diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index c209e8fe6..79abf7753 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -294,7 +294,7 @@ public final class QueryParams { */ public final boolean matchesText(final String text) { boolean ret = false; - final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet()); + final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet()); if (!SetTools.anymatch(wordhashes, this.excludeHashes)) { ret = SetTools.totalInclusion(this.queryHashes, wordhashes); } @@ -304,7 +304,7 @@ public final class QueryParams { protected static final boolean anymatch(final String text, final HandleSet keyhashes) { // returns true if any of the word hashes in keyhashes appear in the String text // to do this, all words in the string must be recognized and transcoded to word hashes - final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet()); + final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet()); return SetTools.anymatch(wordhashes, keyhashes); } diff --git a/source/de/anomic/search/ResultEntry.java b/source/de/anomic/search/ResultEntry.java index 96983aa33..6f6776828 100644 --- a/source/de/anomic/search/ResultEntry.java +++ b/source/de/anomic/search/ResultEntry.java @@ -89,7 +89,7 @@ public class ResultEntry implements Comparable, Comparator words = null; try { - words = new Condenser(document, true, true).words().keySet(); + words = new Condenser(document, true, true, null).words().keySet(); } catch (final UnsupportedEncodingException e) { Log.logException(e); } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index a8f5f01c8..cb3e827b2 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1855,7 +1855,7 @@ public final class Switchboard extends serverSwitch { for (int i = 0; i < in.documents.length; i++) { // strip out words and generate statistics try { - condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia()); + condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib); // update image result list statistics // its good to do this concurrently here, because it needs a DNS lookup @@ -2035,7 +2035,7 @@ public final class Switchboard extends serverSwitch { Document[] documents = response.parse(); if (documents != null) for (Document document: documents) { if (document.indexingDenied()) throw new Parser.Failure("indexing is denied", url); - Condenser condenser = new Condenser(document, true, true); + Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib); ResultImages.registerImages(url, document, true); webStructure.generateCitationReference(url, document, condenser, response.lastModified()); storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName); diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 7827da6af..150326cbe 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -417,7 +417,7 @@ public class TextSnippet implements Comparable, Comparator m = Condenser.hashSentence(sentence); + final TreeMap m = Condenser.hashSentence(sentence, null); for (byte[] b: queryhashes) { if (!(m.containsKey(b))) return false; } diff --git a/source/net/yacy/cora/storage/ScoreMap.java b/source/net/yacy/cora/storage/ScoreMap.java index 77878073f..7475a05c5 100644 --- a/source/net/yacy/cora/storage/ScoreMap.java +++ b/source/net/yacy/cora/storage/ScoreMap.java @@ -28,6 +28,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.SortedMap; import java.util.TreeMap; import java.util.TreeSet; @@ -184,6 +185,14 @@ public class ScoreMap implements DynamicScore { return score.intValue(); } + public SortedMap tailMap(E obj) { + if (this.map instanceof TreeMap) { + return ((TreeMap) this.map).tailMap(obj); + } + throw new UnsupportedOperationException("map must have comparator"); + } + + public int getMaxScore() { if (map.isEmpty()) return -1; int maxScore = Integer.MIN_VALUE; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 99ffa60e6..ef786c133 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -42,6 +42,8 @@ import java.util.Properties; import java.util.TreeMap; import java.util.TreeSet; +import de.anomic.data.DidYouMeanLibrary; + import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.language.Identificator; import net.yacy.document.parser.html.ContentScraper; @@ -55,7 +57,7 @@ import net.yacy.kelondro.util.SetTools; public final class Condenser { - + // this is the page analysis class public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form public final static int wordminsize = 2; @@ -108,7 +110,8 @@ public final class Condenser { public Condenser( final Document document, final boolean indexText, - final boolean indexMedia + final boolean indexMedia, + final DidYouMeanLibrary meaningLib ) throws UnsupportedEncodingException { // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the appropriate media flag @@ -126,7 +129,7 @@ public final class Condenser { Map.Entry entry; if (indexText) { - createCondensement(document.getText()); + createCondensement(document.getText(), meaningLib); // the phrase counter: // phrase 0 are words taken from the URL // phrase 1 is the MainTitle @@ -140,15 +143,15 @@ public final class Condenser { // phrase 99 is taken from the media Link url and anchor description // phrase 100 and above are lines from the text - insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true); - insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true); - insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true); - insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true); - insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true); + insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib); + insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib); // missing: tags! final String[] titles = document.getSectionTitles(); for (int i = 0; i < titles.length; i++) { - insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true); + insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true, meaningLib); } // anchors: for text indexing we add only the anchor description @@ -173,7 +176,7 @@ public final class Condenser { } // add the URL components to the word list - insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false); + insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false, meaningLib); if (indexMedia) { // add anchor descriptions: here, we also add the url components @@ -181,24 +184,24 @@ public final class Condenser { Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false); - insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false, meaningLib); + insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true, meaningLib); } // video i = document.getVideolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false); - insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false, meaningLib); + insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true, meaningLib); } // applications i = document.getApplinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false); - insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true); + insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false, meaningLib); + insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true, meaningLib); } // images @@ -206,8 +209,8 @@ public final class Condenser { ImageEntry ientry; while (j.hasNext()) { ientry = j.next(); - insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false); - insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true); + insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib); + insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib); } // finally check all words for missing flag entry @@ -225,12 +228,18 @@ public final class Condenser { } } - private void insertTextToWords(final String text, final int phrase, final int flagpos, final Bitfield flagstemplate, boolean useForLanguageIdentification) { + private void insertTextToWords( + final String text, + final int phrase, + final int flagpos, + final Bitfield flagstemplate, + boolean useForLanguageIdentification, + DidYouMeanLibrary meaningLib) { String word; Word wprop; sievedWordsEnum wordenum; try { - wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8"))); + wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")), meaningLib); } catch (final UnsupportedEncodingException e) { return; } @@ -250,11 +259,11 @@ public final class Condenser { } } - public Condenser(final InputStream text) throws UnsupportedEncodingException { + public Condenser(final InputStream text, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException { this.languageIdentificator = null; // we don't need that here // analysis = new Properties(); words = new TreeMap(); - createCondensement(text); + createCondensement(text, meaningLib); } public int excludeWords(final TreeSet stopwords) { @@ -274,7 +283,7 @@ public final class Condenser { return this.languageIdentificator.getLanguage(); } - private void createCondensement(final InputStream is) throws UnsupportedEncodingException { + private void createCondensement(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException { final HashSet currsentwords = new HashSet(); StringBuilder sentence = new StringBuilder(100); String word = ""; @@ -293,7 +302,7 @@ public final class Condenser { final HashMap sentences = new HashMap(100); // read source - final sievedWordsEnum wordenum = new sievedWordsEnum(is); + final sievedWordsEnum wordenum = new sievedWordsEnum(is, meaningLib); while (wordenum.hasMoreElements()) { word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars? if (languageIdentificator != null) languageIdentificator.add(word); @@ -467,11 +476,11 @@ public final class Condenser { * @param sentence the sentence to be tokenized * @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering */ - public static TreeMap hashSentence(final String sentence) { + public static TreeMap hashSentence(final String sentence, DidYouMeanLibrary meaningLib) { final TreeMap map = new TreeMap(Base64Order.enhancedCoder); - final Enumeration words = wordTokenizer(sentence, "UTF-8"); + final Enumeration words = wordTokenizer(sentence, "UTF-8", meaningLib); int pos = 0; - StringBuilder word; + String word; byte[] hash; Integer oldpos; while (words.hasMoreElements()) { @@ -487,23 +496,25 @@ public final class Condenser { return map; } - public static Enumeration wordTokenizer(final String s, final String charset) { + public static Enumeration wordTokenizer(final String s, final String charset, DidYouMeanLibrary meaningLib) { try { - return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset))); + return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)), meaningLib); } catch (final Exception e) { return null; } } - public static class sievedWordsEnum implements Enumeration { + public static class sievedWordsEnum implements Enumeration { // this enumeration removes all words that contain either wrong characters or are too short StringBuilder buffer = null; unsievedWordsEnum e; + DidYouMeanLibrary meaningLib; - public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException { - e = new unsievedWordsEnum(is); - buffer = nextElement0(); + public sievedWordsEnum(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException { + this.e = new unsievedWordsEnum(is); + this.buffer = nextElement0(); + this.meaningLib = meaningLib; } public void pre(final boolean x) { @@ -527,9 +538,11 @@ public final class Condenser { return buffer != null; } - public StringBuilder nextElement() { - final StringBuilder r = buffer; + public String nextElement() { + final String r = (buffer == null) ? null : buffer.toString(); buffer = nextElement0(); + // put word to words statistics cache + if (meaningLib != null) meaningLib.learn(r); return r; } @@ -710,7 +723,7 @@ public final class Condenser { return s; } - public static Map getWords(final String text) { + public static Map getWords(final String text, DidYouMeanLibrary meaningLib) { // returns a word/indexWord relation map if (text == null) return null; ByteArrayInputStream buffer; @@ -720,7 +733,7 @@ public final class Condenser { buffer = new ByteArrayInputStream(text.getBytes()); } try { - return new Condenser(buffer).words(); + return new Condenser(buffer, meaningLib).words(); } catch (final UnsupportedEncodingException e) { return null; } diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java index 031e3133c..ada609e58 100644 --- a/source/net/yacy/document/SnippetExtractor.java +++ b/source/net/yacy/document/SnippetExtractor.java @@ -45,7 +45,7 @@ public class SnippetExtractor { int linenumber = 0; int fullmatchcounter = 0; lookup: for (StringBuilder sentence: sentences) { - hs = Condenser.hashSentence(sentence.toString()); + hs = Condenser.hashSentence(sentence.toString(), null); positions = new TreeSet(); for (byte[] word: queryhashes) { pos = hs.get(word); @@ -124,7 +124,7 @@ public class SnippetExtractor { byte[] hash; // find all hashes that appear in the sentence - final TreeMap hs = Condenser.hashSentence(sentence); + final TreeMap hs = Condenser.hashSentence(sentence, null); final Iterator j = queryhashes.iterator(); Integer pos; int p, minpos = sentence.length(), maxpos = -1; diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 55d55648e..810b73e20 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -28,6 +28,8 @@ import java.io.UnsupportedEncodingException; import java.util.List; import java.util.Map; +import de.anomic.data.LibraryProvider; + import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.AbstractParser; import net.yacy.document.Condenser; @@ -109,7 +111,7 @@ public class torrentParser extends AbstractParser implements Parser { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", new ByteArrayInputStream(b)); - Condenser c = new Condenser(d[0], true, true); + Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); } catch (IOException e) {