Added a word counter statistic to the condenser, which is used by the did-you-mean feature to calculate the best matches for given search words.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7258 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 2a0eb09e08
commit 58e74282af

@ -47,6 +47,7 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.LibraryProvider;
import de.anomic.http.client.Cache;
import de.anomic.search.Segment;
import de.anomic.search.Segments;
@ -277,9 +278,9 @@ public class ViewFile {
// Search word highlighting
for (StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<StringBuilder> tokens = Condenser.wordTokenizer(sentence, "UTF-8");
Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
while (tokens.hasMoreElements()) {
token = tokens.nextElement().toString();
token = tokens.nextElement();
if (token.length() > 0) {
prop.put("viewMode_words_" + i + "_nr", i + 1);
prop.put("viewMode_words_" + i + "_word", token);

@ -63,6 +63,7 @@ public class DidYouMean {
private long timeLimit;
private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
private final SortedSet<String> resultSet;
private final indexSizeComparator INDEX_SIZE_COMPARATOR;
/**
@ -70,13 +71,14 @@ public class DidYouMean {
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/
public DidYouMean(final IndexCell<WordReference> index, String word0) {
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(WORD_LENGTH_COMPARATOR));
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
this.word = word0.toLowerCase();
this.wordLen = word.length();
this.index = index;
this.guessGen = new LinkedBlockingQueue<String>();
this.guessLib = new LinkedBlockingQueue<String>();
this.createGen = true;
this.INDEX_SIZE_COMPARATOR = new indexSizeComparator();
// identify language
if (this.word.length() == 0) {
@ -134,7 +136,7 @@ public class DidYouMean {
if (scored.size() >= 2 * preSortSelection) break;
scored.inc(s, index.count(Word.word2hash(s)));
}
SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator()));
SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
while (scored.size() > 0 && countSorted.size() < preSortSelection) {
String s = scored.getMaxKey();
@ -351,9 +353,9 @@ public class DidYouMean {
} catch (InterruptedException e) {}
}
}
/**
* indexSizeComparator is used by DidYouMean to order terms by index.count()<p/>
* indexSizeComparator is used by DidYouMean to order terms by index.count()
* <b>Warning:</b> this causes heavy i/o
*/
private class indexSizeComparator implements Comparator<String> {
@ -363,11 +365,11 @@ public class DidYouMean {
final int i2 = index.count(Word.word2hash(o2));
if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2);
return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
}
}
}
/**
* wordLengthComparator is used by DidYouMean to order terms by the term length<p/>
* wordLengthComparator is used by DidYouMean to order terms by the term length
* This is the default order if the indexSizeComparator is not used
*/
private static class wordLengthComparator implements Comparator<String> {
@ -376,11 +378,30 @@ public class DidYouMean {
final int i1 = o1.length();
final int i2 = o2.length();
if (i1 == i2) return o1.compareTo(o2);
return (i1 > i2) ? 1 : -1; // '>' is correct, because the shortest word shall be first
return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first
}
}
/**
* headMatchingComparator is used to sort results in such a way that words that match with the given words are sorted first
*/
private static class headMatchingComparator implements Comparator<String> {
    // the search word; candidates that start with it (case-insensitive) sort first
    private final String head;
    // tie-breaker used inside the matching and the non-matching partition
    private final Comparator<String> secondaryComparator;

    /**
     * @param head the given search word; suggestions starting with it are ordered first
     * @param secondaryComparator ordering applied when both or neither candidate matches the head
     */
    public headMatchingComparator(String head, Comparator<String> secondaryComparator) {
        this.head = head.toLowerCase();
        this.secondaryComparator = secondaryComparator;
    }

    public int compare(final String o1, final String o2) {
        final boolean o1m = o1.toLowerCase().startsWith(head);
        final boolean o2m = o2.toLowerCase().startsWith(head);
        // same partition (both match or neither matches): defer to secondary order
        if (o1m == o2m) return secondaryComparator.compare(o1, o2);
        // exactly one matches the head: that one sorts first
        return o1m ? -1 : 1;
    }
}
}

@ -33,12 +33,14 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.storage.DynamicScore;
import net.yacy.cora.storage.IntScore;
import net.yacy.cora.storage.ScoreMap;
import net.yacy.kelondro.logging.Log;
@ -50,8 +52,8 @@ public class DidYouMeanLibrary {
// common word cache
private static final int commonWordsMaxSize = 100000; // maximum size of common word cache
private static final int commonWordsMinLength = 4; // words must have that length at minimum
private DynamicScore<String> commonWords = new ScoreMap<String>();
private static final int commonWordsMinLength = 5; // words must have that length at minimum
private ScoreMap<String> commonWords = new ScoreMap<String>(String.CASE_INSENSITIVE_ORDER);
// dictionaries
private final File dictionaryPath;
@ -76,10 +78,9 @@ public class DidYouMeanLibrary {
*/
public void learn(String word) {
if (word == null) return;
word = word.trim().toLowerCase();
if (word.length() < commonWordsMinLength) return;
commonWords.inc(word);
if (commonWords.size() >= commonWordsMaxSize) {
if (commonWords.size() > commonWordsMaxSize) {
commonWords.shrinkToMaxSize(commonWordsMaxSize / 2);
}
}
@ -140,6 +141,12 @@ public class DidYouMeanLibrary {
for (final String r: t) {
if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break;
}
SortedMap<String, IntScore> u = this.commonWords.tailMap(string);
String vv;
for (final Map.Entry<String, IntScore> v: u.entrySet()) {
vv = v.getKey();
if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break;
}
string = reverse(string);
t = this.tcid.tailSet(string);
for (final String r: t) {

@ -35,6 +35,8 @@ import java.util.Date;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.data.LibraryProvider;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
@ -133,7 +135,7 @@ public class DocumentIndex extends Segment {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}
Document document = Document.mergeDocuments(url, null, documents);
final Condenser condenser = new Condenser(document, true, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
return super.storeDocument(
url,
null,

@ -206,7 +206,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
// remove all hashes that appear in the sentence
if (sentence == null) return queryhashes;
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash;
Integer pos;

@ -294,7 +294,7 @@ public final class QueryParams {
*/
public final boolean matchesText(final String text) {
boolean ret = false;
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
if (!SetTools.anymatch(wordhashes, this.excludeHashes)) {
ret = SetTools.totalInclusion(this.queryHashes, wordhashes);
}
@ -304,7 +304,7 @@ public final class QueryParams {
protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
// returns true if any of the word hashes in keyhashes appear in the String text
// to do this, all words in the string must be recognized and transcoded to word hashes
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet());
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
return SetTools.anymatch(wordhashes, keyhashes);
}

@ -89,7 +89,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
("yacyshare " +
filename.replace('?', ' ') +
" " +
urlcomps.dc_title())).keySet()),
urlcomps.dc_title()), null).keySet()),
urlentry.hash());
} catch (IOException e) {
Log.logException(e);

@ -424,7 +424,7 @@ public class Segment {
// get the word set
Set<String> words = null;
try {
words = new Condenser(document, true, true).words().keySet();
words = new Condenser(document, true, true, null).words().keySet();
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
}

@ -1855,7 +1855,7 @@ public final class Switchboard extends serverSwitch {
for (int i = 0; i < in.documents.length; i++) {
// strip out words and generate statistics
try {
condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia());
condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
@ -2035,7 +2035,7 @@ public final class Switchboard extends serverSwitch {
Document[] documents = response.parse();
if (documents != null) for (Document document: documents) {
if (document.indexingDenied()) throw new Parser.Failure("indexing is denied", url);
Condenser condenser = new Condenser(document, true, true);
Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
ResultImages.registerImages(url, document, true);
webStructure.generateCitationReference(url, document, condenser, response.lastModified());
storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName);

@ -417,7 +417,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
}
private static boolean containsAllHashes(final String sentence, final HandleSet queryhashes) {
final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence);
final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence, null);
for (byte[] b: queryhashes) {
if (!(m.containsKey(b))) return false;
}

@ -28,6 +28,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
@ -184,6 +185,14 @@ public class ScoreMap<E> implements DynamicScore<E> {
return score.intValue();
}
/**
 * Returns a view of the portion of the backing map whose keys are
 * greater than or equal to {@code obj}.
 * Generalized from the original {@code TreeMap}-only check: any
 * {@link SortedMap} backing supports {@code tailMap}, and every
 * {@code TreeMap} is a {@code SortedMap}, so this is backward-compatible.
 *
 * @param obj the lower bound (inclusive) of the returned view
 * @return the tail view of the backing sorted map
 * @throws UnsupportedOperationException if the backing map is unsorted
 */
public SortedMap<E, IntScore> tailMap(E obj) {
    if (this.map instanceof SortedMap) {
        return ((SortedMap<E, IntScore>) this.map).tailMap(obj);
    }
    throw new UnsupportedOperationException("map must have comparator");
}
public int getMaxScore() {
if (map.isEmpty()) return -1;
int maxScore = Integer.MIN_VALUE;

@ -42,6 +42,8 @@ import java.util.Properties;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.data.DidYouMeanLibrary;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ContentScraper;
@ -55,7 +57,7 @@ import net.yacy.kelondro.util.SetTools;
public final class Condenser {
// this is the page analysis class
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
public final static int wordminsize = 2;
@ -108,7 +110,8 @@ public final class Condenser {
public Condenser(
final Document document,
final boolean indexText,
final boolean indexMedia
final boolean indexMedia,
final DidYouMeanLibrary meaningLib
) throws UnsupportedEncodingException {
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
@ -126,7 +129,7 @@ public final class Condenser {
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
createCondensement(document.getText());
createCondensement(document.getText(), meaningLib);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle
@ -140,15 +143,15 @@ public final class Condenser {
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true);
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
// missing: tags!
final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true);
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true, meaningLib);
}
// anchors: for text indexing we add only the anchor description
@ -173,7 +176,7 @@ public final class Condenser {
}
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false);
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false, meaningLib);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
@ -181,24 +184,24 @@ public final class Condenser {
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true);
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true, meaningLib);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true);
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true, meaningLib);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true);
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true, meaningLib);
}
// images
@ -206,8 +209,8 @@ public final class Condenser {
ImageEntry ientry;
while (j.hasNext()) {
ientry = j.next();
insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true);
insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
}
// finally check all words for missing flag entry
@ -225,12 +228,18 @@ public final class Condenser {
}
}
private void insertTextToWords(final String text, final int phrase, final int flagpos, final Bitfield flagstemplate, boolean useForLanguageIdentification) {
private void insertTextToWords(
final String text,
final int phrase,
final int flagpos,
final Bitfield flagstemplate,
boolean useForLanguageIdentification,
DidYouMeanLibrary meaningLib) {
String word;
Word wprop;
sievedWordsEnum wordenum;
try {
wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")));
wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")), meaningLib);
} catch (final UnsupportedEncodingException e) {
return;
}
@ -250,11 +259,11 @@ public final class Condenser {
}
}
public Condenser(final InputStream text) throws UnsupportedEncodingException {
public Condenser(final InputStream text, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
words = new TreeMap<String, Word>();
createCondensement(text);
createCondensement(text, meaningLib);
}
public int excludeWords(final TreeSet<String> stopwords) {
@ -274,7 +283,7 @@ public final class Condenser {
return this.languageIdentificator.getLanguage();
}
private void createCondensement(final InputStream is) throws UnsupportedEncodingException {
private void createCondensement(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
final HashSet<String> currsentwords = new HashSet<String>();
StringBuilder sentence = new StringBuilder(100);
String word = "";
@ -293,7 +302,7 @@ public final class Condenser {
final HashMap<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
// read source
final sievedWordsEnum wordenum = new sievedWordsEnum(is);
final sievedWordsEnum wordenum = new sievedWordsEnum(is, meaningLib);
while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
if (languageIdentificator != null) languageIdentificator.add(word);
@ -467,11 +476,11 @@ public final class Condenser {
* @param sentence the sentence to be tokenized
* @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering
*/
public static TreeMap<byte[], Integer> hashSentence(final String sentence) {
public static TreeMap<byte[], Integer> hashSentence(final String sentence, DidYouMeanLibrary meaningLib) {
final TreeMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final Enumeration<StringBuilder> words = wordTokenizer(sentence, "UTF-8");
final Enumeration<String> words = wordTokenizer(sentence, "UTF-8", meaningLib);
int pos = 0;
StringBuilder word;
String word;
byte[] hash;
Integer oldpos;
while (words.hasMoreElements()) {
@ -487,23 +496,25 @@ public final class Condenser {
return map;
}
public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) {
public static Enumeration<String> wordTokenizer(final String s, final String charset, DidYouMeanLibrary meaningLib) {
try {
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)));
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)), meaningLib);
} catch (final Exception e) {
return null;
}
}
public static class sievedWordsEnum implements Enumeration<StringBuilder> {
public static class sievedWordsEnum implements Enumeration<String> {
// this enumeration removes all words that contain either wrong characters or are too short
StringBuilder buffer = null;
unsievedWordsEnum e;
DidYouMeanLibrary meaningLib;
public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new unsievedWordsEnum(is);
buffer = nextElement0();
public sievedWordsEnum(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
this.e = new unsievedWordsEnum(is);
this.buffer = nextElement0();
this.meaningLib = meaningLib;
}
public void pre(final boolean x) {
@ -527,9 +538,11 @@ public final class Condenser {
return buffer != null;
}
public StringBuilder nextElement() {
final StringBuilder r = buffer;
public String nextElement() {
final String r = (buffer == null) ? null : buffer.toString();
buffer = nextElement0();
// put word to words statistics cache
if (meaningLib != null) meaningLib.learn(r);
return r;
}
@ -710,7 +723,7 @@ public final class Condenser {
return s;
}
public static Map<String, Word> getWords(final String text) {
public static Map<String, Word> getWords(final String text, DidYouMeanLibrary meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
ByteArrayInputStream buffer;
@ -720,7 +733,7 @@ public final class Condenser {
buffer = new ByteArrayInputStream(text.getBytes());
}
try {
return new Condenser(buffer).words();
return new Condenser(buffer, meaningLib).words();
} catch (final UnsupportedEncodingException e) {
return null;
}

@ -45,7 +45,7 @@ public class SnippetExtractor {
int linenumber = 0;
int fullmatchcounter = 0;
lookup: for (StringBuilder sentence: sentences) {
hs = Condenser.hashSentence(sentence.toString());
hs = Condenser.hashSentence(sentence.toString(), null);
positions = new TreeSet<Integer>();
for (byte[] word: queryhashes) {
pos = hs.get(word);
@ -124,7 +124,7 @@ public class SnippetExtractor {
byte[] hash;
// find all hashes that appear in the sentence
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence);
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
final Iterator<byte[]> j = queryhashes.iterator();
Integer pos;
int p, minpos = sentence.length(), maxpos = -1;

@ -28,6 +28,8 @@ import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.Map;
import de.anomic.data.LibraryProvider;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Condenser;
@ -109,7 +111,7 @@ public class torrentParser extends AbstractParser implements Parser {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true);
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (IOException e) {

Loading…
Cancel
Save