Added a word-counter statistic to the Condenser, which is used by the did-you-mean feature to calculate the best matches for given search words.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7258 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 2a0eb09e08
commit 58e74282af

@ -47,6 +47,7 @@ import net.yacy.kelondro.data.meta.URIMetadataRow;
import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response; import de.anomic.crawler.retrieval.Response;
import de.anomic.data.LibraryProvider;
import de.anomic.http.client.Cache; import de.anomic.http.client.Cache;
import de.anomic.search.Segment; import de.anomic.search.Segment;
import de.anomic.search.Segments; import de.anomic.search.Segments;
@ -277,9 +278,9 @@ public class ViewFile {
// Search word highlighting // Search word highlighting
for (StringBuilder s: sentences) { for (StringBuilder s: sentences) {
sentence = s.toString(); sentence = s.toString();
Enumeration<StringBuilder> tokens = Condenser.wordTokenizer(sentence, "UTF-8"); Enumeration<String> tokens = Condenser.wordTokenizer(sentence, "UTF-8", LibraryProvider.dymLib);
while (tokens.hasMoreElements()) { while (tokens.hasMoreElements()) {
token = tokens.nextElement().toString(); token = tokens.nextElement();
if (token.length() > 0) { if (token.length() > 0) {
prop.put("viewMode_words_" + i + "_nr", i + 1); prop.put("viewMode_words_" + i + "_nr", i + 1);
prop.put("viewMode_words_" + i + "_word", token); prop.put("viewMode_words_" + i + "_word", token);

@ -63,6 +63,7 @@ public class DidYouMean {
private long timeLimit; private long timeLimit;
private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written private boolean createGen; // keeps the value 'true' as long as no entry in guessLib is written
private final SortedSet<String> resultSet; private final SortedSet<String> resultSet;
private final indexSizeComparator INDEX_SIZE_COMPARATOR;
/** /**
@ -70,13 +71,14 @@ public class DidYouMean {
* @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o. * @param sort true/false - sorts the resulting TreeSet by index.count(); <b>Warning:</b> this causes heavy i/o.
*/ */
public DidYouMean(final IndexCell<WordReference> index, String word0) { public DidYouMean(final IndexCell<WordReference> index, String word0) {
this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(WORD_LENGTH_COMPARATOR)); this.resultSet = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(word0, WORD_LENGTH_COMPARATOR)));
this.word = word0.toLowerCase(); this.word = word0.toLowerCase();
this.wordLen = word.length(); this.wordLen = word.length();
this.index = index; this.index = index;
this.guessGen = new LinkedBlockingQueue<String>(); this.guessGen = new LinkedBlockingQueue<String>();
this.guessLib = new LinkedBlockingQueue<String>(); this.guessLib = new LinkedBlockingQueue<String>();
this.createGen = true; this.createGen = true;
this.INDEX_SIZE_COMPARATOR = new indexSizeComparator();
// identify language // identify language
if (this.word.length() == 0) { if (this.word.length() == 0) {
@ -134,7 +136,7 @@ public class DidYouMean {
if (scored.size() >= 2 * preSortSelection) break; if (scored.size() >= 2 * preSortSelection) break;
scored.inc(s, index.count(Word.word2hash(s))); scored.inc(s, index.count(Word.word2hash(s)));
} }
SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new indexSizeComparator())); SortedSet<String> countSorted = Collections.synchronizedSortedSet(new TreeSet<String>(new headMatchingComparator(this.word, this.INDEX_SIZE_COMPARATOR)));
int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this int wc = index.count(Word.word2hash(this.word)); // all counts must be greater than this
while (scored.size() > 0 && countSorted.size() < preSortSelection) { while (scored.size() > 0 && countSorted.size() < preSortSelection) {
String s = scored.getMaxKey(); String s = scored.getMaxKey();
@ -351,9 +353,9 @@ public class DidYouMean {
} catch (InterruptedException e) {} } catch (InterruptedException e) {}
} }
} }
/** /**
* indexSizeComparator is used by DidYouMean to order terms by index.count()<p/> * indexSizeComparator is used by DidYouMean to order terms by index.count()
* <b>Warning:</b> this causes heavy i/o * <b>Warning:</b> this causes heavy i/o
*/ */
private class indexSizeComparator implements Comparator<String> { private class indexSizeComparator implements Comparator<String> {
@ -363,11 +365,11 @@ public class DidYouMean {
final int i2 = index.count(Word.word2hash(o2)); final int i2 = index.count(Word.word2hash(o2));
if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2); if (i1 == i2) return WORD_LENGTH_COMPARATOR.compare(o1, o2);
return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result return (i1 < i2) ? 1 : -1; // '<' is correct, because the largest count shall be ordered to be the first position in the result
} }
} }
/** /**
* wordLengthComparator is used by DidYouMean to order terms by the term length<p/> * wordLengthComparator is used by DidYouMean to order terms by the term length
* This is the default order if the indexSizeComparator is not used * This is the default order if the indexSizeComparator is not used
*/ */
private static class wordLengthComparator implements Comparator<String> { private static class wordLengthComparator implements Comparator<String> {
@ -376,11 +378,30 @@ public class DidYouMean {
final int i1 = o1.length(); final int i1 = o1.length();
final int i2 = o2.length(); final int i2 = o2.length();
if (i1 == i2) return o1.compareTo(o2); if (i1 == i2) return o1.compareTo(o2);
return (i1 > i2) ? 1 : -1; // '>' is correct, because the shortest word shall be first return (i1 < i2) ? 1 : -1; // '<' is correct, because the longest word shall be first
} }
} }
/**
 * headMatchingComparator orders search terms so that words starting with the
 * given head word are sorted before all words that do not. Matching is
 * case-insensitive. Ties (both match or both miss) are delegated to the
 * supplied secondary comparator, so the overall order stays total as long as
 * the secondary comparator is consistent.
 */
private static class headMatchingComparator implements Comparator<String> {
    private final String head;
    private final Comparator<String> secondaryComparator;
    public headMatchingComparator(String head, Comparator<String> secondaryComparator) {
        // normalize once; compare() lower-cases the candidates on each call
        this.head = head.toLowerCase();
        this.secondaryComparator = secondaryComparator;
    }
    public int compare(final String o1, final String o2) {
        final boolean o1m = o1.toLowerCase().startsWith(this.head);
        final boolean o2m = o2.toLowerCase().startsWith(this.head);
        // both match or both miss: fall back to the secondary order
        if (o1m == o2m) return this.secondaryComparator.compare(o1, o2);
        return o1m ? -1 : 1; // the head-matching word is ordered first
    }
}
} }

@ -33,12 +33,14 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet; import java.util.SortedSet;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import net.yacy.cora.storage.DynamicScore; import net.yacy.cora.storage.IntScore;
import net.yacy.cora.storage.ScoreMap; import net.yacy.cora.storage.ScoreMap;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
@ -50,8 +52,8 @@ public class DidYouMeanLibrary {
// common word cache // common word cache
private static final int commonWordsMaxSize = 100000; // maximum size of common word cache private static final int commonWordsMaxSize = 100000; // maximum size of common word cache
private static final int commonWordsMinLength = 4; // words must have that length at minimum private static final int commonWordsMinLength = 5; // words must have that length at minimum
private DynamicScore<String> commonWords = new ScoreMap<String>(); private ScoreMap<String> commonWords = new ScoreMap<String>(String.CASE_INSENSITIVE_ORDER);
// dictionaries // dictionaries
private final File dictionaryPath; private final File dictionaryPath;
@ -76,10 +78,9 @@ public class DidYouMeanLibrary {
*/ */
public void learn(String word) { public void learn(String word) {
if (word == null) return; if (word == null) return;
word = word.trim().toLowerCase();
if (word.length() < commonWordsMinLength) return; if (word.length() < commonWordsMinLength) return;
commonWords.inc(word); commonWords.inc(word);
if (commonWords.size() >= commonWordsMaxSize) { if (commonWords.size() > commonWordsMaxSize) {
commonWords.shrinkToMaxSize(commonWordsMaxSize / 2); commonWords.shrinkToMaxSize(commonWordsMaxSize / 2);
} }
} }
@ -140,6 +141,12 @@ public class DidYouMeanLibrary {
for (final String r: t) { for (final String r: t) {
if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break; if (r.startsWith(string) && r.length() > string.length()) ret.add(r); else break;
} }
SortedMap<String, IntScore> u = this.commonWords.tailMap(string);
String vv;
for (final Map.Entry<String, IntScore> v: u.entrySet()) {
vv = v.getKey();
if (vv.startsWith(string) && vv.length() > string.length()) ret.add(vv); else break;
}
string = reverse(string); string = reverse(string);
t = this.tcid.tailSet(string); t = this.tcid.tailSet(string);
for (final String r: t) { for (final String r: t) {

@ -35,6 +35,8 @@ import java.util.Date;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.data.LibraryProvider;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
@ -133,7 +135,7 @@ public class DocumentIndex extends Segment {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
} }
Document document = Document.mergeDocuments(url, null, documents); Document document = Document.mergeDocuments(url, null, documents);
final Condenser condenser = new Condenser(document, true, true); final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
return super.storeDocument( return super.storeDocument(
url, url,
null, null,

@ -206,7 +206,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) { private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
// remove all hashes that appear in the sentence // remove all hashes that appear in the sentence
if (sentence == null) return queryhashes; if (sentence == null) return queryhashes;
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence); final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
final Iterator<byte[]> j = queryhashes.iterator(); final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash; byte[] hash;
Integer pos; Integer pos;

@ -294,7 +294,7 @@ public final class QueryParams {
*/ */
public final boolean matchesText(final String text) { public final boolean matchesText(final String text) {
boolean ret = false; boolean ret = false;
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet()); final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
if (!SetTools.anymatch(wordhashes, this.excludeHashes)) { if (!SetTools.anymatch(wordhashes, this.excludeHashes)) {
ret = SetTools.totalInclusion(this.queryHashes, wordhashes); ret = SetTools.totalInclusion(this.queryHashes, wordhashes);
} }
@ -304,7 +304,7 @@ public final class QueryParams {
protected static final boolean anymatch(final String text, final HandleSet keyhashes) { protected static final boolean anymatch(final String text, final HandleSet keyhashes) {
// returns true if any of the word hashes in keyhashes appear in the String text // returns true if any of the word hashes in keyhashes appear in the String text
// to do this, all words in the string must be recognized and transcoded to word hashes // to do this, all words in the string must be recognized and transcoded to word hashes
final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text).keySet()); final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet());
return SetTools.anymatch(wordhashes, keyhashes); return SetTools.anymatch(wordhashes, keyhashes);
} }

@ -89,7 +89,7 @@ public class ResultEntry implements Comparable<ResultEntry>, Comparator<ResultEn
("yacyshare " + ("yacyshare " +
filename.replace('?', ' ') + filename.replace('?', ' ') +
" " + " " +
urlcomps.dc_title())).keySet()), urlcomps.dc_title()), null).keySet()),
urlentry.hash()); urlentry.hash());
} catch (IOException e) { } catch (IOException e) {
Log.logException(e); Log.logException(e);

@ -424,7 +424,7 @@ public class Segment {
// get the word set // get the word set
Set<String> words = null; Set<String> words = null;
try { try {
words = new Condenser(document, true, true).words().keySet(); words = new Condenser(document, true, true, null).words().keySet();
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
Log.logException(e); Log.logException(e);
} }

@ -1855,7 +1855,7 @@ public final class Switchboard extends serverSwitch {
for (int i = 0; i < in.documents.length; i++) { for (int i = 0; i < in.documents.length; i++) {
// strip out words and generate statistics // strip out words and generate statistics
try { try {
condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia()); condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);
// update image result list statistics // update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup // its good to do this concurrently here, because it needs a DNS lookup
@ -2035,7 +2035,7 @@ public final class Switchboard extends serverSwitch {
Document[] documents = response.parse(); Document[] documents = response.parse();
if (documents != null) for (Document document: documents) { if (documents != null) for (Document document: documents) {
if (document.indexingDenied()) throw new Parser.Failure("indexing is denied", url); if (document.indexingDenied()) throw new Parser.Failure("indexing is denied", url);
Condenser condenser = new Condenser(document, true, true); Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib);
ResultImages.registerImages(url, document, true); ResultImages.registerImages(url, document, true);
webStructure.generateCitationReference(url, document, condenser, response.lastModified()); webStructure.generateCitationReference(url, document, condenser, response.lastModified());
storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName); storeDocumentIndex(process, response, document, condenser, searchEvent, "heuristic:" + heuristicName);

@ -417,7 +417,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
} }
private static boolean containsAllHashes(final String sentence, final HandleSet queryhashes) { private static boolean containsAllHashes(final String sentence, final HandleSet queryhashes) {
final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence); final TreeMap<byte[], Integer> m = Condenser.hashSentence(sentence, null);
for (byte[] b: queryhashes) { for (byte[] b: queryhashes) {
if (!(m.containsKey(b))) return false; if (!(m.containsKey(b))) return false;
} }

@ -28,6 +28,7 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
@ -184,6 +185,14 @@ public class ScoreMap<E> implements DynamicScore<E> {
return score.intValue(); return score.intValue();
} }
/**
 * Returns a view of the portion of the backing score map whose keys are
 * greater than or equal to the given key.
 * This is only possible when the backing map maintains a sorted order
 * (i.e. it was constructed with a comparator); an unsorted backing map
 * cannot provide a tail view.
 * @param obj the smallest key to include in the returned view
 * @return a sorted view of the tail of the backing map
 * @throws UnsupportedOperationException if the backing map is not sorted
 */
public SortedMap<E, IntScore> tailMap(E obj) {
    // accept any SortedMap implementation, not only TreeMap
    if (this.map instanceof SortedMap) {
        return ((SortedMap<E, IntScore>) this.map).tailMap(obj);
    }
    throw new UnsupportedOperationException("map must have comparator");
}
public int getMaxScore() { public int getMaxScore() {
if (map.isEmpty()) return -1; if (map.isEmpty()) return -1;
int maxScore = Integer.MIN_VALUE; int maxScore = Integer.MIN_VALUE;

@ -42,6 +42,8 @@ import java.util.Properties;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.data.DidYouMeanLibrary;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.language.Identificator; import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
@ -55,7 +57,7 @@ import net.yacy.kelondro.util.SetTools;
public final class Condenser { public final class Condenser {
// this is the page analysis class // this is the page analysis class
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
public final static int wordminsize = 2; public final static int wordminsize = 2;
@ -108,7 +110,8 @@ public final class Condenser {
public Condenser( public Condenser(
final Document document, final Document document,
final boolean indexText, final boolean indexText,
final boolean indexMedia final boolean indexMedia,
final DidYouMeanLibrary meaningLib
) throws UnsupportedEncodingException { ) throws UnsupportedEncodingException {
// if addMedia == true, then all the media links are also parsed and added to the words // if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag // added media words are flagged with the appropriate media flag
@ -126,7 +129,7 @@ public final class Condenser {
Map.Entry<MultiProtocolURI, String> entry; Map.Entry<MultiProtocolURI, String> entry;
if (indexText) { if (indexText) {
createCondensement(document.getText()); createCondensement(document.getText(), meaningLib);
// the phrase counter: // the phrase counter:
// phrase 0 are words taken from the URL // phrase 0 are words taken from the URL
// phrase 1 is the MainTitle // phrase 1 is the MainTitle
@ -140,15 +143,15 @@ public final class Condenser {
// phrase 99 is taken from the media Link url and anchor description // phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text // phrase 100 and above are lines from the text
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true); insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true); insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true); insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true); insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true); insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, RESULT_FLAGS, true, meaningLib);
// missing: tags! // missing: tags!
final String[] titles = document.getSectionTitles(); final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) { for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true); insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, RESULT_FLAGS, true, meaningLib);
} }
// anchors: for text indexing we add only the anchor description // anchors: for text indexing we add only the anchor description
@ -173,7 +176,7 @@ public final class Condenser {
} }
// add the URL components to the word list // add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false); insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, RESULT_FLAGS, false, meaningLib);
if (indexMedia) { if (indexMedia) {
// add anchor descriptions: here, we also add the url components // add anchor descriptions: here, we also add the url components
@ -181,24 +184,24 @@ public final class Condenser {
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator(); Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false); insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true); insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS, true, meaningLib);
} }
// video // video
i = document.getVideolinks().entrySet().iterator(); i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false); insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true); insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS, true, meaningLib);
} }
// applications // applications
i = document.getApplinks().entrySet().iterator(); i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false); insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true); insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS, true, meaningLib);
} }
// images // images
@ -206,8 +209,8 @@ public final class Condenser {
ImageEntry ientry; ImageEntry ientry;
while (j.hasNext()) { while (j.hasNext()) {
ientry = j.next(); ientry = j.next();
insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false); insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true); insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS, true, meaningLib);
} }
// finally check all words for missing flag entry // finally check all words for missing flag entry
@ -225,12 +228,18 @@ public final class Condenser {
} }
} }
private void insertTextToWords(final String text, final int phrase, final int flagpos, final Bitfield flagstemplate, boolean useForLanguageIdentification) { private void insertTextToWords(
final String text,
final int phrase,
final int flagpos,
final Bitfield flagstemplate,
boolean useForLanguageIdentification,
DidYouMeanLibrary meaningLib) {
String word; String word;
Word wprop; Word wprop;
sievedWordsEnum wordenum; sievedWordsEnum wordenum;
try { try {
wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8"))); wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes("UTF-8")), meaningLib);
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
return; return;
} }
@ -250,11 +259,11 @@ public final class Condenser {
} }
} }
public Condenser(final InputStream text) throws UnsupportedEncodingException { public Condenser(final InputStream text, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
this.languageIdentificator = null; // we don't need that here this.languageIdentificator = null; // we don't need that here
// analysis = new Properties(); // analysis = new Properties();
words = new TreeMap<String, Word>(); words = new TreeMap<String, Word>();
createCondensement(text); createCondensement(text, meaningLib);
} }
public int excludeWords(final TreeSet<String> stopwords) { public int excludeWords(final TreeSet<String> stopwords) {
@ -274,7 +283,7 @@ public final class Condenser {
return this.languageIdentificator.getLanguage(); return this.languageIdentificator.getLanguage();
} }
private void createCondensement(final InputStream is) throws UnsupportedEncodingException { private void createCondensement(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
final HashSet<String> currsentwords = new HashSet<String>(); final HashSet<String> currsentwords = new HashSet<String>();
StringBuilder sentence = new StringBuilder(100); StringBuilder sentence = new StringBuilder(100);
String word = ""; String word = "";
@ -293,7 +302,7 @@ public final class Condenser {
final HashMap<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100); final HashMap<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
// read source // read source
final sievedWordsEnum wordenum = new sievedWordsEnum(is); final sievedWordsEnum wordenum = new sievedWordsEnum(is, meaningLib);
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars? word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
if (languageIdentificator != null) languageIdentificator.add(word); if (languageIdentificator != null) languageIdentificator.add(word);
@ -467,11 +476,11 @@ public final class Condenser {
* @param sentence the sentence to be tokenized * @param sentence the sentence to be tokenized
* @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering * @return a ordered map containing word hashes as key and positions as value. The map is orderd by the hash ordering
*/ */
public static TreeMap<byte[], Integer> hashSentence(final String sentence) { public static TreeMap<byte[], Integer> hashSentence(final String sentence, DidYouMeanLibrary meaningLib) {
final TreeMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder); final TreeMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final Enumeration<StringBuilder> words = wordTokenizer(sentence, "UTF-8"); final Enumeration<String> words = wordTokenizer(sentence, "UTF-8", meaningLib);
int pos = 0; int pos = 0;
StringBuilder word; String word;
byte[] hash; byte[] hash;
Integer oldpos; Integer oldpos;
while (words.hasMoreElements()) { while (words.hasMoreElements()) {
@ -487,23 +496,25 @@ public final class Condenser {
return map; return map;
} }
public static Enumeration<StringBuilder> wordTokenizer(final String s, final String charset) { public static Enumeration<String> wordTokenizer(final String s, final String charset, DidYouMeanLibrary meaningLib) {
try { try {
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset))); return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes(charset)), meaningLib);
} catch (final Exception e) { } catch (final Exception e) {
return null; return null;
} }
} }
public static class sievedWordsEnum implements Enumeration<StringBuilder> { public static class sievedWordsEnum implements Enumeration<String> {
// this enumeration removes all words that contain either wrong characters or are too short // this enumeration removes all words that contain either wrong characters or are too short
StringBuilder buffer = null; StringBuilder buffer = null;
unsievedWordsEnum e; unsievedWordsEnum e;
DidYouMeanLibrary meaningLib;
public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException { public sievedWordsEnum(final InputStream is, DidYouMeanLibrary meaningLib) throws UnsupportedEncodingException {
e = new unsievedWordsEnum(is); this.e = new unsievedWordsEnum(is);
buffer = nextElement0(); this.buffer = nextElement0();
this.meaningLib = meaningLib;
} }
public void pre(final boolean x) { public void pre(final boolean x) {
@ -527,9 +538,11 @@ public final class Condenser {
return buffer != null; return buffer != null;
} }
public StringBuilder nextElement() { public String nextElement() {
final StringBuilder r = buffer; final String r = (buffer == null) ? null : buffer.toString();
buffer = nextElement0(); buffer = nextElement0();
// put word to words statistics cache
if (meaningLib != null) meaningLib.learn(r);
return r; return r;
} }
@ -710,7 +723,7 @@ public final class Condenser {
return s; return s;
} }
public static Map<String, Word> getWords(final String text) { public static Map<String, Word> getWords(final String text, DidYouMeanLibrary meaningLib) {
// returns a word/indexWord relation map // returns a word/indexWord relation map
if (text == null) return null; if (text == null) return null;
ByteArrayInputStream buffer; ByteArrayInputStream buffer;
@ -720,7 +733,7 @@ public final class Condenser {
buffer = new ByteArrayInputStream(text.getBytes()); buffer = new ByteArrayInputStream(text.getBytes());
} }
try { try {
return new Condenser(buffer).words(); return new Condenser(buffer, meaningLib).words();
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
return null; return null;
} }

@ -45,7 +45,7 @@ public class SnippetExtractor {
int linenumber = 0; int linenumber = 0;
int fullmatchcounter = 0; int fullmatchcounter = 0;
lookup: for (StringBuilder sentence: sentences) { lookup: for (StringBuilder sentence: sentences) {
hs = Condenser.hashSentence(sentence.toString()); hs = Condenser.hashSentence(sentence.toString(), null);
positions = new TreeSet<Integer>(); positions = new TreeSet<Integer>();
for (byte[] word: queryhashes) { for (byte[] word: queryhashes) {
pos = hs.get(word); pos = hs.get(word);
@ -124,7 +124,7 @@ public class SnippetExtractor {
byte[] hash; byte[] hash;
// find all hashes that appear in the sentence // find all hashes that appear in the sentence
final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence); final TreeMap<byte[], Integer> hs = Condenser.hashSentence(sentence, null);
final Iterator<byte[]> j = queryhashes.iterator(); final Iterator<byte[]> j = queryhashes.iterator();
Integer pos; Integer pos;
int p, minpos = sentence.length(), maxpos = -1; int p, minpos = sentence.length(), maxpos = -1;

@ -28,6 +28,8 @@ import java.io.UnsupportedEncodingException;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import de.anomic.data.LibraryProvider;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
@ -109,7 +111,7 @@ public class torrentParser extends AbstractParser implements Parser {
byte[] b = FileUtils.read(new File(args[0])); byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser(); torrentParser parser = new torrentParser();
Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", new ByteArrayInputStream(b)); Document[] d = parser.parse(new MultiProtocolURI("http://localhost/test.torrent"), null, "utf-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true); Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib);
Map<String, Word> w = c.words(); Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (IOException e) { } catch (IOException e) {

Loading…
Cancel
Save