Fix posInText ranking calculation to score 0 when no position info is available

+ fix Word posInText calculation in Tokenizer so that positions start at 1
+ test case
pull/93/head
reger 9 years ago
parent d14a9ee918
commit e310ec5f70

@ -170,7 +170,7 @@ public class Tokenizer {
wsp.inc();
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wordHandle = ++wordHandleCount; // let start pos with 1
wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word.toLowerCase(), wsp);

@ -228,13 +228,13 @@ public class ReferenceOrder {
assert this.ranking != null;
final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
final int maxmaxpos = this.max.maxposition();
final int maxmaxpos = this.max.maxposition(); // returns Integer.MIN_VALUE if positions empty
final int minminpos = this.min.minposition();
final long r =
((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
+ ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps)
+ ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength)
+ ((maxmaxpos == minminpos) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
+ ((maxmaxpos == minminpos || maxmaxpos < 0) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
+ ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase)
+ ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase)
+ ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance)

@ -0,0 +1,39 @@
package net.yacy.document;
import java.net.MalformedURLException;
import java.util.Map;
import net.yacy.cora.document.WordCache;
import net.yacy.kelondro.data.word.Word;
import org.junit.Test;
import static org.junit.Assert.*;
public class TokenizerTest {

    /**
     * Test of words method, of class Tokenizer.
     *
     * Tokenizes a fixed sentence and checks, for two of its words, the
     * recorded position in the text and the number of occurrences.
     * Each looked-up entry is asserted to be non-null first, so that a word
     * missing from the result is reported as a descriptive assertion failure
     * rather than a NullPointerException.
     *
     * @throws MalformedURLException declared by the original test signature
     */
    @Test
    public void testWords() throws MalformedURLException {
        // Expected word positions (1-letter words don't count, so "a" is skipped):
        //   One=1 word=2 is=3 not=4 sentence=5 because=6 words=7 are=8 just=9 words=10
        String text = "One word is not a sentence because words are just words.";

        WordCache meaningLib = new WordCache(null);
        boolean doAutotagging = false;
        VocabularyScraper scraper = null;

        Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
        Map<String, Word> words = t.words;
        assertNotNull("tokenizer word map", words);

        // test extracted word information (position)
        Word w = words.get("word");
        assertNotNull("entry for 'word' ", w);
        assertEquals("position of 'word' ", 2, w.posInText);
        assertEquals("occurence of 'word' ", 1, w.occurrences());

        // "words" appears twice; the asserted position (7) is that of its first appearance
        w = words.get("words");
        assertNotNull("entry for 'words' ", w);
        assertEquals("position of 'words' ", 7, w.posInText);
        assertEquals("occurence of 'words' ", 2, w.occurrences());
    }
}
Loading…
Cancel
Save