Fix posInText ranking calculation to score 0 when no position info is available

+ fix Word posInText calculation in Tokenizer to start at 1, and add a test case
9 years ago · e310ec5f70
parent d14a9ee918
commit e310ec5f70
3 changed files with 42 additions and 3 deletions
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@ -170,7 +170,7 @@ public class Tokenizer {
                        wsp.inc();
                    } else {
                        // word does not yet exist, create new word entry
-                        wordHandle = wordHandleCount++;
+                        wordHandle = ++wordHandleCount; // let start pos with 1
                        wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
                        wsp.flags = this.RESULT_FLAGS.clone();
                        this.words.put(word.toLowerCase(), wsp);
--- a/source/net/yacy/search/ranking/ReferenceOrder.java
+++ b/source/net/yacy/search/ranking/ReferenceOrder.java
@ -228,13 +228,13 @@ public class ReferenceOrder {
        assert this.ranking != null;
        final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
        //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
-        final int maxmaxpos = this.max.maxposition();
+        final int maxmaxpos = this.max.maxposition(); // returns Integer.MIN_VALUE if positions empty
        final int minminpos = this.min.minposition();
        final long r =
             ((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
           + ((this.max.urlcomps()      == this.min.urlcomps()   )   ? 0 : (256 - (((t.urlcomps()     - this.min.urlcomps()     ) << 8) / (this.max.urlcomps()     - this.min.urlcomps())     )) << this.ranking.coeff_urlcomps)
           + ((this.max.urllength()     == this.min.urllength()  )   ? 0 : (256 - (((t.urllength()    - this.min.urllength()    ) << 8) / (this.max.urllength()    - this.min.urllength())    )) << this.ranking.coeff_urllength)
-           + ((maxmaxpos == minminpos)                               ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
+           + ((maxmaxpos == minminpos || maxmaxpos < 0)              ? 0 : (256 - (((t.minposition()  - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
           + ((this.max.posofphrase()   == this.min.posofphrase())   ? 0 : (256 - (((t.posofphrase()  - this.min.posofphrase()  ) << 8) / (this.max.posofphrase()  - this.min.posofphrase())  )) << this.ranking.coeff_posofphrase)
           + ((this.max.posinphrase()   == this.min.posinphrase())   ? 0 : (256 - (((t.posinphrase()  - this.min.posinphrase()  ) << 8) / (this.max.posinphrase()  - this.min.posinphrase())  )) << this.ranking.coeff_posinphrase)
           + ((this.max.distance()      == this.min.distance()   )   ? 0 : (256 - (((t.distance()     - this.min.distance()     ) << 8) / (this.max.distance()     - this.min.distance())     )) << this.ranking.coeff_worddistance)
--- a/test/java/net/yacy/document/TokenizerTest.java
+++ b/test/java/net/yacy/document/TokenizerTest.java
@ -0,0 +1,39 @@
+
+package net.yacy.document;
+
+import java.net.MalformedURLException;
+import java.util.Map;
+import net.yacy.cora.document.WordCache;
+import net.yacy.kelondro.data.word.Word;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+
+public class TokenizerTest {
+
+    /**
+     * Checks the posInText and occurrences() values the Tokenizer records for words of a sample sentence.
+     */
+    @Test
+    public void testWords() throws MalformedURLException {
+        //  pos  =      1   2   3   4       5        6      7    8   9    10     // 1-letter words don't count
+        String text = "One word is not a sentence because words are just words.";
+        WordCache meaningLib = new WordCache(null);
+        boolean doAutotagging = false;
+        VocabularyScraper scraper = null;
+
+        Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
+
+        Map<String, Word> words = t.words;
+
+        // 'word' appears once in the text: expect position 2 (positions start at 1) and one occurrence
+        Word w = words.get("word");
+        assertEquals("position of 'word' ", 2, w.posInText);
+        assertEquals("occurence of 'word' ", 1, w.occurrences());
+
+        w = words.get("words"); // appears twice (positions 7 and 10); expect the first position and two occurrences
+        assertEquals("position of 'words' ", 7, w.posInText);
+        assertEquals("occurence of 'words' ", 2, w.occurrences());
+    }
+
+}