From 474f0476c66dabaf118502478ef82e7e36a9d2e1 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 5 Oct 2016 05:52:37 +0200 Subject: [PATCH] adjust Tokenizer sentence count on trailing text after last recognized sentence + upd test case for rwi multi-word-query (leaving results known to fail untested) --- source/net/yacy/document/Tokenizer.java | 3 +- .../net/yacy/search/index/SegmentTest.java | 42 +++++++++++-------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java index 4b34d7f18..0420194f5 100644 --- a/source/net/yacy/document/Tokenizer.java +++ b/source/net/yacy/document/Tokenizer.java @@ -213,7 +213,8 @@ public class Tokenizer { // store result this.RESULT_NUMB_WORDS = allwordcounter; - this.RESULT_NUMB_SENTENCES = allsentencecounter; + // if text doesn't end with punctuation but has words after last found sentence, inc sentence count for trailing text. + this.RESULT_NUMB_SENTENCES = allsentencecounter + (currsentwords.size() > 0 ? 
1 : 0); } public Map words() { diff --git a/test/java/net/yacy/search/index/SegmentTest.java b/test/java/net/yacy/search/index/SegmentTest.java index 183ce2265..9ebea042b 100644 --- a/test/java/net/yacy/search/index/SegmentTest.java +++ b/test/java/net/yacy/search/index/SegmentTest.java @@ -3,7 +3,6 @@ package net.yacy.search.index; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.util.Iterator; import java.util.Map; import net.yacy.cora.document.WordCache; import net.yacy.cora.document.encoding.UTF8; @@ -23,9 +22,9 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.rwi.TermSearch; import net.yacy.kelondro.util.Bitfield; -import static net.yacy.search.index.Segment.catchallWord; import net.yacy.search.query.QueryGoal; import org.junit.AfterClass; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import org.junit.BeforeClass; import org.junit.Test; @@ -156,7 +155,11 @@ public class SegmentTest { // creates one test url with this text in the rwi index DigestURL url = new DigestURL("http://test.org/test.html"); - storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five"); + storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. 
One two three for five."); + // posintext 1 2 3 4 5 6 7 8 9 + // hitcount ("five") 1 1 2 + // posofphrase |-------100------------| |------101---------| |--------102----------| + // posinphrase 1 2 3 4 5 1 2 3 4 1 2 3 4 5 // create a query to get the search word hashsets QueryGoal qg = new QueryGoal("five test "); @@ -175,23 +178,26 @@ public class SegmentTest { assertTrue("test url hash in result set", wc.has(url.hash())); // the returned WordReference is expected to be a joined Reference with properties set used in ranking - Iterator it = wc.entries(); - System.out.println("-----------------"); + WordReference r = wc.getReference(url.hash()); + + // min position of search word in text (posintext) + assertEquals("minposition('five')", 5, r.minposition()); + // occurrence of search words in text + assertEquals("hitcount('five')", 2, r.hitcount()); + + // phrase counts + assertEquals("phrasesintext", 3, r.phrasesintext()); + assertEquals("posofphrase", 100, r.posofphrase()); + assertEquals("posinphrase", 5, r.posinphrase()); // currently the results are not as expected for a multi-word query - while (it.hasNext()) { - WordReference r = it.next(); - // expected to be 1st in text - System.out.println("posintext=" + r.positions() + " (expected=5)"); - // min position of search word in text - System.out.println("minposition=" + r.minposition() + " (expected=5)"); - // max position of search word in text - System.out.println("maxposition=" + r.maxposition() + " (expected=8)"); - // for a multiword query distance expected to be the avg of search word positions in text - System.out.println("distance=" + r.distance() + " (expected=3)"); - // occurence of search words in text - System.out.println("hitcount=" + r.hitcount() + " (expected=2)"); - } + // (reason: Reference container is backed by ReferenceRow (which doesn't hold positions of joined references) ergo can't return related results) + System.out.println("-----------------"); + System.out.println("positions=" + 
r.positions() + " (expected=5,8)"); + // max position of search word in text + System.out.println("maxposition=" + r.maxposition() + " (expected=8)"); + // for a multiword query distance expected to be the avg of search word positions in text + System.out.println("distance=" + r.distance() + " (expected=3)"); System.out.println("-----------------"); }