Adjust Tokenizer sentence count to ignore repeated punctuation (like "!!!!")

+ remove unused set of current-sentence words (only the count is used) + add test case for sentence count
9 years ago · ae3717d087
parent b5eb7a9217
commit ae3717d087
2 changed files with 25 additions and 5 deletions
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@ -68,7 +68,6 @@ public class Tokenizer {
        this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
        this.synonyms = new LinkedHashSet<String>();
        assert text != null;
-        final Set<String> currsentwords = new HashSet<String>();
        String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
        for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
        String k;
@ -89,9 +88,9 @@ public class Tokenizer {
                // handle punktuation (start new sentence)
                if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
                    // store sentence
-                    currsentwords.clear();
+                    if (wordInSentenceCounter > 1) // if no word in sentence repeated punktuation ".....", don't count as sentence
+                        allsentencecounter++;
                    wordInSentenceCounter = 1;
-                    allsentencecounter++;
                    continue;
                }
                if (word.length() < wordminsize) continue;
@ -160,7 +159,6 @@ public class Tokenizer {

                // store word
                allwordcounter++;
-                currsentwords.add(word);
                Word wsp = this.words.get(word);
                if (wsp != null) {
                    // word already exists
@ -214,7 +212,7 @@ public class Tokenizer {
        // store result
        this.RESULT_NUMB_WORDS = allwordcounter;
        // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
-        this.RESULT_NUMB_SENTENCES = allsentencecounter + (currsentwords.size() > 0 ? 1 : 0);
+        this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
    }
    
    public Map<String, Word> words() {
--- a/test/java/net/yacy/document/TokenizerTest.java
+++ b/test/java/net/yacy/document/TokenizerTest.java
@ -2,7 +2,9 @@
 package net.yacy.document;

 import java.net.MalformedURLException;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 import net.yacy.cora.document.WordCache;
 import net.yacy.kelondro.data.word.Word;
 import org.junit.Test;
@ -36,4 +38,24 @@ public class TokenizerTest {
        assertEquals("occurence of 'words' ", 2, w.occurrences());
    }

+    /**
+     * Test of RESULT_NUMB_SENTENCES, of class Tokenizer.
+     */
+    @Test
+    public void testNumberOfSentences() {
+        Set<String> testText = new HashSet();
+        // text with 5 sentences
+        testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................");
+        testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text");
+        testText.add("!!! ! ! ! Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence 5 ! ! ! !!!");
+
+        WordCache meaningLib = new WordCache(null);
+        boolean doAutotagging = false;
+        VocabularyScraper scraper = null;
+        for (String text : testText) {
+            Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
+            System.out.println(t.RESULT_NUMB_WORDS);
+            assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES);
+        }
+    }
 }