From 272cdd496a33a4c282443de7e70d4653613b96be Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Wed, 7 Sep 2016 02:16:16 +0200
Subject: [PATCH] reactivate sentence counter in WordTokenizer for phrasepos
 ranking, by counting punctuation (delivered as 1 char word) again.

---
 source/net/yacy/document/Tokenizer.java       | 66 +++++++++----------
 source/net/yacy/document/WordTokenizer.java   | 10 ++-
 .../net/yacy/document/WordTokenizerTest.java  |  8 ++-
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java
index ca5591795..7e27492dd 100644
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@@ -78,7 +78,7 @@ public class Tokenizer {
         int wordHandleCount = 0;
         //final int sentenceHandleCount = 0;
         int allwordcounter = 0;
-        final int allsentencecounter = 0;
+        int allsentencecounter = 0;
         int wordInSentenceCounter = 1;
         boolean comb_indexof = false, last_last = false, last_index = false;
         //final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
@@ -89,6 +89,14 @@ public class Tokenizer {
         try {
             while (wordenum.hasMoreElements()) {
                 String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
+                // handle punctuation (start new sentence)
+                if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
+                    // store sentence
+                    currsentwords.clear();
+                    wordInSentenceCounter = 1;
+                    allsentencecounter++;
+                    continue;
+                }
                 if (word.length() < wordminsize) continue;
 
                 // get tags from autotagging
@@ -144,40 +152,32 @@ public class Tokenizer {
                 System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
                 wordcache[wordcache.length - 1] = word;
 
-                // distinguish punctuation and words
-                wordlen = word.length();
-                if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
-                    // store sentence
-                    currsentwords.clear();
-                    wordInSentenceCounter = 1;
+                // check index.of detection
+                if (last_last && comb_indexof && word.equals("modified")) {
+                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
+                    wordenum.pre(true); // parse lines as they come with CRLF
+                }
+                if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
+                last_last = word.equals("last");
+                last_index = word.equals("index");
+
+                // store word
+                allwordcounter++;
+                currsentwords.add(word);
+                Word wsp = this.words.get(word);
+                if (wsp != null) {
+                    // word already exists
+                    wordHandle = wsp.posInText;
+                    wsp.inc();
                 } else {
-                    // check index.of detection
-                    if (last_last && comb_indexof && word.equals("modified")) {
-                        this.RESULT_FLAGS.set(flag_cat_indexof, true);
-                        wordenum.pre(true); // parse lines as they come with CRLF
-                    }
-                    if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
-                    last_last = word.equals("last");
-                    last_index = word.equals("index");
-
-                    // store word
-                    allwordcounter++;
-                    currsentwords.add(word);
-                    Word wsp = this.words.get(word);
-                    if (wsp != null) {
-                        // word already exists
-                        wordHandle = wsp.posInText;
-                        wsp.inc();
-                    } else {
-                        // word does not yet exist, create new word entry
-                        wordHandle = ++wordHandleCount; // let start pos with 1
-                        wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
-                        wsp.flags = this.RESULT_FLAGS.clone();
-                        this.words.put(word.toLowerCase(), wsp);
-                    }
-                    // we now have the unique handle of the word, put it into the sentence:
-                    wordInSentenceCounter++;
+                    // word does not yet exist, create new word entry
+                    wordHandle = ++wordHandleCount; // let start pos with 1
+                    wsp = new Word(wordHandle, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
+                    wsp.flags = this.RESULT_FLAGS.clone();
+                    this.words.put(word.toLowerCase(), wsp);
                 }
+                // we now have the unique handle of the word, put it into the sentence:
+                wordInSentenceCounter++;
             }
         } finally {
             wordenum.close();
diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java
index 69d78ae71..25caf88ac 100644
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@@ -56,7 +56,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
     private StringBuilder nextElement0() {
         StringBuilder s;
         while (this.e.hasMoreElements()) {
-            s = this.e.nextElement(); // next word (punctuation and invisible chars filtered)
+            s = this.e.nextElement(); // next word (invisible chars filtered)
             return s;
         }
         return null;
@@ -118,7 +118,13 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
                 for (int i = 0; i < r.length(); i++) { // tokenize one sentence
                     c = r.charAt(i);
                     if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
-                        if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
+                        if (sb.length() > 0) {
+                            this.s.add(sb);
+                            sb = new StringBuilder(1);
+                        }
+                        sb.append(c);
+                        this.s.add(sb);
+                        sb = new StringBuilder(20);
                     } else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
                         if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
                     } else {
diff --git a/test/java/net/yacy/document/WordTokenizerTest.java b/test/java/net/yacy/document/WordTokenizerTest.java
index c32e71ead..7f4250953 100644
--- a/test/java/net/yacy/document/WordTokenizerTest.java
+++ b/test/java/net/yacy/document/WordTokenizerTest.java
@@ -22,8 +22,12 @@ public class WordTokenizerTest {
             int cnt = 0;
             while (wt.hasMoreElements()) {
                 StringBuilder sb = wt.nextElement();
-                assertEquals("word", sb.toString());
-                cnt++;
+                if (sb.length() > 1) { // skip punktuation
+                    assertEquals("word", sb.toString());
+                    cnt++;
+                } else {
+                    assertTrue("punktuation", SentenceReader.punctuation(sb.charAt(0)));
+                }
             }
             wt.close();
             assertEquals(10, cnt);