reactivate sentence counter in WordTokenizer for phrasepos ranking,

by counting punctuation (delivered as a one-character word) again.
9 years ago · 272cdd496a
parent 5e165a8150
commit 272cdd496a
3 changed files with 47 additions and 37 deletions
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@ -78,7 +78,7 @@ public class Tokenizer {
        int wordHandleCount = 0;
        //final int sentenceHandleCount = 0;
        int allwordcounter = 0;
-        final int allsentencecounter = 0;
+        int allsentencecounter = 0;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        //final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
@ -89,6 +89,14 @@ public class Tokenizer {
        try {
            while (wordenum.hasMoreElements()) {
                String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
+                // handle punctuation (start new sentence)
+                if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
+                    // store sentence
+                    currsentwords.clear();
+                    wordInSentenceCounter = 1;
+                    allsentencecounter++;
+                    continue;
+                }
                if (word.length() < wordminsize) continue;

                // get tags from autotagging
@ -144,40 +152,32 @@ public class Tokenizer {
                System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
                wordcache[wordcache.length - 1] = word;

-                // distinguish punctuation and words
-                wordlen = word.length();
-                if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize )
-                    // store sentence
-                    currsentwords.clear();
-                    wordInSentenceCounter = 1;
+                // check index.of detection
+                if (last_last && comb_indexof && word.equals("modified")) {
+                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
+                    wordenum.pre(true); // parse lines as they come with CRLF
+                }
+                if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
+                last_last = word.equals("last");
+                last_index = word.equals("index");
+
+                // store word
+                allwordcounter++;
+                currsentwords.add(word);
+                Word wsp = this.words.get(word);
+                if (wsp != null) {
+                    // word already exists
+                    wordHandle = wsp.posInText;
+                    wsp.inc();
                } else {
-                    // check index.of detection
-                    if (last_last && comb_indexof && word.equals("modified")) {
-                        this.RESULT_FLAGS.set(flag_cat_indexof, true);
-                        wordenum.pre(true); // parse lines as they come with CRLF
-                    }
-                    if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
-                    last_last = word.equals("last");
-                    last_index = word.equals("index");
-
-                    // store word
-                    allwordcounter++;
-                    currsentwords.add(word);
-                    Word wsp = this.words.get(word);
-                    if (wsp != null) {
-                        // word already exists
-                        wordHandle = wsp.posInText;
-                        wsp.inc();
-                    } else {
-                        // word does not yet exist, create new word entry
-                        wordHandle = ++wordHandleCount; // let start pos with 1
-                        wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100);
-                        wsp.flags = this.RESULT_FLAGS.clone();
-                        this.words.put(word.toLowerCase(), wsp);
-                    }
-                    // we now have the unique handle of the word, put it into the sentence:
-                    wordInSentenceCounter++;
+                    // word does not yet exist, create new word entry
+                    wordHandle = ++wordHandleCount; // let start pos with 1
+                    wsp = new Word(wordHandle, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
+                    wsp.flags = this.RESULT_FLAGS.clone();
+                    this.words.put(word.toLowerCase(), wsp);
                }
+                // we now have the unique handle of the word, put it into the sentence:
+                wordInSentenceCounter++;
            }
        } finally {
            wordenum.close();
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@ -56,7 +56,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
    private StringBuilder nextElement0() {
        StringBuilder s;
        while (this.e.hasMoreElements()) {
-            s = this.e.nextElement(); // next word (punctuation and invisible chars filtered)
+            s = this.e.nextElement(); // next word (invisible chars filtered)
            return s;
        }
        return null;
@ -118,7 +118,13 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
                for (int i = 0; i < r.length(); i++) { // tokenize one sentence
                    c = r.charAt(i);
                    if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible
-                        if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
+                        if (sb.length() > 0) {
+                            this.s.add(sb);
+                            sb = new StringBuilder(1);
+                        }
+                        sb.append(c);
+                        this.s.add(sb);
+                        sb = new StringBuilder(20);
                    } else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible()
                        if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);}
                    } else {
--- a/test/java/net/yacy/document/WordTokenizerTest.java
+++ b/test/java/net/yacy/document/WordTokenizerTest.java
@ -22,8 +22,12 @@ public class WordTokenizerTest {
            int cnt = 0;
            while (wt.hasMoreElements()) {
                StringBuilder sb = wt.nextElement();
-                assertEquals("word", sb.toString());
-                cnt++;
+                if (sb.length() > 1) { // skip punktuation
+                    assertEquals("word", sb.toString());
+                    cnt++;
+                } else {
+                    assertTrue("punktuation", SentenceReader.punctuation(sb.charAt(0)));
+                }
            }
            wt.close();
            assertEquals(10, cnt);