From 272cdd496a33a4c282443de7e70d4653613b96be Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 7 Sep 2016 02:16:16 +0200 Subject: [PATCH] reactivate sentence counter in WordTokenizer for phrasepos ranking, by counting punctuation (delivered as 1 char word) again. --- source/net/yacy/document/Tokenizer.java | 66 +++++++++---------- source/net/yacy/document/WordTokenizer.java | 10 ++- .../net/yacy/document/WordTokenizerTest.java | 8 ++- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java index ca5591795..7e27492dd 100644 --- a/source/net/yacy/document/Tokenizer.java +++ b/source/net/yacy/document/Tokenizer.java @@ -78,7 +78,7 @@ public class Tokenizer { int wordHandleCount = 0; //final int sentenceHandleCount = 0; int allwordcounter = 0; - final int allsentencecounter = 0; + int allsentencecounter = 0; int wordInSentenceCounter = 1; boolean comb_indexof = false, last_last = false, last_index = false; //final Map sentences = new HashMap(100); @@ -89,6 +89,14 @@ public class Tokenizer { try { while (wordenum.hasMoreElements()) { String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); + // handle punctuation (start new sentence) + if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) { + // store sentence + currsentwords.clear(); + wordInSentenceCounter = 1; + allsentencecounter++; + continue; + } if (word.length() < wordminsize) continue; // get tags from autotagging @@ -144,40 +152,32 @@ public class Tokenizer { System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1); wordcache[wordcache.length - 1] = word; - // distinguish punctuation and words - wordlen = word.length(); - if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize ) - // store sentence - currsentwords.clear(); - wordInSentenceCounter = 1; + // check index.of detection + if (last_last && 
comb_indexof && word.equals("modified")) { + this.RESULT_FLAGS.set(flag_cat_indexof, true); + wordenum.pre(true); // parse lines as they come with CRLF + } + if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; + last_last = word.equals("last"); + last_index = word.equals("index"); + + // store word + allwordcounter++; + currsentwords.add(word); + Word wsp = this.words.get(word); + if (wsp != null) { + // word already exists + wordHandle = wsp.posInText; + wsp.inc(); } else { - // check index.of detection - if (last_last && comb_indexof && word.equals("modified")) { - this.RESULT_FLAGS.set(flag_cat_indexof, true); - wordenum.pre(true); // parse lines as they come with CRLF - } - if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; - last_last = word.equals("last"); - last_index = word.equals("index"); - - // store word - allwordcounter++; - currsentwords.add(word); - Word wsp = this.words.get(word); - if (wsp != null) { - // word already exists - wordHandle = wsp.posInText; - wsp.inc(); - } else { - // word does not yet exist, create new word entry - wordHandle = ++wordHandleCount; // let start pos with 1 - wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100); - wsp.flags = this.RESULT_FLAGS.clone(); - this.words.put(word.toLowerCase(), wsp); - } - // we now have the unique handle of the word, put it into the sentence: - wordInSentenceCounter++; + // word does not yet exist, create new word entry + wordHandle = ++wordHandleCount; // let start pos with 1 + wsp = new Word(wordHandle, wordInSentenceCounter, allsentencecounter + 100); // normal sentences start at 100 ! 
+ wsp.flags = this.RESULT_FLAGS.clone(); + this.words.put(word.toLowerCase(), wsp); } + // we now have the unique handle of the word, put it into the sentence: + wordInSentenceCounter++; } } finally { wordenum.close(); diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 69d78ae71..25caf88ac 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -56,7 +56,7 @@ public class WordTokenizer implements Enumeration { private StringBuilder nextElement0() { StringBuilder s; while (this.e.hasMoreElements()) { - s = this.e.nextElement(); // next word (punctuation and invisible chars filtered) + s = this.e.nextElement(); // next word (invisible chars filtered) return s; } return null; @@ -118,7 +118,13 @@ public class WordTokenizer implements Enumeration { for (int i = 0; i < r.length(); i++) { // tokenize one sentence c = r.charAt(i); if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible - if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);} + if (sb.length() > 0) { + this.s.add(sb); + sb = new StringBuilder(1); + } + sb.append(c); + this.s.add(sb); + sb = new StringBuilder(20); } else if (SentenceReader.invisible(c)) { // ! 
currently punctuation again checked by invisible() if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);} } else { diff --git a/test/java/net/yacy/document/WordTokenizerTest.java b/test/java/net/yacy/document/WordTokenizerTest.java index c32e71ead..7f4250953 100644 --- a/test/java/net/yacy/document/WordTokenizerTest.java +++ b/test/java/net/yacy/document/WordTokenizerTest.java @@ -22,8 +22,12 @@ public class WordTokenizerTest { int cnt = 0; while (wt.hasMoreElements()) { StringBuilder sb = wt.nextElement(); - assertEquals("word", sb.toString()); - cnt++; + if (sb.length() > 1) { // skip punctuation + assertEquals("word", sb.toString()); + cnt++; + } else { + assertTrue("punktuation", SentenceReader.punctuation(sb.charAt(0))); + } } wt.close(); assertEquals(10, cnt);