diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 982dbc9a3..54df12232 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -80,9 +80,9 @@ public final class Condenser { private String fuzzy_signature_text = null; // signatures for double-check detection public int RESULT_NUMB_WORDS = -1; - public int RESULT_DIFF_WORDS = -1; + //public int RESULT_DIFF_WORDS = -1; public int RESULT_NUMB_SENTENCES = -1; - public int RESULT_DIFF_SENTENCES = -1; + //public int RESULT_DIFF_SENTENCES = -1; public Bitfield RESULT_FLAGS = new Bitfield(4); private final Identificator languageIdentificator; @@ -157,9 +157,9 @@ public final class Condenser { */ } else { this.RESULT_NUMB_WORDS = 0; - this.RESULT_DIFF_WORDS = 0; + //this.RESULT_DIFF_WORDS = 0; this.RESULT_NUMB_SENTENCES = 0; - this.RESULT_DIFF_SENTENCES = 0; + //this.RESULT_DIFF_SENTENCES = 0; } if (indexMedia) { @@ -274,7 +274,7 @@ public final class Condenser { this.words.put(word.toLowerCase(), wprop); pip++; this.RESULT_NUMB_WORDS++; - this.RESULT_DIFF_WORDS++; + //this.RESULT_DIFF_WORDS++; } } finally { wordenum.close(); @@ -330,12 +330,12 @@ public final class Condenser { final Word wsp1; int wordHandle; int wordHandleCount = 0; - final int sentenceHandleCount = 0; + //final int sentenceHandleCount = 0; int allwordcounter = 0; final int allsentencecounter = 0; int wordInSentenceCounter = 1; boolean comb_indexof = false, last_last = false, last_index = false; - final Map sentences = new HashMap(100); + //final Map sentences = new HashMap(100); if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false; // read source @@ -379,7 +379,7 @@ public final class Condenser { // distinguish punctuation and words wordlen = word.length(); - if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { + if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize ) // store sentence currsentwords.clear(); wordInSentenceCounter = 1; @@ -404,7 +404,7 @@ public final class Condenser { } else { // word does not yet exist, create new word entry wordHandle = wordHandleCount++; - wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100); + wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100); wsp.flags = this.RESULT_FLAGS.clone(); this.words.put(word.toLowerCase(), wsp); } @@ -446,9 +446,9 @@ public final class Condenser { // store result //this.RESULT_NUMB_TEXT_BYTES = wordenum.count(); this.RESULT_NUMB_WORDS = allwordcounter; - this.RESULT_DIFF_WORDS = wordHandleCount; + //this.RESULT_DIFF_WORDS = wordHandleCount; this.RESULT_NUMB_SENTENCES = allsentencecounter; - this.RESULT_DIFF_SENTENCES = sentenceHandleCount; + //this.RESULT_DIFF_SENTENCES = sentenceHandleCount; } public static Map getWords(final String text, final WordCache meaningLib) { diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 8614c10ff..6e8a23d32 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -55,12 +55,8 @@ public class WordTokenizer implements Enumeration { private StringBuilder nextElement0() { StringBuilder s; - loop: while (this.e.hasMoreElements()) { - s = this.e.nextElement(); - if ((s.length() == 1) && (SentenceReader.punctuation(s.charAt(0)))) return s; - for (int i = 0; i < s.length(); i++) { - if (SentenceReader.invisible(s.charAt(i))) continue loop; - } + while (this.e.hasMoreElements()) { + s = this.e.nextElement(); // next word (punctuation and invisible chars filtered) return s; } return null; @@ -86,7 +82,7 @@ public class WordTokenizer implements Enumeration { this.buffer = null; } - private static class unsievedWordsEnum implements Enumeration { + private class unsievedWordsEnum implements Enumeration { // returns an enumeration of StringBuilder Objects private StringBuilder buffer = null; private SentenceReader sr; @@ -115,19 +111,16 @@ public class WordTokenizer implements Enumeration { } while (this.s.isEmpty()) { if (!this.sr.hasNext()) return null; - r = this.sr.next(); + r = this.sr.next(); // read next sentence (incl. ending punctuation) if (r == null) return null; r = trim(r); sb = new StringBuilder(20); - for (int i = 0; i < r.length(); i++) { + for (int i = 0; i < r.length(); i++) { // tokenize one sentence c = r.charAt(i); - if (SentenceReader.invisible(c)) { + if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible + if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);} + } else if (SentenceReader.invisible(c)) { // ! currently punctuation again checked by invisible() if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);} - } else if (SentenceReader.punctuation(c)) { - if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(1);} - sb.append(c); - this.s.add(sb); - sb = new StringBuilder(20); } else { sb = sb.append(c); } @@ -157,8 +150,8 @@ public class WordTokenizer implements Enumeration { this.sIndex = 0; this.s.clear(); this.s = null; - this.sr.close(); - this.sr = null; + this.sr.close(); + this.sr = null; } }