From 15e3a57b4e650e83dfaa5a64b8f8c33b7b1ac6cb Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 5 May 2011 09:23:10 +0000 Subject: [PATCH] removed unused functions in condenser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7698 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/net/yacy/document/Condenser.java | 95 ++----------------------- 1 file changed, 7 insertions(+), 88 deletions(-) diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 2c9729c5f..00e7c9538 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -279,18 +279,15 @@ public final class Condenser { private void createCondensement(final InputStream is, final WordCache meaningLib) { assert is != null; final Set currsentwords = new HashSet(); - StringBuilder sentence = new StringBuilder(100); String word = ""; String k; int wordlen; Word wsp, wsp1; - Phrase psp; int wordHandle; int wordHandleCount = 0; int sentenceHandleCount = 0; int allwordcounter = 0; int allsentencecounter = 0; - int idx; int wordInSentenceCounter = 1; boolean comb_indexof = false, last_last = false, last_index = false; final Map sentences = new HashMap(100); @@ -298,58 +295,32 @@ public final class Condenser { // read source final WordTokenizer wordenum = new WordTokenizer(is, meaningLib); while (wordenum.hasMoreElements()) { - word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars? + word = wordenum.nextElement().toLowerCase(Locale.ENGLISH); if (languageIdentificator != null) languageIdentificator.add(word); if (word.length() < wordminsize) continue; // distinguish punctuation and words wordlen = word.length(); - Iterator it; - if ((wordlen == 1) && (SentenceReader.punctuation(word.charAt(0)))) { + if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // store sentence - if (sentence.length() > 0) { - // we store the punctuation symbol as first element of the sentence vector - allsentencecounter++; - sentence.insert(0, word); // append at beginning - if (sentences.containsKey(sentence)) { - // sentence already exists - psp = sentences.get(sentence); - psp.inc(); - idx = psp.handle(); - sentences.put(sentence, psp); - } else { - // create new sentence - idx = sentenceHandleCount++; - sentences.put(sentence, new Phrase(idx)); - } - // store to the words a link to this sentence - it = currsentwords.iterator(); - while (it.hasNext()) { - k = it.next(); - wsp = words.get(k); - wsp.check(idx); - words.put(k, wsp); // is that necessary? - } - } - sentence = new StringBuilder(100); currsentwords.clear(); wordInSentenceCounter = 1; } else { // check index.of detection - if ((last_last) && (comb_indexof) && (word.equals("modified"))) { + if (last_last && comb_indexof && word.equals("modified")) { this.RESULT_FLAGS.set(flag_cat_indexof, true); wordenum.pre(true); // parse lines as they come with CRLF } - if ((last_index) && (wordminsize > 2 || (word.equals("of")))) comb_indexof = true; + if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; last_last = word.equals("last"); last_index = word.equals("index"); // store word allwordcounter++; currsentwords.add(word); - if (words.containsKey(word)) { + wsp = words.get(word); + if (wsp != null) { // word already exists - wsp = words.get(word); wordHandle = wsp.posInText; wsp.inc(); } else { @@ -357,50 +328,12 @@ public final class Condenser { wordHandle = wordHandleCount++; wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100); wsp.flags = RESULT_FLAGS.clone(); + words.put(word, wsp); } - words.put(word, wsp); // we now have the unique handle of the word, put it into the sentence: - sentence.append(intStringFormatter.format(wordHandle)); wordInSentenceCounter++; } } - // finish last sentence - if (sentence.length() > 0) { - allsentencecounter++; - sentence.insert(0, "."); // append at beginning - if (sentences.containsKey(sentence)) { - psp = sentences.get(sentence); - psp.inc(); - sentences.put(sentence, psp); - } else { - sentences.put(sentence, new Phrase(sentenceHandleCount++)); - } - } - - // we reconstruct the sentence hashtable - // and order the entries by the number of the sentence - // this structure is needed to replace double occurring words in sentences - final Object[] orderedSentences = new Object[sentenceHandleCount]; - String[] s; - int wc; - Object o; - final Iterator sit = sentences.keySet().iterator(); - while (sit.hasNext()) { - o = sit.next(); - if (o != null) { - sentence = (StringBuilder) o; - wc = (sentence.length() - 1) / numlength; - s = new String[wc + 2]; - psp = sentences.get(sentence); - s[0] = intStringFormatter.format(psp.occurrences()); // number of occurrences of this sentence - s[1] = sentence.substring(0, 1); // the termination symbol of this sentence - for (int i = 0; i < wc; i++) { - k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); - s[i + 2] = k; - } - orderedSentences[psp.handle()] = s; - } - } if (pseudostemming) { Map.Entry entry; @@ -416,20 +349,6 @@ public final class Condenser { if (wordlen > i) { k = word.substring(0, wordlen - i); if (words.containsKey(k)) { - // we will delete the word 'word' and repoint the - // corresponding links - // in sentences that use this word - wsp1 = words.get(k); - final Iterator it1 = wsp.phrases(); // we iterate over all sentences that refer to this word - while (it1.hasNext()) { - idx = it1.next().intValue(); // number of a sentence - s = (String[]) orderedSentences[idx]; - for (int j = 2; j < s.length; j++) { - if (s[j].equals(intStringFormatter.format(wsp.posInText))) - s[j] = intStringFormatter.format(wsp1.posInText); - } - orderedSentences[idx] = s; - } // update word counter wsp1.count = wsp1.count + wsp.count; words.put(k, wsp1);