From f1cfee7703b7e7a53ddc0d6a1394ef6f805cc95c Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 19 Jan 2006 12:24:35 +0000 Subject: [PATCH] removed tabs from condenser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1376 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaCondenser.java | 963 ++++++++++--------- 1 file changed, 497 insertions(+), 466 deletions(-) diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 594c92cc0..e4b1965c9 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -117,124 +117,134 @@ public final class plasmaCondenser { // number of occurrences of one word // if the word did not occur, this simply returns 0 statProp sp = (statProp) words.get(word); - if (sp == null) return 0; + if (sp == null) + return 0; return sp.count; } public static class statProp { - public int count; - public int handle; - public HashSet hash; - public statProp(int handle) { - this.count = 1; - this.handle = handle; - this.hash = new HashSet(); - } - public void inc() {count++;} - public void check(int i) {hash.add(Integer.toString(i));} - + public int count; + + public int handle; + + public HashSet hash; + + public statProp(int handle) { + this.count = 1; + this.handle = handle; + this.hash = new HashSet(); + } + + public void inc() { + count++; + } + + public void check(int i) { + hash.add(Integer.toString(i)); + } + } public String intString(int number, int length) { - String s = Integer.toString(number); - while (s.length() < length) s = "0" + s; - return s; + String s = Integer.toString(number); + while (s.length() < length) s = "0" + s; + return s; } private void createCondensement(InputStream is) { - words = new TreeMap(kelondroNaturalOrder.naturalOrder); - sentences = new HashMap(); - HashSet currsentwords = new HashSet(); - StringBuffer sentence = new StringBuffer(100); - String word = ""; - String k; - int wordlen; - statProp sp, sp1; - int wordHandle; - int wordHandleCount = 0; - int sentenceHandleCount = 0; - int allwordcounter = 0; - int allsentencecounter = 0; - int idx; - Iterator it, it1; - - // read source - sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize); - while (wordenum.hasMoreElements()) { - word = ((String) wordenum.nextElement()).toLowerCase(); - //System.out.println("PARSED-WORD " + word); - wordlen = word.length(); - if ((wordlen == 1) && (punctuation(word.charAt(0)))) { - // store sentence - if (sentence.length() > 0) { - // we store the punctuation symbol as first element of the sentence vector - allsentencecounter++; - sentence.insert(0, word); // append at beginning - if (sentences.containsKey(sentence)) { - // sentence already exists - sp = (statProp) sentences.get(sentence); - sp.inc(); - idx = sp.handle; - sentences.put(sentence, sp); - } else { - // create new sentence - idx = sentenceHandleCount++; - sentences.put(sentence, new statProp(idx)); - } - // store to the words a link to this sentence - it = currsentwords.iterator(); - while (it.hasNext()) { - k = (String) it.next(); - sp = (statProp) words.get(k); - sp.check(idx); - words.put(k,sp); - } - } - sentence = new StringBuffer(100); - currsentwords.clear(); - } else { - // store word - allwordcounter++; - currsentwords.add(word); - if (words.containsKey(word)) { - // word already exists - sp = (statProp) words.get(word); - wordHandle = sp.handle; - sp.inc(); - } else { - // word does not yet exist, create new word entry - wordHandle = 
wordHandleCount++; - sp = new statProp(wordHandle); - } - words.put(word, sp); - // we now have the unique handle of the word, put it into the sentence: - sentence.append(intString(wordHandle, numlength)); - } - } - // finnish last sentence - if (sentence.length() > 0) { - allsentencecounter++; - sentence.insert(0, "."); // append at beginning - if (sentences.containsKey(sentence)) { - sp = (statProp) sentences.get(sentence); - sp.inc(); - sentences.put(sentence, sp); - } else { - sentences.put(sentence, new statProp(sentenceHandleCount++)); - } - } - - //------------------- - - // we reconstruct the sentence hashtable - // and order the entries by the number of the sentence - // this structure is needed to replace double occurring words in sentences - Object[] orderedSentences = new Object[sentenceHandleCount]; - String[] s; - int wc; + words = new TreeMap(kelondroNaturalOrder.naturalOrder); + sentences = new HashMap(); + HashSet currsentwords = new HashSet(); + StringBuffer sentence = new StringBuffer(100); + String word = ""; + String k; + int wordlen; + statProp sp, sp1; + int wordHandle; + int wordHandleCount = 0; + int sentenceHandleCount = 0; + int allwordcounter = 0; + int allsentencecounter = 0; + int idx; + Iterator it, it1; + + // read source + sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize); + while (wordenum.hasMoreElements()) { + word = ((String) wordenum.nextElement()).toLowerCase(); + // System.out.println("PARSED-WORD " + word); + wordlen = word.length(); + if ((wordlen == 1) && (punctuation(word.charAt(0)))) { + // store sentence + if (sentence.length() > 0) { + // we store the punctuation symbol as first element of the sentence vector + allsentencecounter++; + sentence.insert(0, word); // append at beginning + if (sentences.containsKey(sentence)) { + // sentence already exists + sp = (statProp) sentences.get(sentence); + sp.inc(); + idx = sp.handle; + sentences.put(sentence, sp); + } else { + // create new sentence + idx = sentenceHandleCount++; + sentences.put(sentence, new statProp(idx)); + } + // store to the words a link to this sentence + it = currsentwords.iterator(); + while (it.hasNext()) { + k = (String) it.next(); + sp = (statProp) words.get(k); + sp.check(idx); + words.put(k, sp); + } + } + sentence = new StringBuffer(100); + currsentwords.clear(); + } else { + // store word + allwordcounter++; + currsentwords.add(word); + if (words.containsKey(word)) { + // word already exists + sp = (statProp) words.get(word); + wordHandle = sp.handle; + sp.inc(); + } else { + // word does not yet exist, create new word entry + wordHandle = wordHandleCount++; + sp = new statProp(wordHandle); + } + words.put(word, sp); + // we now have the unique handle of the word, put it into the sentence: + sentence.append(intString(wordHandle, numlength)); + } + } + // finnish last sentence + if (sentence.length() > 0) { + allsentencecounter++; + sentence.insert(0, "."); // append at beginning + if (sentences.containsKey(sentence)) { + sp = (statProp) sentences.get(sentence); + sp.inc(); + sentences.put(sentence, sp); + } else { + sentences.put(sentence, new statProp(sentenceHandleCount++)); + } + } + + // ------------------- + + // we reconstruct the sentence hashtable + // and order the entries by the number of the sentence + // this structure is needed to replace double occurring words in sentences + Object[] orderedSentences = new Object[sentenceHandleCount]; + String[] s; + int wc; Object o; - it = sentences.keySet().iterator(); + it = sentences.keySet().iterator(); 
while (it.hasNext()) { o = it.next(); if (o != null) { @@ -243,7 +253,7 @@ public final class plasmaCondenser { s = new String[wc + 2]; sp = (statProp) sentences.get(sentence); s[0] = intString(sp.count, numlength); // number of occurrences of this sentence - s[1] = sentence.substring(0,1); // the termination symbol of this sentence + s[1] = sentence.substring(0, 1); // the termination symbol of this sentence for (int i = 0; i < wc; i++) { k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); s[i + 2] = k; @@ -252,375 +262,392 @@ public final class plasmaCondenser { } } - Map.Entry entry; - // we search for similar words and reorganize the corresponding sentences - // a word is similar, if a shortened version is equal - it = words.entrySet().iterator(); // enumerates the keys in descending order - wordsearch: while (it.hasNext()) { - entry = (Map.Entry) it.next(); - word = (String) entry.getKey(); - wordlen = word.length(); - sp = (statProp) entry.getValue(); - for (int i = wordcut; i > 0; i--) { - if (wordlen > i) { - k = word.substring(0, wordlen - i); - if (words.containsKey(k)) { - // we will delete the word 'word' and repoint the corresponding links - // in sentences that use this word - sp1 = (statProp) words.get(k); - it1 = sp.hash.iterator(); // we iterate over all sentences that refer to this word - while (it1.hasNext()) { - idx = Integer.parseInt((String) it1.next()); // number of a sentence - s = (String[]) orderedSentences[idx]; - for (int j = 2; j < s.length; j++) { - if (s[j].equals(intString(sp.handle, numlength))) s[j] = intString(sp1.handle, numlength); - } - orderedSentences[idx] = s; - } - // update word counter - sp1.count = sp1.count + sp.count; - words.put(k, sp1); - // remove current word - it.remove(); - continue wordsearch; - } - } - } - } - - // depending on the orderedSentences structure, we rebuild the sentence HashMap to - // eliminate double occuring sentences - sentences = new HashMap(); + Map.Entry entry; + // we search for similar words and reorganize the corresponding sentences + // a word is similar, if a shortened version is equal + it = words.entrySet().iterator(); // enumerates the keys in descending order + wordsearch: while (it.hasNext()) { + entry = (Map.Entry) it.next(); + word = (String) entry.getKey(); + wordlen = word.length(); + sp = (statProp) entry.getValue(); + for (int i = wordcut; i > 0; i--) { + if (wordlen > i) { + k = word.substring(0, wordlen - i); + if (words.containsKey(k)) { + // we will delete the word 'word' and repoint the + // corresponding links + // in sentences that use this word + sp1 = (statProp) words.get(k); + it1 = sp.hash.iterator(); // we iterate over all sentences that refer to this word + while (it1.hasNext()) { + idx = Integer.parseInt((String) it1.next()); // number of a sentence + s = (String[]) orderedSentences[idx]; + for (int j = 2; j < s.length; j++) { + if (s[j].equals(intString(sp.handle, numlength))) + s[j] = intString(sp1.handle, numlength); + } + orderedSentences[idx] = s; + } + // update word counter + sp1.count = sp1.count + sp.count; + words.put(k, sp1); + // remove current word + it.remove(); + continue wordsearch; + } + } + } + } + + // depending on the orderedSentences structure, we rebuild the sentence + // HashMap to eliminate double occuring sentences + sentences = new HashMap(); int le; - for (int i = 0; i < orderedSentences.length; i++) { - le = ((String[]) orderedSentences[i]).length; - sentence = new StringBuffer(le * 10); - for (int j = 1; j < le; j++) 
sentence.append(((String[]) orderedSentences[i])[j]); + for (int i = 0; i < orderedSentences.length; i++) { + le = ((String[]) orderedSentences[i]).length; + sentence = new StringBuffer(le * 10); + for (int j = 1; j < le; j++) + sentence.append(((String[]) orderedSentences[i])[j]); if (sentences.containsKey(sentence)) { - // add sentence counter to counter of found sentence - sp = (statProp) sentences.get(sentence); - sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]); - sentences.put(sentence, sp); - //System.out.println("Found double occurring sentence " + i + " = " + sp.handle); - } else { - // create new sentence entry - sp = new statProp(i); - sp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]); - sentences.put(sentence, sp); - } - } - - // store result + // add sentence counter to counter of found sentence + sp = (statProp) sentences.get(sentence); + sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]); + sentences.put(sentence, sp); + // System.out.println("Found double occurring sentence " + i + " + // = " + sp.handle); + } else { + // create new sentence entry + sp = new statProp(i); + sp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]); + sentences.put(sentence, sp); + } + } + + // store result this.RESULT_NUMB_TEXT_BYTES = wordenum.count(); - this.RESULT_NUMB_WORDS = allwordcounter; - this.RESULT_DIFF_WORDS = wordHandleCount; - this.RESULT_SIMI_WORDS = words.size(); - this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter); - this.RESULT_NUMB_SENTENCES = allsentencecounter; - this.RESULT_DIFF_SENTENCES = sentenceHandleCount; - this.RESULT_SIMI_SENTENCES = sentences.size(); - this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size()); - this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16); + this.RESULT_NUMB_WORDS = allwordcounter; + this.RESULT_DIFF_WORDS = wordHandleCount; + this.RESULT_SIMI_WORDS = words.size(); + this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter); + this.RESULT_NUMB_SENTENCES = allsentencecounter; + this.RESULT_DIFF_SENTENCES = sentenceHandleCount; + this.RESULT_SIMI_SENTENCES = sentences.size(); + this.RESULT_AVERAGE_WORD_OCC = (words.size() == 0) ? 0 : (allwordcounter / words.size()); + this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16); } - public void print() { - String[] s = sentences(); + String[] s = sentences(); - // printout a reconstruction of the text - for (int i = 0; i < s.length; i++) { - if (s[i] != null) System.out.print("#T " + intString(i, numlength) + " " + s[i]); - } + // printout a reconstruction of the text + for (int i = 0; i < s.length; i++) { + if (s[i] != null) System.out.print("#T " + intString(i, numlength) + " " + s[i]); + } } public String[] sentences() { - // we reconstruct the word hashtable - // and order the entries by the number of the sentence - // this structure is only needed to reconstruct the text - String word; - statProp sp; - Map.Entry entry; - Iterator it; - String[] orderedWords = new String[words.size()+99]; // uuiiii, the '99' is only a quick hack... 
- it = words.entrySet().iterator(); // enumerates the keys in ascending order - while (it.hasNext()) { - entry = (Map.Entry) it.next(); - word = (String) entry.getKey(); - sp = (statProp) entry.getValue(); - orderedWords[sp.handle] = word; - } - - Object[] orderedSentences = makeOrderedSentences(); - - // create a reconstruction of the text + // we reconstruct the word hashtable + // and order the entries by the number of the sentence + // this structure is only needed to reconstruct the text + String word; + statProp sp; + Map.Entry entry; + Iterator it; + String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack... + it = words.entrySet().iterator(); // enumerates the keys in ascending order + while (it.hasNext()) { + entry = (Map.Entry) it.next(); + word = (String) entry.getKey(); + sp = (statProp) entry.getValue(); + orderedWords[sp.handle] = word; + } + + Object[] orderedSentences = makeOrderedSentences(); + + // create a reconstruction of the text String[] result = new String[orderedSentences.length]; String s; - for (int i = 0; i < orderedSentences.length; i++) { - if (orderedSentences[i] != null) { - s = ""; - for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) { - s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])]; - } - s += ((String[]) orderedSentences[i])[1]; + for (int i = 0; i < orderedSentences.length; i++) { + if (orderedSentences[i] != null) { + s = ""; + for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) { + s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])]; + } + s += ((String[]) orderedSentences[i])[1]; result[i] = (s.length() > 1) ? s.substring(1) : s; - } else { + } else { result[i] = ""; } - } + } return result; } private Object[] makeOrderedSentences() { - // we reconstruct the sentence hashtable again and create by-handle ordered entries - // this structure is needed to present the strings in the right order in a printout - int wc; - Iterator it; - statProp sp; - String[] s; - StringBuffer sentence; - Object[] orderedSentences = new Object[sentences.size()]; - for (int i = 0; i < sentences.size(); i++) orderedSentences[i] = null; // this array must be initialized - it = sentences.keySet().iterator(); - while (it.hasNext()) { - sentence = (StringBuffer) it.next(); - wc = (sentence.length() - 1) / numlength; - s = new String[wc + 2]; - sp = (statProp) sentences.get(sentence); - s[0] = intString(sp.count, numlength); // number of occurrences of this sentence - s[1] = sentence.substring(0,1); // the termination symbol of this sentence - for (int i = 0; i < wc; i++) s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); - orderedSentences[sp.handle] = s; - } - return orderedSentences; + // we reconstruct the sentence hashtable again and create by-handle ordered entries + // this structure is needed to present the strings in the right order in a printout + int wc; + Iterator it; + statProp sp; + String[] s; + StringBuffer sentence; + Object[] orderedSentences = new Object[sentences.size()]; + for (int i = 0; i < sentences.size(); i++) + orderedSentences[i] = null; // this array must be initialized + it = sentences.keySet().iterator(); + while (it.hasNext()) { + sentence = (StringBuffer) it.next(); + wc = (sentence.length() - 1) / numlength; + s = new String[wc + 2]; + sp = (statProp) sentences.get(sentence); + s[0] = intString(sp.count, numlength); // number of occurrences of this sentence + s[1] = sentence.substring(0, 1); // 
the termination symbol of this sentence + for (int i = 0; i < wc; i++) + s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); + orderedSentences[sp.handle] = s; + } + return orderedSentences; } public void writeMapToFile(File out) throws IOException { - Map.Entry entry; - String k; - String word; - Iterator it; - statProp sp; - - Object[] orderedSentences = makeOrderedSentences(); - - // we reconstruct the word hashtable - // and sort the entries by the number of occurrences - // this structure is needed to print out a sorted list of words - TreeMap sortedWords = new TreeMap(kelondroNaturalOrder.naturalOrder); - it = words.entrySet().iterator(); // enumerates the keys in ascending order - while (it.hasNext()) { - entry = (Map.Entry) it.next(); - word = (String) entry.getKey(); - sp = (statProp) entry.getValue(); - sortedWords.put(intString(sp.count, numlength) + intString(sp.handle, numlength), word); - } - - // start writing of words and sentences - FileWriter writer = new FileWriter(out); - writer.write("\r\n"); - it = sortedWords.entrySet().iterator(); // enumerates the keys in descending order - while (it.hasNext()) { - entry = (Map.Entry) it.next(); - k = (String) entry.getKey(); - writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + - ((String) entry.getValue()) + "\r\n"); - } - for (int i = 0; i < orderedSentences.length; i++) { - if (orderedSentences[i] != null) { - writer.write("#S " + intString(i, numlength) + " "); - for (int j = 0; j < ((String[]) orderedSentences[i]).length; j++) { - writer.write(((String[]) orderedSentences[i])[j] + " "); - } - writer.write("\r\n"); - } - } - writer.close(); + Map.Entry entry; + String k; + String word; + Iterator it; + statProp sp; + + Object[] orderedSentences = makeOrderedSentences(); + + // we reconstruct the word hashtable + // and sort the entries by the number of occurrences + // this structure is needed to print out a sorted list of words + TreeMap sortedWords = new TreeMap(kelondroNaturalOrder.naturalOrder); + it = words.entrySet().iterator(); // enumerates the keys in ascending order + while (it.hasNext()) { + entry = (Map.Entry) it.next(); + word = (String) entry.getKey(); + sp = (statProp) entry.getValue(); + sortedWords.put(intString(sp.count, numlength) + intString(sp.handle, numlength), word); + } + + // start writing of words and sentences + FileWriter writer = new FileWriter(out); + writer.write("\r\n"); + it = sortedWords.entrySet().iterator(); // enumerates the keys in descending order + while (it.hasNext()) { + entry = (Map.Entry) it.next(); + k = (String) entry.getKey(); + writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n"); + } + for (int i = 0; i < orderedSentences.length; i++) { + if (orderedSentences[i] != null) { + writer.write("#S " + intString(i, numlength) + " "); + for (int j = 0; j < ((String[]) orderedSentences[i]).length; j++) { + writer.write(((String[]) orderedSentences[i])[j] + " "); + } + writer.write("\r\n"); + } + } + writer.close(); } private static boolean punctuation(char c) { - return ("!?.".indexOf(c) >= 0); + return ("!?.".indexOf(c) >= 0); } public static boolean invisible(char c) { - if ((c < ' ') || (c > 'z')) return true; - return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0); + if ((c < ' ') || (c > 'z')) return true; + return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0); } - public static Enumeration wordTokenizer(String s, int minLength) { - try 
{ - return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength); - } catch (Exception e) { - return null; - } + try { + return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength); + } catch (Exception e) { + return null; + } } - public static class sievedWordsEnum implements Enumeration { - Object buffer = null; - unsievedWordsEnum e; - int ml; - - public sievedWordsEnum(InputStream is, int minLength) { - e = new unsievedWordsEnum(is); - buffer = nextElement0(); - ml = minLength; - } - - private Object nextElement0() { - String s, r; - char c; - loop: while (e.hasMoreElements()) { - s = (String) e.nextElement(); - r = s.toLowerCase(); - if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s; - if (s.length() < ml) continue loop; - for (int i = 0; i < r.length(); i++) { - c = r.charAt(i); - if (!(((c >= 'a') && (c <= 'z')) || - ((c >= '0') && (c <= '9')))) continue loop; // go to next while loop - } - return s; - } - return null; - } + // this enumeration removes all words that contain either wrong characters or are too short + + Object buffer = null; + unsievedWordsEnum e; + int ml; + + public sievedWordsEnum(InputStream is, int minLength) { + e = new unsievedWordsEnum(is); + buffer = nextElement0(); + ml = minLength; + } - - public boolean hasMoreElements() { - return buffer != null; - } - - public Object nextElement() { - Object r = buffer; buffer = nextElement0(); return r; - } + private Object nextElement0() { + String s; + char c; + loop: while (e.hasMoreElements()) { + s = (String) e.nextElement(); + if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s; + if (s.length() < ml) continue loop; + for (int i = 0; i < s.length(); i++) { + c = s.charAt(i); + if (((c < 'a') || (c > 'z')) && + ((c < 'A') || (c > 'Z')) && + ((c < '0') || (c > '9'))) + continue loop; // go to next while loop + } + return s; + } + return null; + } - public int count() { - return e.count(); - } + public boolean hasMoreElements() { + return buffer != null; + } + + public Object nextElement() { + Object r = buffer; + buffer = nextElement0(); + return r; + } + + public int count() { + return e.count(); + } } private static class unsievedWordsEnum implements Enumeration { - Object buffer = null; - linesFromFileEnum e; - String s; - - public unsievedWordsEnum(InputStream is) { - e = new linesFromFileEnum(is); - s = ""; - buffer = nextElement0(); - } - - private Object nextElement0() { - String r; - StringBuffer sb; - char c; - while (s.length() == 0) { - if (e.hasMoreElements()) { - r = (String) e.nextElement(); - if (r == null) return null; - r = r.trim(); - sb = new StringBuffer(r.length() * 2); - for (int i = 0; i < r.length(); i++) { - c = r.charAt(i); - if (invisible(c)) sb = sb.append(' '); - else if (punctuation(c)) sb = sb.append(' ').append(c).append(' '); - else sb = sb.append(c); - } - s = sb.toString().trim(); - //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); - } else { - return null; - } - } - int p = s.indexOf(" "); - if (p < 0) {r = s; s = ""; return r;} - r = s.substring(0, p); - s = s.substring(p + 1).trim(); - return r; - } - - public boolean hasMoreElements() { - return buffer != null; - } - - public Object nextElement() { - Object r = buffer; buffer = nextElement0(); return r; - } + + Object buffer = null; + linesFromFileEnum e; + String s; + + public unsievedWordsEnum(InputStream is) { + e = new linesFromFileEnum(is); + s = ""; + buffer = nextElement0(); + } + + private Object nextElement0() { + String r; + StringBuffer sb; + char 
c; + while (s.length() == 0) { + if (e.hasMoreElements()) { + r = (String) e.nextElement(); + if (r == null) return null; + r = r.trim(); + sb = new StringBuffer(r.length() * 2); + for (int i = 0; i < r.length(); i++) { + c = r.charAt(i); + if (invisible(c)) sb = sb.append(' '); + else if (punctuation(c)) sb = sb.append(' ').append(c).append(' '); + else sb = sb.append(c); + } + s = sb.toString().trim(); + //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); + } else { + return null; + } + } + int p = s.indexOf(" "); + if (p < 0) { + r = s; + s = ""; + return r; + } + r = s.substring(0, p); + s = s.substring(p + 1).trim(); + return r; + } + + public boolean hasMoreElements() { + return buffer != null; + } + + public Object nextElement() { + Object r = buffer; + buffer = nextElement0(); + return r; + } - public int count() { - return e.count(); - } + public int count() { + return e.count(); + } } private static class linesFromFileEnum implements Enumeration { - // read in lines from a given input stream - // every line starting with a '#' is treated as a comment. - - Object buffer = null; - BufferedReader raf; - int counter = 0; - - public linesFromFileEnum(InputStream is) { - raf = new BufferedReader(new InputStreamReader(is)); - buffer = nextElement0(); - counter = 0; - } - - private Object nextElement0() { - try { - String s; - while (true) { - s = raf.readLine(); - if (s == null) {raf.close(); return null;} - if (!(s.startsWith("#"))) return s; - } - } catch (IOException e) { - try {raf.close();} catch (Exception ee) {} - return null; - } - } - - public boolean hasMoreElements() { - return buffer != null; - } - - public Object nextElement() { - if (buffer == null) { - return null; - } else { - counter = counter + ((String) buffer).length() + 1; - Object r = buffer; - buffer = nextElement0(); - return r; - } - } - - public int count() { - return counter; - } + // read in lines from a given input stream + // every line starting with a '#' is treated as a comment. 
+ + Object buffer = null; + BufferedReader raf; + int counter = 0; + + public linesFromFileEnum(InputStream is) { + raf = new BufferedReader(new InputStreamReader(is)); + buffer = nextElement0(); + counter = 0; + } + + private Object nextElement0() { + try { + String s; + while (true) { + s = raf.readLine(); + if (s == null) { + raf.close(); + return null; + } + if (!(s.startsWith("#"))) return s; + } + } catch (IOException e) { + try { + raf.close(); + } catch (Exception ee) { + } + return null; + } + } + + public boolean hasMoreElements() { + return buffer != null; + } + + public Object nextElement() { + if (buffer == null) { + return null; + } else { + counter = counter + ((String) buffer).length() + 1; + Object r = buffer; + buffer = nextElement0(); + return r; + } + } + + public int count() { + return counter; + } } /* private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) { - // we store lines containing a key in search vector - int p; - String r; - s = " " + s.toLowerCase() + " "; - for (int i = 0; i < searchwords.length; i++) { - if (!(foundsearch.contains(searchwords[i]))) { - p = s.indexOf((String) searchwords[i]); - if (p >= 0) { - // we found one key in the result text - // prepare a line and put it to the property - r = s.substring(0, p) + "" + - s.substring(p, p + searchwords[i].length()) + "" + - s.substring(p + searchwords[i].length()); - prop.setProperty("key-" + searchwords[i], r); - // remember that we found this - foundsearch.add(searchwords[i]); - } - } - } + // we store lines containing a key in search vector + int p; + String r; + s = " " + s.toLowerCase() + " "; + for (int i = 0; i < searchwords.length; i++) { + if (!(foundsearch.contains(searchwords[i]))) { + p = s.indexOf((String) searchwords[i]); + if (p >= 0) { + // we found one key in the result text + // prepare a line and put it to the property + r = s.substring(0, p) + "" + s.substring(p, p + searchwords[i].length()) + "" + s.substring(p + searchwords[i].length()); + prop.setProperty("key-" + searchwords[i], r); + // remember that we found this + foundsearch.add(searchwords[i]); + } + } + } } */ @@ -632,43 +659,47 @@ public final class plasmaCondenser { } public static void main(String[] args) { - if ((args.length == 0) || (args.length > 3)) System.out.println("wrong number of arguments: plasmaCondenser -text|-html "); else try { - - plasmaCondenser pc = null; - - // read and analyse file - File file = new File(args[1]); - InputStream textStream = null; - if (args[0].equals("-text")) { - // read a text file - textStream = new FileInputStream(file); - } else if (args[0].equals("-html")) { - // read a html file - htmlFilterContentScraper cs = new htmlFilterContentScraper(new java.net.URL("http://localhost/")); - htmlFilterOutputStream fos = new htmlFilterOutputStream(null, cs, null, false); - FileInputStream fis = new FileInputStream(file); - byte[] buffer = new byte[512]; - int i; - while ((i = fis.read(buffer)) > 0) fos.write(buffer, 0, i); - fis.close(); - fos.close(); - //cs.print(); - //System.out.println("TEXT:" + new String(cs.getText())); - textStream = new ByteArrayInputStream(cs.getText()); - } else { - System.out.println("first argument must be either '-text' or '-html'"); - System.exit(-1); - } - // call condenser - pc = new plasmaCondenser(textStream, 1, 0); - textStream.close(); - // output result - pc.writeMapToFile(new File(args[2])); - pc.print(); - //System.out.println("ANALYSIS:" + pc.getAnalysis().toString()); - } catch (IOException e) { 
- System.out.println("Problem with input file: " + e.getMessage()); - } + if ((args.length == 0) || (args.length > 3)) + System.out.println("wrong number of arguments: plasmaCondenser -text|-html "); + else + try { + plasmaCondenser pc = null; + + // read and analyse file + File file = new File(args[1]); + InputStream textStream = null; + if (args[0].equals("-text")) { + // read a text file + textStream = new FileInputStream(file); + } else if (args[0].equals("-html")) { + // read a html file + htmlFilterContentScraper cs = new htmlFilterContentScraper(new java.net.URL("http://localhost/")); + htmlFilterOutputStream fos = new htmlFilterOutputStream(null, cs, null, false); + FileInputStream fis = new FileInputStream(file); + byte[] buffer = new byte[512]; + int i; + while ((i = fis.read(buffer)) > 0) fos.write(buffer, 0, i); + fis.close(); + fos.close(); + // cs.print(); + // System.out.println("TEXT:" + new String(cs.getText())); + textStream = new ByteArrayInputStream(cs.getText()); + } else { + System.out.println("first argument must be either '-text' or '-html'"); + System.exit(-1); + } + + // call condenser + pc = new plasmaCondenser(textStream, 1, 0); + textStream.close(); + + // output result + pc.writeMapToFile(new File(args[2])); + pc.print(); + //System.out.println("ANALYSIS:" + pc.getAnalysis().toString()); + } catch (IOException e) { + System.out.println("Problem with input file: " + e.getMessage()); + } } }
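Note (editorial, not part of the patch): this commit only reindents plasmaCondenser.java from tabs to spaces; the counting logic it touches is unchanged. For readers unfamiliar with that logic, the core of createCondensement() is a handle-and-counter scheme: every distinct word gets an incrementing integer handle plus an occurrence counter (statProp), and each word records the handles of the sentences it appeared in, which writeMapToFile() later dumps as "#W <handle> <count> <word>" lines. The stand-alone sketch below illustrates that scheme only; the class and field names (CondenserSketch, WordStat, handleCount) are invented for illustration, it uses generics instead of the raw collections in the original, and it is not code from the YaCy source tree.

    // Minimal sketch of the condenser's word bookkeeping (hypothetical names).
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    public class CondenserSketch {

        // roughly corresponds to plasmaCondenser.statProp
        static class WordStat {
            int count = 1;            // number of occurrences of the word
            final int handle;         // unique id, assigned in order of first appearance
            final Set<Integer> sentences = new HashSet<Integer>(); // handles of sentences containing the word

            WordStat(int handle) { this.handle = handle; }
        }

        public static void main(String[] args) {
            Map<String, WordStat> words = new HashMap<String, WordStat>();
            int handleCount = 0;

            // two toy "sentences", already tokenized
            String[][] text = {
                {"yacy", "is", "a", "search", "engine"},
                {"yacy", "indexes", "words", "and", "sentences"}
            };

            for (int sentenceHandle = 0; sentenceHandle < text.length; sentenceHandle++) {
                for (String token : text[sentenceHandle]) {
                    String word = token.toLowerCase();
                    WordStat ws = words.get(word);
                    if (ws == null) {
                        ws = new WordStat(handleCount++);   // new word: assign the next handle
                        words.put(word, ws);
                    } else {
                        ws.count++;                         // known word: just count the occurrence
                    }
                    ws.sentences.add(sentenceHandle);       // link the word to this sentence
                }
            }

            // print the statistics, analogous in spirit to the "#W <handle> <count> <word>"
            // lines emitted by writeMapToFile()
            for (Map.Entry<String, WordStat> e : words.entrySet()) {
                WordStat ws = e.getValue();
                System.out.println("#W " + ws.handle + " " + ws.count + " " + e.getKey()
                        + " in sentences " + ws.sentences);
            }
        }
    }

Running the sketch prints one line per distinct word with its handle, its total count, and the set of sentence handles it occurred in, which is the same information the condenser later uses to merge similar words and deduplicate sentences.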