// plasmaCondenser.java // ----------------------- // part of YaCy // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // last change: 09.01.2004 // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // // Using this software in any meaning (reading, learning, copying, compiling, // running) means that you agree that the Author(s) is (are) not responsible // for cost, loss of data or any harm that may be caused directly or indirectly // by usage of this softare or this documentation. The usage of this software // is on your own risk. The installation and usage (starting/running) of this // software may allow other people or application to access your computer and // any attached devices and is highly dependent on the configuration of the // software which must be done by the user of the software; the author(s) is // (are) also not responsible for proper configuration and usage of the // software, even if provoked by documentation provided together with // the software. // // Any changes to this file according to the GPL as documented in the file // gpl.txt aside this file in the shipment you received can be done to the // lines that follows this copyright notice here, but changes must not be // done inside the copyright notive above. A re-distribution must contain // the intact and unchanged copyright notice. // Contributions and changes to the program code must be marked as such. // compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java // execute with java -cp source de.anomic.plasma.plasmaCondenser package de.anomic.plasma; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.server.serverCodings; import de.anomic.yacy.yacySeedDB; public final class plasmaCondenser { // this is the page analysis class // category flags that show how the page can be distinguished in different interest groups public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of') public static final int flag_cat_opencontent = 1; // open source, any free stuff public static final int flag_cat_business = 2; // web shops, marketing, trade public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy public static final int flag_cat_health = 4; // health public static final int flag_cat_sport = 5; // any sport, cars etc. public static final int flag_cat_lifestyle = 6; // travel, lifestyle public static final int flag_cat_politics = 7; // politics public static final int flag_cat_news = 8; // blogs, news pages public static final int flag_cat_children = 9; // toys, childrens education, help for parents public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework public static final int flag_cat_computer = 12; // any computer related stuff, networks, operation systems public static final int flag_cat_p2p = 13; // p2p support, filesharing archives etc. public static final int flag_cat_sex = 14; // sexual content public static final int flag_cat_spam = 15; // pages that anybody would consider as not interesting public static final int flag_cat_linux = 16; // pages about linux software public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os public static final int flag_cat_windows = 18; // pages about windows os and softare public static final int flag_cat_osreserve = 19; // reserve private final static int numlength = 5; //private Properties analysis; private TreeMap words; // a string (the words) to (wordStatProp) - relation private HashMap sentences; private int wordminsize; private int wordcut; public int RESULT_NUMB_TEXT_BYTES = -1; public int RESULT_NUMB_WORDS = -1; public int RESULT_DIFF_WORDS = -1; public int RESULT_SIMI_WORDS = -1; public int RESULT_WORD_ENTROPHY = -1; public int RESULT_NUMB_SENTENCES = -1; public int RESULT_DIFF_SENTENCES = -1; public int RESULT_SIMI_SENTENCES = -1; public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4); public plasmaCondenser(InputStream text) { this(text, 3, 2); } public plasmaCondenser(InputStream text, int wordminsize, int wordcut) { this.wordminsize = wordminsize; this.wordcut = wordcut; // analysis = new Properties(); words = new TreeMap(); sentences = new HashMap(); createCondensement(text); } // create a word hash public static final String word2hash(String word) { return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength); } public static final Set words2hashSet(String[] words) { TreeSet hashes = new TreeSet(); for (int i = 0; i < words.length; i++) hashes.add(word2hash(words[i])); return hashes; } public static final String words2hashString(String[] words) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < words.length; i++) sb.append(word2hash(words[i])); return new String(sb); } public static final Set words2hashes(Set words) { Iterator i = words.iterator(); TreeSet hashes = new TreeSet(); while (i.hasNext()) hashes.add(word2hash((String) i.next())); return hashes; } public int excludeWords(TreeSet stopwords) { // subtracts the given stopwords from the word list // the word list shrinkes. This returns the number of shrinked words int oldsize = words.size(); words = kelondroMSetTools.excludeConstructive(words, stopwords); return oldsize - words.size(); } public Iterator words() { // returns an entry set iterator // key is a String (the word), value is a wordStatProp Object return words.entrySet().iterator(); } public static class wordStatProp { // object carries statistics for words and sentences public int count; // number of occurrences public int posInText; // unique handle, is initialized with word position (excluding double occurring words) public int posInPhrase; // public int numOfPhrase; public HashSet hash; // public wordStatProp(int handle, int pip, int nop) { this.count = 1; this.posInText = handle; this.posInPhrase = pip; this.numOfPhrase = nop; this.hash = new HashSet(); } public void inc() { count++; } public void check(int i) { hash.add(Integer.toString(i)); } } public static class phraseStatProp { // object carries statistics for words and sentences public int count; // number of occurrences public int handle; // unique handle, is initialized with sentence counter public HashSet hash; // public phraseStatProp(int handle) { this.count = 1; this.handle = handle; this.hash = new HashSet(); } public void inc() { count++; } public void check(int i) { hash.add(Integer.toString(i)); } } public String intString(int number, int length) { String s = Integer.toString(number); while (s.length() < length) s = "0" + s; return s; } private void createCondensement(InputStream is) { words = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/); sentences = new HashMap(); HashSet currsentwords = new HashSet(); StringBuffer sentence = new StringBuffer(100); String word = ""; String k; int wordlen; wordStatProp wsp, wsp1; phraseStatProp psp; int wordHandle; int wordHandleCount = 0; int sentenceHandleCount = 0; int allwordcounter = 0; int allsentencecounter = 0; int idx; int wordInSentenceCounter = 1; Iterator it, it1; boolean comb_indexof = false, comb_lastmodified = false, last_last = false, last_index = false; // read source sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize); while (wordenum.hasMoreElements()) { word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? //System.out.println("PARSED-WORD " + word); // distinguish punctuation and words wordlen = word.length(); if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) { // store sentence if (sentence.length() > 0) { // we store the punctuation symbol as first element of the sentence vector allsentencecounter++; sentence.insert(0, word); // append at beginning if (sentences.containsKey(sentence)) { // sentence already exists psp = (phraseStatProp) sentences.get(sentence); psp.inc(); idx = psp.handle; sentences.put(sentence, psp); } else { // create new sentence idx = sentenceHandleCount++; sentences.put(sentence, new phraseStatProp(idx)); } // store to the words a link to this sentence it = currsentwords.iterator(); while (it.hasNext()) { k = (String) it.next(); wsp = (wordStatProp) words.get(k); wsp.check(idx); words.put(k, wsp); } } sentence = new StringBuffer(100); currsentwords.clear(); wordInSentenceCounter = 1; } else { // check index.of detection if ((last_last) && (word.equals("modified"))) comb_lastmodified = true; if ((last_index) && (word.equals("of"))) comb_indexof = true; last_last = word.equals("last"); last_index = word.equals("index"); // store word allwordcounter++; currsentwords.add(word); if (words.containsKey(word)) { // word already exists wsp = (wordStatProp) words.get(word); wordHandle = wsp.posInText; wsp.inc(); } else { // word does not yet exist, create new word entry wordHandle = wordHandleCount++; wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1); } words.put(word, wsp); // we now have the unique handle of the word, put it into the sentence: sentence.append(intString(wordHandle, numlength)); wordInSentenceCounter++; } } // finnish last sentence if (sentence.length() > 0) { allsentencecounter++; sentence.insert(0, "."); // append at beginning if (sentences.containsKey(sentence)) { psp = (phraseStatProp) sentences.get(sentence); psp.inc(); sentences.put(sentence, psp); } else { sentences.put(sentence, new phraseStatProp(sentenceHandleCount++)); } } // ------------------- // we reconstruct the sentence hashtable // and order the entries by the number of the sentence // this structure is needed to replace double occurring words in sentences Object[] orderedSentences = new Object[sentenceHandleCount]; String[] s; int wc; Object o; it = sentences.keySet().iterator(); while (it.hasNext()) { o = it.next(); if (o != null) { sentence = (StringBuffer) o; wc = (sentence.length() - 1) / numlength; s = new String[wc + 2]; psp = (phraseStatProp) sentences.get(sentence); s[0] = intString(psp.count, numlength); // number of occurrences of this sentence s[1] = sentence.substring(0, 1); // the termination symbol of this sentence for (int i = 0; i < wc; i++) { k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); s[i + 2] = k; } orderedSentences[psp.handle] = s; } } Map.Entry entry; // we search for similar words and reorganize the corresponding sentences // a word is similar, if a shortened version is equal it = words.entrySet().iterator(); // enumerates the keys in descending order wordsearch: while (it.hasNext()) { entry = (Map.Entry) it.next(); word = (String) entry.getKey(); wordlen = word.length(); wsp = (wordStatProp) entry.getValue(); for (int i = wordcut; i > 0; i--) { if (wordlen > i) { k = word.substring(0, wordlen - i); if (words.containsKey(k)) { // we will delete the word 'word' and repoint the // corresponding links // in sentences that use this word wsp1 = (wordStatProp) words.get(k); it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word while (it1.hasNext()) { idx = Integer.parseInt((String) it1.next()); // number of a sentence s = (String[]) orderedSentences[idx]; for (int j = 2; j < s.length; j++) { if (s[j].equals(intString(wsp.posInText, numlength))) s[j] = intString(wsp1.posInText, numlength); } orderedSentences[idx] = s; } // update word counter wsp1.count = wsp1.count + wsp.count; words.put(k, wsp1); // remove current word it.remove(); continue wordsearch; } } } } // depending on the orderedSentences structure, we rebuild the sentence // HashMap to eliminate double occuring sentences sentences = new HashMap(); int le; for (int i = 0; i < orderedSentences.length; i++) { le = ((String[]) orderedSentences[i]).length; sentence = new StringBuffer(le * 10); for (int j = 1; j < le; j++) sentence.append(((String[]) orderedSentences[i])[j]); if (sentences.containsKey(sentence)) { // add sentence counter to counter of found sentence psp = (phraseStatProp) sentences.get(sentence); psp.count = psp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]); sentences.put(sentence, psp); // System.out.println("Found double occurring sentence " + i + " // = " + sp.handle); } else { // create new sentence entry psp = new phraseStatProp(i); psp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]); sentences.put(sentence, psp); } } // store result this.RESULT_NUMB_TEXT_BYTES = wordenum.count(); this.RESULT_NUMB_WORDS = allwordcounter; this.RESULT_DIFF_WORDS = wordHandleCount; this.RESULT_SIMI_WORDS = words.size(); this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter); this.RESULT_NUMB_SENTENCES = allsentencecounter; this.RESULT_DIFF_SENTENCES = sentenceHandleCount; this.RESULT_SIMI_SENTENCES = sentences.size(); this.RESULT_FLAGS.set(flag_cat_indexof, comb_indexof && comb_lastmodified); } public void print() { String[] s = sentences(); // printout a reconstruction of the text for (int i = 0; i < s.length; i++) { if (s[i] != null) System.out.print("#T " + intString(i, numlength) + " " + s[i]); } } public String[] sentences() { // we reconstruct the word hashtable // and order the entries by the number of the sentence // this structure is only needed to reconstruct the text String word; wordStatProp wsp; Map.Entry entry; Iterator it; String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack... it = words.entrySet().iterator(); // enumerates the keys in ascending order while (it.hasNext()) { entry = (Map.Entry) it.next(); word = (String) entry.getKey(); wsp = (wordStatProp) entry.getValue(); orderedWords[wsp.posInText] = word; } Object[] orderedSentences = makeOrderedSentences(); // create a reconstruction of the text String[] result = new String[orderedSentences.length]; String s; for (int i = 0; i < orderedSentences.length; i++) { if (orderedSentences[i] != null) { // TODO: bugfix for UTF-8: avoid this form of string concatenation s = ""; for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) { s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])]; } s += ((String[]) orderedSentences[i])[1]; result[i] = (s.length() > 1) ? s.substring(1) : s; } else { result[i] = ""; } } return result; } private Object[] makeOrderedSentences() { // we reconstruct the sentence hashtable again and create by-handle ordered entries // this structure is needed to present the strings in the right order in a printout int wc; Iterator it; phraseStatProp psp; String[] s; StringBuffer sentence; Object[] orderedSentences = new Object[sentences.size()]; for (int i = 0; i < sentences.size(); i++) orderedSentences[i] = null; // this array must be initialized it = sentences.keySet().iterator(); while (it.hasNext()) { sentence = (StringBuffer) it.next(); wc = (sentence.length() - 1) / numlength; s = new String[wc + 2]; psp = (phraseStatProp) sentences.get(sentence); s[0] = intString(psp.count, numlength); // number of occurrences of this sentence s[1] = sentence.substring(0, 1); // the termination symbol of this sentence for (int i = 0; i < wc; i++) s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); orderedSentences[psp.handle] = s; } return orderedSentences; } public void writeMapToFile(File out) throws IOException { Map.Entry entry; String k; String word; Iterator it; wordStatProp wsp; Object[] orderedSentences = makeOrderedSentences(); // we reconstruct the word hashtable // and sort the entries by the number of occurrences // this structure is needed to print out a sorted list of words TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/); it = words.entrySet().iterator(); // enumerates the keys in ascending order while (it.hasNext()) { entry = (Map.Entry) it.next(); word = (String) entry.getKey(); wsp = (wordStatProp) entry.getValue(); sortedWords.put(intString(wsp.count, numlength) + intString(wsp.posInText, numlength), word); } // start writing of words and sentences FileWriter writer = new FileWriter(out); writer.write("\r\n"); it = sortedWords.entrySet().iterator(); // enumerates the keys in descending order while (it.hasNext()) { entry = (Map.Entry) it.next(); k = (String) entry.getKey(); writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n"); } for (int i = 0; i < orderedSentences.length; i++) { if (orderedSentences[i] != null) { writer.write("#S " + intString(i, numlength) + " "); for (int j = 0; j < ((String[]) orderedSentences[i]).length; j++) { writer.write(((String[]) orderedSentences[i])[j] + " "); } writer.write("\r\n"); } } writer.close(); } public final static boolean invisible(char c) { // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars? if ((c < ' ') || (c > 'z')) return true; return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0); } public static Enumeration wordTokenizer(String s, int minLength) { try { // TODO: Bugfix for UTF-8 needed return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength); } catch (Exception e) { return null; } } public static class sievedWordsEnum implements Enumeration { // this enumeration removes all words that contain either wrong characters or are too short Object buffer = null; unsievedWordsEnum e; int ml; public sievedWordsEnum(InputStream is, int minLength) { e = new unsievedWordsEnum(is); buffer = nextElement0(); ml = minLength; } private Object nextElement0() { String s; char c; loop: while (e.hasMoreElements()) { s = (String) e.nextElement(); if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s; if ((s.length() < ml) && (!(s.equals("of")))) continue loop; for (int i = 0; i < s.length(); i++) { c = s.charAt(i); // TODO: Bugfix needed for UTF-8 if (((c < 'a') || (c > 'z')) && ((c < 'A') || (c > 'Z')) && ((c < '0') || (c > '9'))) continue loop; // go to next while loop } return s; } return null; } public boolean hasMoreElements() { return buffer != null; } public Object nextElement() { Object r = buffer; buffer = nextElement0(); return r; } public int count() { return e.count(); } } private static class unsievedWordsEnum implements Enumeration { Object buffer = null; linesFromFileEnum e; String s; public unsievedWordsEnum(InputStream is) { e = new linesFromFileEnum(is); s = ""; buffer = nextElement0(); } private Object nextElement0() { String r; StringBuffer sb; char c; while (s.length() == 0) { if (e.hasMoreElements()) { r = (String) e.nextElement(); if (r == null) return null; r = r.trim(); sb = new StringBuffer(r.length() * 2); for (int i = 0; i < r.length(); i++) { c = r.charAt(i); if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8 else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' '); else sb = sb.append(c); } s = sb.toString().trim(); //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); } else { return null; } } int p = s.indexOf(" "); if (p < 0) { r = s; s = ""; return r; } r = s.substring(0, p); s = s.substring(p + 1).trim(); return r; } public boolean hasMoreElements() { return buffer != null; } public Object nextElement() { Object r = buffer; buffer = nextElement0(); return r; } public int count() { return e.count(); } } private static class linesFromFileEnum implements Enumeration { // read in lines from a given input stream // every line starting with a '#' is treated as a comment. Object buffer = null; BufferedReader raf; int counter = 0; public linesFromFileEnum(InputStream is) { raf = new BufferedReader(new InputStreamReader(is)); // TODO: bugfix needed for UTF-8, use charset for reader buffer = nextElement0(); counter = 0; } private Object nextElement0() { try { String s; while (true) { s = raf.readLine(); if (s == null) { raf.close(); return null; } if (!(s.startsWith("#"))) return s; } } catch (IOException e) { try { raf.close(); } catch (Exception ee) { } return null; } } public boolean hasMoreElements() { return buffer != null; } public Object nextElement() { if (buffer == null) { return null; } else { counter = counter + ((String) buffer).length() + 1; Object r = buffer; buffer = nextElement0(); return r; } } public int count() { return counter; } } public static Enumeration sentencesFromInputStream(InputStream is, String charset) { try { return new sentencesFromInputStreamEnum(is, charset); } catch (UnsupportedEncodingException e) { return null; } } private static class sentencesFromInputStreamEnum implements Enumeration { // read sentences from a given input stream // this enumerates String objects Object buffer = null; BufferedReader raf; int counter = 0; public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException { raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset)); buffer = nextElement0(); counter = 0; } private Object nextElement0() { try { String s = readSentence(raf); //System.out.println(" SENTENCE='" + s + "'"); // DEBUG if (s == null) { raf.close(); return null; } return s; } catch (IOException e) { try { raf.close(); } catch (Exception ee) { } return null; } } public boolean hasMoreElements() { return buffer != null; } public Object nextElement() { if (buffer == null) { return null; } else { counter = counter + ((String) buffer).length() + 1; Object r = buffer; buffer = nextElement0(); return r; } } public int count() { return counter; } } static String readSentence(Reader reader) throws IOException { StringBuffer s = new StringBuffer(); int nextChar; char c; // find sentence end for (;;) { nextChar = reader.read(); //System.out.print((char) nextChar); // DEBUG if (nextChar < 0) { if (s.length() == 0) return null; else break; } c = (char) nextChar; s.append(c); if (htmlFilterContentScraper.punctuation(c)) break; } // replace line endings and tabs by blanks for (int i = 0; i < s.length(); i++) { if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' '); } // remove all double-spaces int p; while ((p = s.indexOf(" ")) >= 0) s.deleteCharAt(p); return new String(s); } public static Iterator getWords(InputStream input) { if (input == null) return null; plasmaCondenser condenser = new plasmaCondenser(input); return condenser.words(); } public static Iterator getWords(byte[] text) { if (text == null) return null; ByteArrayInputStream buffer = new ByteArrayInputStream(text); return getWords(buffer); } public static void main(String[] args) { // read a property file and converty them into configuration lines try { File f = new File(args[0]); Properties p = new Properties(); p.load(new FileInputStream(f)); StringBuffer sb = new StringBuffer(); sb.append("{\n"); for (int i = 0; i <= 15; i++) { sb.append('"'); String s = p.getProperty("keywords" + i); String[] l = s.split(","); for (int j = 0; j < l.length; j++) { sb.append(word2hash(l[j])); } if (i < 15) sb.append(",\n"); } sb.append("}\n"); System.out.println(new String(sb)); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }