yacy_search_server/source/de/anomic/plasma/plasmaCondenser.java

// plasmaCondenser.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last change: 09.01.2004
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

// compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java
// execute with java -cp source de.anomic.plasma.plasmaCondenser

package de.anomic.plasma;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.kelondro.kelondroMSetTools;

/**
 * Condenses a text into a word table and a sentence table and derives a few
 * statistical key figures from them.
 *
 * <p>The text is read line by line, split into tokens, and every token that
 * passes the sieve (see {@link sievedWordsEnum}) is stored lower-cased in a
 * word table together with a numeric handle and an occurrence counter. The
 * single-character tokens '!', '?' and '.' terminate a sentence. A sentence is
 * stored as a string key of the form
 * <code>&lt;terminator&gt;&lt;handle&gt;&lt;handle&gt;...</code> where every
 * handle is zero-padded to {@link #numlength} digits.</p>
 *
 * <p>After reading, words that equal another word when up to <code>wordcut</code>
 * trailing characters are removed are merged into that shorter word, and
 * sentences that became identical through this merge are collapsed.</p>
 *
 * <p>The code deliberately uses raw collection types and <code>Enumeration</code>
 * to stay source compatible with the rest of the file.</p>
 */
public class plasmaCondenser {

    /** Number of decimal digits used for one word handle inside a sentence key. */
    private final static int numlength = 5;

    /** Key figures computed by {@link #createCondensement(InputStream)}. */
    private Properties analysis;
    /** Maps a word (String) to its {@link statProp}. */
    private TreeMap words;
    /** Maps an encoded sentence key (String) to its {@link statProp}. */
    private HashMap sentences;
    /** Minimum token length handed to the word sieve. */
    private int wordminsize;
    /** Maximum number of trailing characters that may be cut when looking for a similar word. */
    private int wordcut;

    /**
     * Condenses the given text with a minimum word size of 3 and a word cut of 2.
     *
     * @param text the text to analyse
     * @throws IOException declared by the underlying token enumeration
     */
    public plasmaCondenser(InputStream text) throws IOException {
        this(text, 3, 2);
    }

    /**
     * Condenses the given text.
     *
     * @param text        the text to analyse
     * @param wordminsize minimum length of a token to be counted as a word
     * @param wordcut     maximum number of trailing characters removed when
     *                    searching for a shorter, similar word; 0 disables merging
     * @throws IOException declared by the underlying token enumeration
     */
    public plasmaCondenser(InputStream text, int wordminsize, int wordcut) throws IOException {
        this.wordminsize = wordminsize;
        this.wordcut = wordcut;
        this.analysis = new Properties();
        // the word and sentence tables are allocated by createCondensement
        createCondensement(text);
    }

    /**
     * @return the key figures of the analysed text; all values are hexadecimal strings
     */
    public Properties getAnalysis() {
        return analysis;
    }

    /**
     * Subtracts the given stop words from the word table.
     *
     * @param stopwords the words to remove
     * @return the number of words by which the word table shrank
     */
    public int excludeWords(TreeSet stopwords) {
        int oldsize = words.size();
        words = kelondroMSetTools.excludeConstructive(words, stopwords);
        return oldsize - words.size();
    }

    /**
     * @return the key set of the word table (a live view, not a copy)
     */
    public Set getWords() {
        return words.keySet();
    }

    /**
     * Returns the number of occurrences of one word.
     *
     * @param word the word to look up
     * @return the occurrence counter of the word, or 0 if it is not in the word table
     */
    public int wordCount(String word) {
        statProp sp = (statProp) words.get(word);
        return (sp == null) ? 0 : sp.count;
    }

    /**
     * Statistical properties of one word or one sentence.
     */
    public static class statProp {
        /** Number of occurrences. */
        public int count;
        /** Numeric handle assigned at creation time. */
        public int handle;
        /** Decimal string forms of the numbers passed to {@link #check(int)}. */
        public HashSet hash;

        /**
         * Creates an entry with an occurrence counter of 1.
         *
         * @param handle the handle of the new entry
         */
        public statProp(int handle) {
            this.count = 1;
            this.handle = handle;
            this.hash = new HashSet();
        }

        /** Increments the occurrence counter. */
        public void inc() {
            count++;
        }

        /**
         * Remembers the given number (stored as its decimal string).
         *
         * @param i the number to remember; used for sentence handles by the condenser
         */
        public void check(int i) {
            hash.add(Integer.toString(i));
        }
    }

    /**
     * Formats a number as a decimal string that is left-padded with '0' to the
     * given length. Longer numbers are returned unpadded and unshortened.
     *
     * @param number the number to format
     * @param length the minimum length of the result
     * @return the padded decimal string
     */
    public static String intString(int number, int length) {
        String digits = Integer.toString(number);
        if (digits.length() >= length) return digits;
        StringBuffer sb = new StringBuffer(length);
        for (int i = digits.length(); i < length; i++) sb.append('0');
        return sb.append(digits).toString();
    }

    /**
     * Reads the text, fills the word and sentence tables, merges similar words
     * and writes the key figures into {@link #analysis}.
     *
     * @param is the text to analyse
     * @throws IOException declared by the underlying token enumeration
     */
    private void createCondensement(InputStream is) throws IOException {
        words = new TreeMap(kelondroMSetTools.fastStringComparator);
        sentences = new HashMap();

        HashSet currsentwords = new HashSet();       // words of the sentence being read
        StringBuffer sentence = new StringBuffer();  // concatenated handles of the sentence being read
        int wordHandleCount = 0;
        int sentenceHandleCount = 0;
        int allwordcounter = 0;
        int allsentencecounter = 0;
        int idx;
        statProp sp;

        // read source
        sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
        while (wordenum.hasMoreElements()) {
            String word = ((String) wordenum.nextElement()).toLowerCase();
            if ((word.length() == 1) && punctuation(word.charAt(0))) {
                // a terminator: store the sentence, with the terminator as first character of the key
                if (sentence.length() > 0) {
                    allsentencecounter++;
                    idx = storeSentence(word + sentence.toString(), sentenceHandleCount);
                    if (idx == sentenceHandleCount) sentenceHandleCount++; // a new handle was consumed
                    linkWordsToSentence(currsentwords, idx);
                }
                sentence.setLength(0);
                currsentwords.clear();
            } else {
                // a word: count it and append its handle to the current sentence
                allwordcounter++;
                currsentwords.add(word);
                sp = (statProp) words.get(word);
                if (sp == null) {
                    sp = new statProp(wordHandleCount++);
                    words.put(word, sp);
                } else {
                    sp.inc();
                }
                sentence.append(intString(sp.handle, numlength));
            }
        }

        // finish the last sentence if the text did not end with a terminator
        if (sentence.length() > 0) {
            allsentencecounter++;
            idx = storeSentence("." + sentence.toString(), sentenceHandleCount);
            if (idx == sentenceHandleCount) sentenceHandleCount++;
            // the words of the last sentence must be linked as well, otherwise the
            // similar-word merge below leaves stale word handles in this sentence
            linkWordsToSentence(currsentwords, idx);
        }

        // at this point the sentence handles are dense (0 .. sentenceHandleCount - 1)
        String[][] orderedSentences = makeOrderedSentences();
        mergeSimilarWords(orderedSentences);
        rebuildSentences(orderedSentences);

        // create properties from the sentence structure and the word list
        int simiWords = words.size();
        long textSize = wordenum.count();
        analysis.setProperty("NUMB_TEXT_BYTES", Long.toHexString(textSize));
        analysis.setProperty("NUMB_WORDS", Long.toHexString(allwordcounter));
        analysis.setProperty("DIFF_WORDS", Long.toHexString(wordHandleCount));
        analysis.setProperty("SIMI_WORDS", Long.toHexString(simiWords));
        // the products below are computed as long: an int product overflows for large texts
        analysis.setProperty("WORD_ENTROPHY", Long.toHexString((allwordcounter == 0) ? 0 : (255L * simiWords / allwordcounter)));
        analysis.setProperty("NUMB_SENTENCES", Long.toHexString(allsentencecounter));
        analysis.setProperty("DIFF_SENTENCES", Long.toHexString(sentenceHandleCount));
        analysis.setProperty("SIMI_SENTENCES", Long.toHexString(sentences.size()));
        analysis.setProperty("AVERAGE_WORD_OCC", Long.toHexString((simiWords == 0) ? 0 : (allwordcounter / simiWords)));
        analysis.setProperty("INFORMATION_VALUE", Long.toHexString((allwordcounter == 0) ? 0 : (textSize * simiWords / allwordcounter / 16)));
    }

    /**
     * Counts one occurrence of a sentence in the sentence table.
     *
     * @param key        the encoded sentence
     * @param nextHandle the handle to assign if the sentence is new
     * @return the handle of the sentence; equals <code>nextHandle</code> exactly
     *         when a new entry was created
     */
    private int storeSentence(String key, int nextHandle) {
        statProp sp = (statProp) sentences.get(key);
        if (sp == null) {
            sentences.put(key, new statProp(nextHandle));
            return nextHandle;
        }
        sp.inc();
        return sp.handle;
    }

    /**
     * Records on every given word that the sentence with the given handle uses it.
     *
     * @param sentenceWords words (String) that are all present in the word table
     * @param sentenceHandle the handle of the sentence
     */
    private void linkWordsToSentence(Set sentenceWords, int sentenceHandle) {
        Iterator it = sentenceWords.iterator();
        while (it.hasNext()) {
            ((statProp) words.get(it.next())).check(sentenceHandle);
        }
    }

    /**
     * Removes every word for which a shorter word exists that equals the word
     * with 1 to <code>wordcut</code> trailing characters removed (the longest
     * cut is tried first). The counter of the removed word is added to the
     * shorter word and all sentences linked to the removed word are re-pointed.
     *
     * @param orderedSentences decoded sentences, indexed by sentence handle;
     *                         modified in place
     */
    private void mergeSimilarWords(String[][] orderedSentences) {
        Iterator it = words.entrySet().iterator();
        wordsearch: while (it.hasNext()) {
            Map.Entry entry = (Map.Entry) it.next();
            String word = (String) entry.getKey();
            statProp sp = (statProp) entry.getValue();
            int wordlen = word.length();
            for (int i = wordcut; i > 0; i--) {
                if (wordlen <= i) continue;
                statProp target = (statProp) words.get(word.substring(0, wordlen - i));
                if (target == null) continue;

                // re-point the word handle in all sentences that refer to this word
                String oldHandle = intString(sp.handle, numlength);
                String newHandle = intString(target.handle, numlength);
                Iterator refs = sp.hash.iterator();
                while (refs.hasNext()) {
                    String[] s = orderedSentences[Integer.parseInt((String) refs.next())];
                    for (int j = 2; j < s.length; j++) {
                        if (s[j].equals(oldHandle)) s[j] = newHandle;
                    }
                }
                // update word counter and remove the current word; removal goes through
                // the iterator because the map is being iterated
                target.count += sp.count;
                it.remove();
                continue wordsearch;
            }
        }
    }

    /**
     * Rebuilds the sentence table from decoded sentences so that sentences
     * which are identical are stored once with their counters added up. A
     * collapsed sentence keeps the lowest of the involved handles, so the
     * handles of the resulting table may have gaps.
     *
     * @param orderedSentences decoded sentences, indexed by sentence handle
     */
    private void rebuildSentences(String[][] orderedSentences) {
        sentences = new HashMap();
        for (int i = 0; i < orderedSentences.length; i++) {
            String[] s = orderedSentences[i];
            if (s == null) continue;
            StringBuffer keyBuffer = new StringBuffer();
            for (int j = 1; j < s.length; j++) keyBuffer.append(s[j]);
            String key = keyBuffer.toString();
            int occurrences = Integer.parseInt(s[0]);
            statProp sp = (statProp) sentences.get(key);
            if (sp == null) {
                sp = new statProp(i);
                sp.count = occurrences;
                sentences.put(key, sp);
            } else {
                sp.count += occurrences;
            }
        }
    }

    /**
     * Prints a reconstruction of the text to standard output, one line per
     * sentence: <code>#T &lt;handle&gt; &lt;count&gt;  &lt;word&gt; ...&lt;terminator&gt;</code>.
     * A handle that has no word in the word table is printed as
     * <code>null</code>.
     */
    public void reconstruct() {
        String[] orderedWords = makeOrderedWords();
        String[][] orderedSentences = makeOrderedSentences();

        for (int i = 0; i < orderedSentences.length; i++) {
            String[] s = orderedSentences[i];
            if (s == null) continue;
            System.out.print("#T " + intString(i, numlength) + " " + s[0] + " ");
            for (int j = 2; j < s.length; j++) {
                int handle = Integer.parseInt(s[j]);
                String word = ((handle >= 0) && (handle < orderedWords.length)) ? orderedWords[handle] : null;
                System.out.print(" " + word);
            }
            System.out.println(s[1]);
        }
    }

    /**
     * Builds an array of all words, indexed by word handle. The array is sized
     * by the largest handle in the word table, because handles are not dense
     * once words have been merged or excluded.
     *
     * @return the words by handle; unused handles hold <code>null</code>
     */
    private String[] makeOrderedWords() {
        int size = 0;
        Iterator it = words.values().iterator();
        while (it.hasNext()) {
            int handle = ((statProp) it.next()).handle;
            if (handle >= size) size = handle + 1;
        }
        String[] ordered = new String[size];
        it = words.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry entry = (Map.Entry) it.next();
            ordered[((statProp) entry.getValue()).handle] = (String) entry.getKey();
        }
        return ordered;
    }

    /**
     * Decodes all sentences of the sentence table into arrays, indexed by
     * sentence handle. The array is sized by the largest handle rather than by
     * the number of sentences, because handles have gaps after identical
     * sentences were collapsed.
     *
     * @return the decoded sentences (see {@link #decodeSentence(String, int)});
     *         unused handles hold <code>null</code>
     */
    private String[][] makeOrderedSentences() {
        int size = 0;
        Iterator it = sentences.values().iterator();
        while (it.hasNext()) {
            int handle = ((statProp) it.next()).handle;
            if (handle >= size) size = handle + 1;
        }
        String[][] ordered = new String[size][];
        it = sentences.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry entry = (Map.Entry) it.next();
            statProp sp = (statProp) entry.getValue();
            ordered[sp.handle] = decodeSentence((String) entry.getKey(), sp.count);
        }
        return ordered;
    }

    /**
     * Splits an encoded sentence key into its parts.
     *
     * @param key   the encoded sentence: one terminator character followed by
     *              word handles of {@link #numlength} digits each
     * @param count the occurrence counter of the sentence
     * @return an array holding the padded counter at index 0, the terminator
     *         at index 1 and the padded word handles from index 2 on
     */
    private static String[] decodeSentence(String key, int count) {
        int wc = (key.length() - 1) / numlength;
        String[] s = new String[wc + 2];
        s[0] = intString(count, numlength);
        s[1] = key.substring(0, 1);
        for (int i = 0; i < wc; i++) {
            int from = i * numlength + 1;
            s[i + 2] = key.substring(from, from + numlength);
        }
        return s;
    }

    /**
     * Writes the word table and the sentence table to a file. After an empty
     * first line, every word is written as
     * <code>#W &lt;handle&gt; &lt;count&gt; &lt;word&gt;</code>, ordered by the
     * padded count/handle key, followed by every sentence as
     * <code>#S &lt;handle&gt; &lt;count&gt; &lt;terminator&gt; &lt;word handle&gt; ...</code>.
     * Lines end with CR LF.
     *
     * @param out the file to write
     * @throws IOException if the file cannot be opened or written
     */
    public void writeMapToFile(File out) throws IOException {
        Iterator it;
        Map.Entry entry;

        String[][] orderedSentences = makeOrderedSentences();

        // re-key the words by padded count + padded handle to get a list sorted by occurrences
        TreeMap sortedWords = new TreeMap(kelondroMSetTools.fastStringComparator);
        it = words.entrySet().iterator();
        while (it.hasNext()) {
            entry = (Map.Entry) it.next();
            statProp sp = (statProp) entry.getValue();
            sortedWords.put(intString(sp.count, numlength) + intString(sp.handle, numlength), entry.getKey());
        }

        FileWriter writer = new FileWriter(out);
        try {
            writer.write("\r\n");
            it = sortedWords.entrySet().iterator();
            while (it.hasNext()) {
                entry = (Map.Entry) it.next();
                String k = (String) entry.getKey();
                writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " +
                             ((String) entry.getValue()) + "\r\n");
            }
            for (int i = 0; i < orderedSentences.length; i++) {
                String[] s = orderedSentences[i];
                if (s == null) continue;
                writer.write("#S " + intString(i, numlength) + " ");
                for (int j = 0; j < s.length; j++) {
                    writer.write(s[j] + " ");
                }
                writer.write("\r\n");
            }
        } finally {
            // release the file handle even if a write fails
            writer.close();
        }
    }

    /**
     * @param c the character to test
     * @return true if the character terminates a sentence ('!', '?' or '.')
     */
    private static boolean punctuation(char c) {
        return ("!?.".indexOf(c) >= 0);
    }

    /**
     * Tells whether a character is replaced by a blank before tokenizing.
     *
     * @param c the character to test
     * @return true for every character below ' ' or above 'z' and for the
     *         symbols listed in the method body
     */
    public static boolean invisible(char c) {
        if ((c < ' ') || (c > 'z')) return true;
        return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
    }

    /**
     * Creates a word enumeration with a minimum word length of 3 over a string.
     * The string is converted to bytes with the platform default charset.
     *
     * @param s the text to tokenize
     * @return an enumeration of the sieved tokens, or <code>null</code> if
     *         <code>s</code> is <code>null</code> or the enumeration cannot be created
     */
    public static Enumeration wordTokenizer(String s) {
        if (s == null) return null;
        try {
            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), 3);
        } catch (IOException e) {
            return null;
        }
    }

    /**
     * Enumerates the tokens of a text that pass the word sieve. A token passes
     * if it is a single sentence terminator, or if it has at least the minimum
     * length and consists only of the characters a-z, A-Z and 0-9. Tokens are
     * returned in their original case.
     */
    public static class sievedWordsEnum implements Enumeration {
        Object buffer = null;
        unsievedWordsEnum e;
        int ml;

        /**
         * @param is        the text to tokenize
         * @param minLength minimum length of a token to pass the sieve
         * @throws IOException declared by the underlying enumeration
         */
        public sievedWordsEnum(InputStream is, int minLength) throws IOException {
            e = new unsievedWordsEnum(is);
            // the minimum length must be set before the look-ahead buffer is primed,
            // otherwise the first token is sieved with a minimum length of 0
            ml = minLength;
            buffer = nextElement0();
        }

        /**
         * @return the next token that passes the sieve, or <code>null</code> at the end
         */
        private Object nextElement0() {
            while (e.hasMoreElements()) {
                String s = (String) e.nextElement();
                if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s;
                if ((s.length() >= ml) && alphanumeric(s.toLowerCase())) return s;
            }
            return null;
        }

        /**
         * @param r a lower-cased token
         * @return true if the token consists only of the characters a-z and 0-9
         */
        private static boolean alphanumeric(String r) {
            for (int i = 0; i < r.length(); i++) {
                char c = r.charAt(i);
                boolean letter = (c >= 'a') && (c <= 'z');
                boolean digit = (c >= '0') && (c <= '9');
                if (!letter && !digit) return false;
            }
            return true;
        }

        public boolean hasMoreElements() {
            return buffer != null;
        }

        /**
         * @return the next token (a String), or <code>null</code> if there is none
         */
        public Object nextElement() {
            Object r = buffer;
            buffer = nextElement0();
            return r;
        }

        /**
         * @return the counter of the underlying line enumeration, see
         *         {@link linesFromFileEnum#count()}
         */
        public int count() {
            return e.count();
        }
    }

    /**
     * Enumerates all blank-separated tokens of a text. Before splitting,
     * invisible characters are replaced by blanks and sentence terminators are
     * surrounded by blanks so that they become tokens of their own.
     */
    private static class unsievedWordsEnum implements Enumeration {
        Object buffer = null;
        linesFromFileEnum e;
        String s; // the not yet consumed rest of the current line

        public unsievedWordsEnum(InputStream is) throws IOException {
            e = new linesFromFileEnum(is);
            s = "";
            buffer = nextElement0();
        }

        /**
         * @return the next token, or <code>null</code> at the end of the text
         */
        private Object nextElement0() {
            // fetch lines until one contains at least one token
            while (s.length() == 0) {
                if (!e.hasMoreElements()) return null;
                String line = (String) e.nextElement();
                if (line == null) return null;
                s = separate(line.trim());
            }
            String r;
            int p = s.indexOf(' ');
            if (p < 0) {
                r = s;
                s = "";
            } else {
                r = s.substring(0, p);
                s = s.substring(p + 1).trim();
            }
            return r;
        }

        /**
         * @param line a line of text
         * @return the trimmed line with invisible characters blanked and
         *         sentence terminators surrounded by blanks
         */
        private static String separate(String line) {
            StringBuffer sb = new StringBuffer(line.length() * 2);
            for (int i = 0; i < line.length(); i++) {
                char c = line.charAt(i);
                if (invisible(c)) sb.append(' ');
                else if (punctuation(c)) sb.append(' ').append(c).append(' ');
                else sb.append(c);
            }
            return sb.toString().trim();
        }

        public boolean hasMoreElements() {
            return buffer != null;
        }

        public Object nextElement() {
            Object r = buffer;
            buffer = nextElement0();
            return r;
        }

        public int count() {
            return e.count();
        }
    }

    /**
     * Enumerates the lines of an input stream. Every line starting with a '#'
     * is treated as a comment and skipped. The stream is decoded with the
     * platform default charset and closed when its end is reached. A read
     * error ends the enumeration as if the end of the stream had been reached.
     */
    private static class linesFromFileEnum implements Enumeration {
        Object buffer = null;
        BufferedReader raf;
        int counter = 0;

        public linesFromFileEnum(InputStream is) throws IOException {
            raf = new BufferedReader(new InputStreamReader(is));
            buffer = nextElement0();
            counter = 0;
        }

        /**
         * @return the next line that is not a comment, or <code>null</code> at
         *         the end of the stream or after a read error
         */
        private Object nextElement0() {
            try {
                String line;
                while ((line = raf.readLine()) != null) {
                    if (!line.startsWith("#")) return line;
                }
                raf.close();
                return null;
            } catch (IOException e) {
                // best effort: a failing close after a failed read cannot be handled here
                try {raf.close();} catch (Exception ignored) {}
                return null;
            }
        }

        public boolean hasMoreElements() {
            return buffer != null;
        }

        public Object nextElement() {
            if (buffer == null) return null;
            Object r = buffer;
            counter = counter + ((String) r).length() + 1;
            buffer = nextElement0();
            return r;
        }

        /**
         * @return the sum of (line length in characters + 1) over all lines
         *         handed out so far; skipped comment lines are not counted
         */
        public int count() {
            return counter;
        }
    }

    /**
     * Looks for the search words in a line and stores, for every search word
     * found for the first time, the lower-cased line (padded with one blank on
     * each side) with the hit wrapped in <code>&lt;B&gt;</code> tags under the
     * property <code>key-&lt;searchword&gt;</code>.
     *
     * @param prop        receives the highlighted lines
     * @param s           the line to search
     * @param searchwords the words to search for
     * @param foundsearch search words that were already found; updated in place
     */
    private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) {
        s = " " + s.toLowerCase() + " ";
        for (int i = 0; i < searchwords.length; i++) {
            String key = searchwords[i];
            if (foundsearch.contains(key)) continue;
            int p = s.indexOf(key);
            if (p < 0) continue;
            int end = p + key.length();
            String r = s.substring(0, p) + "<B>" + s.substring(p, end) + "</B>" + s.substring(end);
            prop.setProperty("key-" + key, r);
            // remember that we found this
            foundsearch.add(key);
        }
    }

    /**
     * Condenses a text with the default settings and returns its words.
     *
     * @param text the text to analyse
     * @return the words of the text, or <code>null</code> if <code>text</code>
     *         is <code>null</code> or the text cannot be read
     */
    public static Set getWords(byte[] text) {
        if (text == null) return null;
        try {
            return new plasmaCondenser(new ByteArrayInputStream(text)).getWords();
        } catch (IOException e) {
            return null;
        }
    }

    /**
     * Command line entry point:
     * <code>plasmaCondenser -text|-html &lt;infile&gt; &lt;outfile&gt;</code>.
     * Analyses the input file with a minimum word size of 1 and without word
     * merging, writes the word and sentence tables to the output file and
     * prints the reconstructed text and the key figures.
     *
     * @param args mode, input file and output file
     */
    public static void main(String[] args) {
        // all three arguments are read below, so exactly three are required
        if (args.length != 3) {
            System.out.println("wrong number of arguments: plasmaCondenser -text|-html <infile> <outfile>");
            return;
        }
        try {
            // read and analyse file
            File file = new File(args[1]);
            InputStream textStream;
            if (args[0].equals("-text")) {
                // read a text file
                textStream = new FileInputStream(file);
            } else if (args[0].equals("-html")) {
                // read a html file: pipe it through the html filter and analyse the scraped text
                htmlFilterContentScraper cs = new htmlFilterContentScraper(new java.net.URL("http://localhost/"));
                htmlFilterOutputStream fos = new htmlFilterOutputStream(null, cs, null, false);
                FileInputStream fis = new FileInputStream(file);
                try {
                    byte[] buffer = new byte[512];
                    int i;
                    while ((i = fis.read(buffer)) > 0) fos.write(buffer, 0, i);
                } finally {
                    fis.close();
                }
                fos.close();
                textStream = new ByteArrayInputStream(cs.getText());
            } else {
                System.out.println("first argument must be either '-text' or '-html'");
                System.exit(-1);
                return; // not reached; tells the compiler that textStream is assigned below
            }
            // call condenser
            plasmaCondenser pc;
            try {
                pc = new plasmaCondenser(textStream, 1, 0);
            } finally {
                textStream.close();
            }
            // output result
            pc.writeMapToFile(new File(args[2]));
            pc.reconstruct();
            System.out.println("ANALYSIS:" + pc.getAnalysis().toString());
        } catch (IOException e) {
            System.out.println("Problem with input file: " + e.getMessage());
        }
    }

}