|
|
|
@ -117,21 +117,31 @@ public final class plasmaCondenser {
|
|
|
|
|
// number of occurrences of one word
|
|
|
|
|
// if the word did not occur, this simply returns 0
|
|
|
|
|
statProp sp = (statProp) words.get(word);
|
|
|
|
|
if (sp == null) return 0;
|
|
|
|
|
if (sp == null)
|
|
|
|
|
return 0;
|
|
|
|
|
return sp.count;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static class statProp {
|
|
|
|
|
public int count;
|
|
|
|
|
|
|
|
|
|
public int handle;
|
|
|
|
|
|
|
|
|
|
public HashSet hash;
|
|
|
|
|
|
|
|
|
|
public statProp(int handle) {
|
|
|
|
|
this.count = 1;
|
|
|
|
|
this.handle = handle;
|
|
|
|
|
this.hash = new HashSet();
|
|
|
|
|
}
|
|
|
|
|
public void inc() {count++;}
|
|
|
|
|
public void check(int i) {hash.add(Integer.toString(i));}
|
|
|
|
|
|
|
|
|
|
public void inc() {
|
|
|
|
|
count++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void check(int i) {
|
|
|
|
|
hash.add(Integer.toString(i));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -163,7 +173,7 @@ public final class plasmaCondenser {
|
|
|
|
|
sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize);
|
|
|
|
|
while (wordenum.hasMoreElements()) {
|
|
|
|
|
word = ((String) wordenum.nextElement()).toLowerCase();
|
|
|
|
|
//System.out.println("PARSED-WORD " + word);
|
|
|
|
|
// System.out.println("PARSED-WORD " + word);
|
|
|
|
|
wordlen = word.length();
|
|
|
|
|
if ((wordlen == 1) && (punctuation(word.charAt(0)))) {
|
|
|
|
|
// store sentence
|
|
|
|
@ -188,7 +198,7 @@ public final class plasmaCondenser {
|
|
|
|
|
k = (String) it.next();
|
|
|
|
|
sp = (statProp) words.get(k);
|
|
|
|
|
sp.check(idx);
|
|
|
|
|
words.put(k,sp);
|
|
|
|
|
words.put(k, sp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
sentence = new StringBuffer(100);
|
|
|
|
@ -225,7 +235,7 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
//-------------------
|
|
|
|
|
// -------------------
|
|
|
|
|
|
|
|
|
|
// we reconstruct the sentence hashtable
|
|
|
|
|
// and order the entries by the number of the sentence
|
|
|
|
@ -243,7 +253,7 @@ public final class plasmaCondenser {
|
|
|
|
|
s = new String[wc + 2];
|
|
|
|
|
sp = (statProp) sentences.get(sentence);
|
|
|
|
|
s[0] = intString(sp.count, numlength); // number of occurrences of this sentence
|
|
|
|
|
s[1] = sentence.substring(0,1); // the termination symbol of this sentence
|
|
|
|
|
s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
|
|
|
|
|
for (int i = 0; i < wc; i++) {
|
|
|
|
|
k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
|
|
|
|
|
s[i + 2] = k;
|
|
|
|
@ -265,7 +275,8 @@ public final class plasmaCondenser {
|
|
|
|
|
if (wordlen > i) {
|
|
|
|
|
k = word.substring(0, wordlen - i);
|
|
|
|
|
if (words.containsKey(k)) {
|
|
|
|
|
// we will delete the word 'word' and repoint the corresponding links
|
|
|
|
|
// we will delete the word 'word' and repoint the
|
|
|
|
|
// corresponding links
|
|
|
|
|
// in sentences that use this word
|
|
|
|
|
sp1 = (statProp) words.get(k);
|
|
|
|
|
it1 = sp.hash.iterator(); // we iterate over all sentences that refer to this word
|
|
|
|
@ -273,7 +284,8 @@ public final class plasmaCondenser {
|
|
|
|
|
idx = Integer.parseInt((String) it1.next()); // number of a sentence
|
|
|
|
|
s = (String[]) orderedSentences[idx];
|
|
|
|
|
for (int j = 2; j < s.length; j++) {
|
|
|
|
|
if (s[j].equals(intString(sp.handle, numlength))) s[j] = intString(sp1.handle, numlength);
|
|
|
|
|
if (s[j].equals(intString(sp.handle, numlength)))
|
|
|
|
|
s[j] = intString(sp1.handle, numlength);
|
|
|
|
|
}
|
|
|
|
|
orderedSentences[idx] = s;
|
|
|
|
|
}
|
|
|
|
@ -288,20 +300,22 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// depending on the orderedSentences structure, we rebuild the sentence HashMap to
|
|
|
|
|
// eliminate doubly occurring sentences
|
|
|
|
|
// depending on the orderedSentences structure, we rebuild the sentence
|
|
|
|
|
// HashMap to eliminate doubly occurring sentences
|
|
|
|
|
sentences = new HashMap();
|
|
|
|
|
int le;
|
|
|
|
|
for (int i = 0; i < orderedSentences.length; i++) {
|
|
|
|
|
le = ((String[]) orderedSentences[i]).length;
|
|
|
|
|
sentence = new StringBuffer(le * 10);
|
|
|
|
|
for (int j = 1; j < le; j++) sentence.append(((String[]) orderedSentences[i])[j]);
|
|
|
|
|
for (int j = 1; j < le; j++)
|
|
|
|
|
sentence.append(((String[]) orderedSentences[i])[j]);
|
|
|
|
|
if (sentences.containsKey(sentence)) {
|
|
|
|
|
// add sentence counter to counter of found sentence
|
|
|
|
|
sp = (statProp) sentences.get(sentence);
|
|
|
|
|
sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
|
|
|
|
|
sentences.put(sentence, sp);
|
|
|
|
|
//System.out.println("Found double occurring sentence " + i + " = " + sp.handle);
|
|
|
|
|
// System.out.println("Found double occurring sentence " + i + "
|
|
|
|
|
// = " + sp.handle);
|
|
|
|
|
} else {
|
|
|
|
|
// create new sentence entry
|
|
|
|
|
sp = new statProp(i);
|
|
|
|
@ -323,7 +337,6 @@ public final class plasmaCondenser {
|
|
|
|
|
this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public void print() {
|
|
|
|
|
String[] s = sentences();
|
|
|
|
|
|
|
|
|
@ -341,7 +354,7 @@ public final class plasmaCondenser {
|
|
|
|
|
statProp sp;
|
|
|
|
|
Map.Entry entry;
|
|
|
|
|
Iterator it;
|
|
|
|
|
String[] orderedWords = new String[words.size()+99]; // uuiiii, the '99' is only a quick hack...
|
|
|
|
|
String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack...
|
|
|
|
|
it = words.entrySet().iterator(); // enumerates the keys in ascending order
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
entry = (Map.Entry) it.next();
|
|
|
|
@ -379,7 +392,8 @@ public final class plasmaCondenser {
|
|
|
|
|
String[] s;
|
|
|
|
|
StringBuffer sentence;
|
|
|
|
|
Object[] orderedSentences = new Object[sentences.size()];
|
|
|
|
|
for (int i = 0; i < sentences.size(); i++) orderedSentences[i] = null; // this array must be initialized
|
|
|
|
|
for (int i = 0; i < sentences.size(); i++)
|
|
|
|
|
orderedSentences[i] = null; // this array must be initialized
|
|
|
|
|
it = sentences.keySet().iterator();
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
sentence = (StringBuffer) it.next();
|
|
|
|
@ -387,8 +401,9 @@ public final class plasmaCondenser {
|
|
|
|
|
s = new String[wc + 2];
|
|
|
|
|
sp = (statProp) sentences.get(sentence);
|
|
|
|
|
s[0] = intString(sp.count, numlength); // number of occurrences of this sentence
|
|
|
|
|
s[1] = sentence.substring(0,1); // the termination symbol of this sentence
|
|
|
|
|
for (int i = 0; i < wc; i++) s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
|
|
|
|
|
s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
|
|
|
|
|
for (int i = 0; i < wc; i++)
|
|
|
|
|
s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
|
|
|
|
|
orderedSentences[sp.handle] = s;
|
|
|
|
|
}
|
|
|
|
|
return orderedSentences;
|
|
|
|
@ -422,8 +437,7 @@ public final class plasmaCondenser {
|
|
|
|
|
while (it.hasNext()) {
|
|
|
|
|
entry = (Map.Entry) it.next();
|
|
|
|
|
k = (String) entry.getKey();
|
|
|
|
|
writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " +
|
|
|
|
|
((String) entry.getValue()) + "\r\n");
|
|
|
|
|
writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n");
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < orderedSentences.length; i++) {
|
|
|
|
|
if (orderedSentences[i] != null) {
|
|
|
|
@ -446,7 +460,6 @@ public final class plasmaCondenser {
|
|
|
|
|
return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static Enumeration wordTokenizer(String s, int minLength) {
|
|
|
|
|
try {
|
|
|
|
|
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength);
|
|
|
|
@ -455,8 +468,9 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static class sievedWordsEnum implements Enumeration {
|
|
|
|
|
// this enumeration removes all words that contain either wrong characters or are too short
|
|
|
|
|
|
|
|
|
|
Object buffer = null;
|
|
|
|
|
unsievedWordsEnum e;
|
|
|
|
|
int ml;
|
|
|
|
@ -468,30 +482,32 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private Object nextElement0() {
|
|
|
|
|
String s, r;
|
|
|
|
|
String s;
|
|
|
|
|
char c;
|
|
|
|
|
loop: while (e.hasMoreElements()) {
|
|
|
|
|
s = (String) e.nextElement();
|
|
|
|
|
r = s.toLowerCase();
|
|
|
|
|
if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s;
|
|
|
|
|
if (s.length() < ml) continue loop;
|
|
|
|
|
for (int i = 0; i < r.length(); i++) {
|
|
|
|
|
c = r.charAt(i);
|
|
|
|
|
if (!(((c >= 'a') && (c <= 'z')) ||
|
|
|
|
|
((c >= '0') && (c <= '9')))) continue loop; // go to next while loop
|
|
|
|
|
for (int i = 0; i < s.length(); i++) {
|
|
|
|
|
c = s.charAt(i);
|
|
|
|
|
if (((c < 'a') || (c > 'z')) &&
|
|
|
|
|
((c < 'A') || (c > 'Z')) &&
|
|
|
|
|
((c < '0') || (c > '9')))
|
|
|
|
|
continue loop; // go to next while loop
|
|
|
|
|
}
|
|
|
|
|
return s;
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public boolean hasMoreElements() {
|
|
|
|
|
return buffer != null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Object nextElement() {
|
|
|
|
|
Object r = buffer; buffer = nextElement0(); return r;
|
|
|
|
|
Object r = buffer;
|
|
|
|
|
buffer = nextElement0();
|
|
|
|
|
return r;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int count() {
|
|
|
|
@ -500,6 +516,7 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static class unsievedWordsEnum implements Enumeration {
|
|
|
|
|
|
|
|
|
|
Object buffer = null;
|
|
|
|
|
linesFromFileEnum e;
|
|
|
|
|
String s;
|
|
|
|
@ -533,7 +550,11 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
int p = s.indexOf(" ");
|
|
|
|
|
if (p < 0) {r = s; s = ""; return r;}
|
|
|
|
|
if (p < 0) {
|
|
|
|
|
r = s;
|
|
|
|
|
s = "";
|
|
|
|
|
return r;
|
|
|
|
|
}
|
|
|
|
|
r = s.substring(0, p);
|
|
|
|
|
s = s.substring(p + 1).trim();
|
|
|
|
|
return r;
|
|
|
|
@ -544,7 +565,9 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Object nextElement() {
|
|
|
|
|
Object r = buffer; buffer = nextElement0(); return r;
|
|
|
|
|
Object r = buffer;
|
|
|
|
|
buffer = nextElement0();
|
|
|
|
|
return r;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public int count() {
|
|
|
|
@ -571,11 +594,17 @@ public final class plasmaCondenser {
|
|
|
|
|
String s;
|
|
|
|
|
while (true) {
|
|
|
|
|
s = raf.readLine();
|
|
|
|
|
if (s == null) {raf.close(); return null;}
|
|
|
|
|
if (s == null) {
|
|
|
|
|
raf.close();
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
if (!(s.startsWith("#"))) return s;
|
|
|
|
|
}
|
|
|
|
|
} catch (IOException e) {
|
|
|
|
|
try {raf.close();} catch (Exception ee) {}
|
|
|
|
|
try {
|
|
|
|
|
raf.close();
|
|
|
|
|
} catch (Exception ee) {
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -612,9 +641,7 @@ public final class plasmaCondenser {
|
|
|
|
|
if (p >= 0) {
|
|
|
|
|
// we found one key in the result text
|
|
|
|
|
// prepare a line and put it to the property
|
|
|
|
|
r = s.substring(0, p) + "<B>" +
|
|
|
|
|
s.substring(p, p + searchwords[i].length()) + "</B>" +
|
|
|
|
|
s.substring(p + searchwords[i].length());
|
|
|
|
|
r = s.substring(0, p) + "<B>" + s.substring(p, p + searchwords[i].length()) + "</B>" + s.substring(p + searchwords[i].length());
|
|
|
|
|
prop.setProperty("key-" + searchwords[i], r);
|
|
|
|
|
// remember that we found this
|
|
|
|
|
foundsearch.add(searchwords[i]);
|
|
|
|
@ -632,8 +659,10 @@ public final class plasmaCondenser {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
|
|
if ((args.length == 0) || (args.length > 3)) System.out.println("wrong number of arguments: plasmaCondenser -text|-html <infile> <outfile>"); else try {
|
|
|
|
|
|
|
|
|
|
if ((args.length == 0) || (args.length > 3))
|
|
|
|
|
System.out.println("wrong number of arguments: plasmaCondenser -text|-html <infile> <outfile>");
|
|
|
|
|
else
|
|
|
|
|
try {
|
|
|
|
|
plasmaCondenser pc = null;
|
|
|
|
|
|
|
|
|
|
// read and analyse file
|
|
|
|
@ -652,16 +681,18 @@ public final class plasmaCondenser {
|
|
|
|
|
while ((i = fis.read(buffer)) > 0) fos.write(buffer, 0, i);
|
|
|
|
|
fis.close();
|
|
|
|
|
fos.close();
|
|
|
|
|
//cs.print();
|
|
|
|
|
//System.out.println("TEXT:" + new String(cs.getText()));
|
|
|
|
|
// cs.print();
|
|
|
|
|
// System.out.println("TEXT:" + new String(cs.getText()));
|
|
|
|
|
textStream = new ByteArrayInputStream(cs.getText());
|
|
|
|
|
} else {
|
|
|
|
|
System.out.println("first argument must be either '-text' or '-html'");
|
|
|
|
|
System.exit(-1);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// call condenser
|
|
|
|
|
pc = new plasmaCondenser(textStream, 1, 0);
|
|
|
|
|
textStream.close();
|
|
|
|
|
|
|
|
|
|
// output result
|
|
|
|
|
pc.writeMapToFile(new File(args[2]));
|
|
|
|
|
pc.print();
|
|
|
|
|