removed tabs from condenser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1376 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 37791fd529
commit f1cfee7703

@ -117,21 +117,31 @@ public final class plasmaCondenser {
// number of occurrences of one word // number of occurrences of one word
// if the word did not occur, this simply returns 0 // if the word did not occur, this simply returns 0
statProp sp = (statProp) words.get(word); statProp sp = (statProp) words.get(word);
if (sp == null) return 0; if (sp == null)
return 0;
return sp.count; return sp.count;
} }
public static class statProp { public static class statProp {
public int count; public int count;
public int handle; public int handle;
public HashSet hash; public HashSet hash;
public statProp(int handle) { public statProp(int handle) {
this.count = 1; this.count = 1;
this.handle = handle; this.handle = handle;
this.hash = new HashSet(); this.hash = new HashSet();
} }
public void inc() {count++;}
public void check(int i) {hash.add(Integer.toString(i));} public void inc() {
count++;
}
public void check(int i) {
hash.add(Integer.toString(i));
}
} }
@ -265,7 +275,8 @@ public final class plasmaCondenser {
if (wordlen > i) { if (wordlen > i) {
k = word.substring(0, wordlen - i); k = word.substring(0, wordlen - i);
if (words.containsKey(k)) { if (words.containsKey(k)) {
// we will delete the word 'word' and repoint the corresponding links // we will delete the word 'word' and repoint the
// corresponding links
// in sentences that use this word // in sentences that use this word
sp1 = (statProp) words.get(k); sp1 = (statProp) words.get(k);
it1 = sp.hash.iterator(); // we iterate over all sentences that refer to this word it1 = sp.hash.iterator(); // we iterate over all sentences that refer to this word
@ -273,7 +284,8 @@ public final class plasmaCondenser {
idx = Integer.parseInt((String) it1.next()); // number of a sentence idx = Integer.parseInt((String) it1.next()); // number of a sentence
s = (String[]) orderedSentences[idx]; s = (String[]) orderedSentences[idx];
for (int j = 2; j < s.length; j++) { for (int j = 2; j < s.length; j++) {
if (s[j].equals(intString(sp.handle, numlength))) s[j] = intString(sp1.handle, numlength); if (s[j].equals(intString(sp.handle, numlength)))
s[j] = intString(sp1.handle, numlength);
} }
orderedSentences[idx] = s; orderedSentences[idx] = s;
} }
@ -288,20 +300,22 @@ public final class plasmaCondenser {
} }
} }
// depending on the orderedSentences structure, we rebuild the sentence HashMap to // depending on the orderedSentences structure, we rebuild the sentence
// eliminate double occurring sentences // HashMap to eliminate double occurring sentences
sentences = new HashMap(); sentences = new HashMap();
int le; int le;
for (int i = 0; i < orderedSentences.length; i++) { for (int i = 0; i < orderedSentences.length; i++) {
le = ((String[]) orderedSentences[i]).length; le = ((String[]) orderedSentences[i]).length;
sentence = new StringBuffer(le * 10); sentence = new StringBuffer(le * 10);
for (int j = 1; j < le; j++) sentence.append(((String[]) orderedSentences[i])[j]); for (int j = 1; j < le; j++)
sentence.append(((String[]) orderedSentences[i])[j]);
if (sentences.containsKey(sentence)) { if (sentences.containsKey(sentence)) {
// add sentence counter to counter of found sentence // add sentence counter to counter of found sentence
sp = (statProp) sentences.get(sentence); sp = (statProp) sentences.get(sentence);
sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]); sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
sentences.put(sentence, sp); sentences.put(sentence, sp);
//System.out.println("Found double occurring sentence " + i + " = " + sp.handle); // System.out.println("Found double occurring sentence " + i + "
// = " + sp.handle);
} else { } else {
// create new sentence entry // create new sentence entry
sp = new statProp(i); sp = new statProp(i);
@ -323,7 +337,6 @@ public final class plasmaCondenser {
this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16); this.RESULT_INFORMATION_VALUE = (allwordcounter == 0) ? 0 : (wordenum.count() * words.size() / allwordcounter / 16);
} }
public void print() { public void print() {
String[] s = sentences(); String[] s = sentences();
@ -379,7 +392,8 @@ public final class plasmaCondenser {
String[] s; String[] s;
StringBuffer sentence; StringBuffer sentence;
Object[] orderedSentences = new Object[sentences.size()]; Object[] orderedSentences = new Object[sentences.size()];
for (int i = 0; i < sentences.size(); i++) orderedSentences[i] = null; // this array must be initialized for (int i = 0; i < sentences.size(); i++)
orderedSentences[i] = null; // this array must be initialized
it = sentences.keySet().iterator(); it = sentences.keySet().iterator();
while (it.hasNext()) { while (it.hasNext()) {
sentence = (StringBuffer) it.next(); sentence = (StringBuffer) it.next();
@ -388,7 +402,8 @@ public final class plasmaCondenser {
sp = (statProp) sentences.get(sentence); sp = (statProp) sentences.get(sentence);
s[0] = intString(sp.count, numlength); // number of occurrences of this sentence s[0] = intString(sp.count, numlength); // number of occurrences of this sentence
s[1] = sentence.substring(0, 1); // the termination symbol of this sentence s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
for (int i = 0; i < wc; i++) s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1); for (int i = 0; i < wc; i++)
s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
orderedSentences[sp.handle] = s; orderedSentences[sp.handle] = s;
} }
return orderedSentences; return orderedSentences;
@ -422,8 +437,7 @@ public final class plasmaCondenser {
while (it.hasNext()) { while (it.hasNext()) {
entry = (Map.Entry) it.next(); entry = (Map.Entry) it.next();
k = (String) entry.getKey(); k = (String) entry.getKey();
writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n");
((String) entry.getValue()) + "\r\n");
} }
for (int i = 0; i < orderedSentences.length; i++) { for (int i = 0; i < orderedSentences.length; i++) {
if (orderedSentences[i] != null) { if (orderedSentences[i] != null) {
@ -446,7 +460,6 @@ public final class plasmaCondenser {
return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0); return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
} }
public static Enumeration wordTokenizer(String s, int minLength) { public static Enumeration wordTokenizer(String s, int minLength) {
try { try {
return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength); return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength);
@ -455,8 +468,9 @@ public final class plasmaCondenser {
} }
} }
public static class sievedWordsEnum implements Enumeration { public static class sievedWordsEnum implements Enumeration {
// this enumeration removes all words that either contain wrong characters or are too short
Object buffer = null; Object buffer = null;
unsievedWordsEnum e; unsievedWordsEnum e;
int ml; int ml;
@ -468,30 +482,32 @@ public final class plasmaCondenser {
} }
private Object nextElement0() { private Object nextElement0() {
String s, r; String s;
char c; char c;
loop: while (e.hasMoreElements()) { loop: while (e.hasMoreElements()) {
s = (String) e.nextElement(); s = (String) e.nextElement();
r = s.toLowerCase();
if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s; if ((s.length() == 1) && (punctuation(s.charAt(0)))) return s;
if (s.length() < ml) continue loop; if (s.length() < ml) continue loop;
for (int i = 0; i < r.length(); i++) { for (int i = 0; i < s.length(); i++) {
c = r.charAt(i); c = s.charAt(i);
if (!(((c >= 'a') && (c <= 'z')) || if (((c < 'a') || (c > 'z')) &&
((c >= '0') && (c <= '9')))) continue loop; // go to next while loop ((c < 'A') || (c > 'Z')) &&
((c < '0') || (c > '9')))
continue loop; // go to next while loop
} }
return s; return s;
} }
return null; return null;
} }
public boolean hasMoreElements() { public boolean hasMoreElements() {
return buffer != null; return buffer != null;
} }
public Object nextElement() { public Object nextElement() {
Object r = buffer; buffer = nextElement0(); return r; Object r = buffer;
buffer = nextElement0();
return r;
} }
public int count() { public int count() {
@ -500,6 +516,7 @@ public final class plasmaCondenser {
} }
private static class unsievedWordsEnum implements Enumeration { private static class unsievedWordsEnum implements Enumeration {
Object buffer = null; Object buffer = null;
linesFromFileEnum e; linesFromFileEnum e;
String s; String s;
@ -533,7 +550,11 @@ public final class plasmaCondenser {
} }
} }
int p = s.indexOf(" "); int p = s.indexOf(" ");
if (p < 0) {r = s; s = ""; return r;} if (p < 0) {
r = s;
s = "";
return r;
}
r = s.substring(0, p); r = s.substring(0, p);
s = s.substring(p + 1).trim(); s = s.substring(p + 1).trim();
return r; return r;
@ -544,7 +565,9 @@ public final class plasmaCondenser {
} }
public Object nextElement() { public Object nextElement() {
Object r = buffer; buffer = nextElement0(); return r; Object r = buffer;
buffer = nextElement0();
return r;
} }
public int count() { public int count() {
@ -571,11 +594,17 @@ public final class plasmaCondenser {
String s; String s;
while (true) { while (true) {
s = raf.readLine(); s = raf.readLine();
if (s == null) {raf.close(); return null;} if (s == null) {
raf.close();
return null;
}
if (!(s.startsWith("#"))) return s; if (!(s.startsWith("#"))) return s;
} }
} catch (IOException e) { } catch (IOException e) {
try {raf.close();} catch (Exception ee) {} try {
raf.close();
} catch (Exception ee) {
}
return null; return null;
} }
} }
@ -612,9 +641,7 @@ public final class plasmaCondenser {
if (p >= 0) { if (p >= 0) {
// we found one key in the result text // we found one key in the result text
// prepare a line and put it to the property // prepare a line and put it to the property
r = s.substring(0, p) + "<B>" + r = s.substring(0, p) + "<B>" + s.substring(p, p + searchwords[i].length()) + "</B>" + s.substring(p + searchwords[i].length());
s.substring(p, p + searchwords[i].length()) + "</B>" +
s.substring(p + searchwords[i].length());
prop.setProperty("key-" + searchwords[i], r); prop.setProperty("key-" + searchwords[i], r);
// remember that we found this // remember that we found this
foundsearch.add(searchwords[i]); foundsearch.add(searchwords[i]);
@ -632,8 +659,10 @@ public final class plasmaCondenser {
} }
public static void main(String[] args) { public static void main(String[] args) {
if ((args.length == 0) || (args.length > 3)) System.out.println("wrong number of arguments: plasmaCondenser -text|-html <infile> <outfile>"); else try { if ((args.length == 0) || (args.length > 3))
System.out.println("wrong number of arguments: plasmaCondenser -text|-html <infile> <outfile>");
else
try {
plasmaCondenser pc = null; plasmaCondenser pc = null;
// read and analyse file // read and analyse file
@ -659,9 +688,11 @@ public final class plasmaCondenser {
System.out.println("first argument must be either '-text' or '-html'"); System.out.println("first argument must be either '-text' or '-html'");
System.exit(-1); System.exit(-1);
} }
// call condenser // call condenser
pc = new plasmaCondenser(textStream, 1, 0); pc = new plasmaCondenser(textStream, 1, 0);
textStream.close(); textStream.close();
// output result // output result
pc.writeMapToFile(new File(args[2])); pc.writeMapToFile(new File(args[2]));
pc.print(); pc.print();

Loading…
Cancel
Save