Fixed a worst-case situation in the condenser that could cause a temporary full CPU load because of poor data structure usage

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6372 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent f1bde59c50
commit ea427df944

@ -35,6 +35,7 @@ import java.io.InputStream;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -453,13 +454,13 @@ public final class Condenser {
public final static boolean invisible(final char c) { public final static boolean invisible(final char c) {
final int type = Character.getType(c); final int type = Character.getType(c);
if ( if (
(type == Character.LOWERCASE_LETTER) type == Character.LOWERCASE_LETTER
|| (type == Character.DECIMAL_DIGIT_NUMBER) || type == Character.DECIMAL_DIGIT_NUMBER
|| (type == Character.UPPERCASE_LETTER) || type == Character.UPPERCASE_LETTER
|| (type == Character.MODIFIER_LETTER) || type == Character.MODIFIER_LETTER
|| (type == Character.OTHER_LETTER) || type == Character.OTHER_LETTER
|| (type == Character.TITLECASE_LETTER) || type == Character.TITLECASE_LETTER
|| (ContentScraper.punctuation(c))) { || ContentScraper.punctuation(c)) {
return false; return false;
} }
return true; return true;
@ -529,18 +530,17 @@ public final class Condenser {
} }
/*
private static class unsievedWordsEnum implements Enumeration<StringBuilder> { private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects // returns an enumeration of StringBuilder Objects
StringBuilder buffer = null; StringBuilder buffer = null;
sentencesFromInputStreamEnum e; sentencesFromInputStreamEnum e;
StringBuilder s; ArrayList<StringBuilder> s;
int off; int sIndex;
public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException { public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is); e = new sentencesFromInputStreamEnum(is);
s = new StringBuilder(0); s = new ArrayList<StringBuilder>();
off = 0; sIndex = 0;
buffer = nextElement0(); buffer = nextElement0();
} }
@ -552,97 +552,31 @@ public final class Condenser {
StringBuilder r; StringBuilder r;
StringBuilder sb; StringBuilder sb;
char c; char c;
while (s.length() - off <= 0) { if (sIndex >= s.size()) {
if (e.hasNext()) { sIndex = 0;
r = e.next(); s.clear();
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(r.length() * 2);
for (int i = 0; i < r.length(); i++) {
c = r.charAt(i);
if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
else sb = sb.append(c);
}
s = trim(sb);
off = 0;
//System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
} else {
return null;
}
}
final int p = s.indexOf(" ", off);
if (p < 0) {
r = new StringBuilder(s.substring(off));
s = new StringBuilder(0);
off = 0;
return r;
} }
r = trim(new StringBuilder(s.substring(off, p))); while (s.size() == 0) {
off = p + 1; if (!e.hasNext()) return null;
while (off < s.length() && s.charAt(off) <= ' ') off++;
return r;
}
public boolean hasMoreElements() {
return buffer != null;
}
public StringBuilder nextElement() {
final StringBuilder r = buffer;
buffer = nextElement0();
return r;
}
}
*/
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
StringBuilder buffer = null;
sentencesFromInputStreamEnum e;
StringBuilder s;
public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is);
s = new StringBuilder(20);
buffer = nextElement0();
}
public void pre(final boolean x) {
e.pre(x);
}
private StringBuilder nextElement0() {
StringBuilder r;
StringBuilder sb;
char c;
while (s.length() == 0) {
if (e.hasNext()) {
r = e.next(); r = e.next();
if (r == null) return null; if (r == null) return null;
r = trim(r); r = trim(r);
sb = new StringBuilder(r.length() * 2); sb = new StringBuilder(20);
for (int i = 0; i < r.length(); i++) { for (int i = 0; i < r.length(); i++) {
c = r.charAt(i); c = r.charAt(i);
if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8 if (invisible(c)) {
else if (ContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' '); if (sb.length() > 0) {s.add(sb); sb = new StringBuilder(20);}
else sb = sb.append(c); } else if (ContentScraper.punctuation(c)) {
} if (sb.length() > 0) {s.add(sb); sb = new StringBuilder(1);}
s = trim(sb); sb.append(c);
//System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); s.add(sb);
sb = new StringBuilder(20);
} else { } else {
return null; sb = sb.append(c);
} }
} }
final int p = s.indexOf(" ");
if (p < 0) {
r = s;
s = new StringBuilder();
return r;
} }
r = trim(new StringBuilder(s.substring(0, p))); r = s.get(sIndex++);
s = trim(s.delete(0, p + 1));
return r; return r;
} }

Loading…
Cancel
Save