From 9ecf7f0da20a605ffbb0a1890257d7313a886ebd Mon Sep 17 00:00:00 2001 From: theli Date: Thu, 14 Sep 2006 05:37:46 +0000 Subject: [PATCH] *) some TODO markers for UTF-8 problem git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2578 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaCondenser.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index aa82532b5..2da1d87cf 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -190,7 +190,7 @@ public final class plasmaCondenser { // read source sievedWordsEnum wordenum = new sievedWordsEnum(is, wordminsize); while (wordenum.hasMoreElements()) { - word = ((String) wordenum.nextElement()).toLowerCase(); + word = ((String) wordenum.nextElement()).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? // System.out.println("PARSED-WORD " + word); wordlen = word.length(); if ((wordlen == 1) && (punctuation(word.charAt(0)))) { @@ -389,6 +389,7 @@ public final class plasmaCondenser { String s; for (int i = 0; i < orderedSentences.length; i++) { if (orderedSentences[i] != null) { + // TODO: bugfix for UTF-8: avoid this form of string concatenation s = ""; for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) { s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])]; @@ -455,7 +456,7 @@ public final class plasmaCondenser { it = sortedWords.entrySet().iterator(); // enumerates the keys in descending order while (it.hasNext()) { entry = (Map.Entry) it.next(); - k = (String) entry.getKey(); + k = (String) entry.getKey(); writer.write("#W " + k.substring(numlength) + " " + k.substring(0, numlength) + " " + ((String) entry.getValue()) + "\r\n"); } for (int i = 0; i < orderedSentences.length; i++) { @@ -475,12 +476,14 @@ public final class plasmaCondenser { } public final
static boolean invisible(char c) { + // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars? if ((c < ' ') || (c > 'z')) return true; return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0); } public static Enumeration wordTokenizer(String s, int minLength) { try { + // TODO: Bugfix for UTF-8 needed return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), minLength); } catch (Exception e) { return null; @@ -509,6 +512,7 @@ public final class plasmaCondenser { if (s.length() < ml) continue loop; for (int i = 0; i < s.length(); i++) { c = s.charAt(i); + // TODO: Bugfix needed for UTF-8 if (((c < 'a') || (c > 'z')) && ((c < 'A') || (c > 'Z')) && ((c < '0') || (c > '9'))) @@ -558,11 +562,11 @@ public final class plasmaCondenser { sb = new StringBuffer(r.length() * 2); for (int i = 0; i < r.length(); i++) { c = r.charAt(i); - if (invisible(c)) sb = sb.append(' '); + if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8 else if (punctuation(c)) sb = sb.append(' ').append(c).append(' '); else sb = sb.append(c); } - s = sb.toString().trim(); + s = sb.toString().trim(); //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); } else { return null; @@ -603,7 +607,7 @@ public final class plasmaCondenser { int counter = 0; public linesFromFileEnum(InputStream is) { - raf = new BufferedReader(new InputStreamReader(is)); + raf = new BufferedReader(new InputStreamReader(is)); // TODO: bugfix needed for UTF-8, use charset for reader buffer = nextElement0(); counter = 0; }