fix bug: data from plasmaParser is already converted to UTF-8

After removing the restrictions in the code, YaCy should be able to index Unicode characters! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5290 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · dd8441f102
parent 47f0c3b002
commit dd8441f102
2 changed files with 14 additions and 12 deletions
--- a/source/de/anomic/index/indexWord.java
+++ b/source/de/anomic/index/indexWord.java
@@ -28,6 +28,7 @@ package de.anomic.index;

 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Locale;
 import java.util.Set;
 import java.util.TreeSet;

@@ -77,7 +78,7 @@ public class indexWord {

    // create a word hash
    public static final String word2hash(final String word) {
-        return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength);
+        return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength);
    }
    
    public static final Set<String> words2hashSet(final String[] words) {
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -40,6 +40,7 @@ import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.TreeMap;
@@ -234,13 +235,13 @@ public final class plasmaCondenser {
        indexWord wprop;
        sievedWordsEnum wordenum;
        try {
-            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8");
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()));
        } catch (final UnsupportedEncodingException e) {
            return;
        }
        int pip = 0;
        while (wordenum.hasMoreElements()) {
-            word = (new String(wordenum.nextElement())).toLowerCase();
+            word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH);
            if (useForLanguageIdentification) languageIdentificator.add(word);
            if (word.length() < 3) continue;
            wprop = words.get(word);
@@ -321,7 +322,7 @@ public final class plasmaCondenser {
        // read source
        final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset);
        while (wordenum.hasMoreElements()) {
-            word = (new String(wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
+            word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
            if (languageIdentificator != null) languageIdentificator.add(word);
            if (word.length() < wordminsize) continue;
            //System.out.println("PARSED-WORD " + word);
@@ -509,8 +510,8 @@ public final class plasmaCondenser {
        StringBuffer buffer = null;
        unsievedWordsEnum e;

-        public sievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
-            e = new unsievedWordsEnum(is, charset);
+        public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
+            e = new unsievedWordsEnum(is);
            buffer = nextElement0();
        }

@@ -555,8 +556,8 @@ public final class plasmaCondenser {
        sentencesFromInputStreamEnum e;
        StringBuffer s;

-        public unsievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
-            e = new sentencesFromInputStreamEnum(is, charset);
+        public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
+            e = new sentencesFromInputStreamEnum(is);
            s = new StringBuffer(20);
            buffer = nextElement0();
        }
@@ -616,9 +617,9 @@ public final class plasmaCondenser {
        return sb;
    }
    
-    public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is, final String charset) {
+    public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is) {
        try {
-            return new sentencesFromInputStreamEnum(is, charset);
+            return new sentencesFromInputStreamEnum(is);
        } catch (final UnsupportedEncodingException e) {
            return null;
        }
@@ -633,8 +634,8 @@ public final class plasmaCondenser {
        int counter = 0;
        boolean pre = false;

-        public sentencesFromInputStreamEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
-            raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
+        public sentencesFromInputStreamEnum(final InputStream is) throws UnsupportedEncodingException {
+            raf = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            buffer = nextElement0();
            counter = 0;
            pre = false;