From dd8441f102238eb6cc37d278d4cde61dd6590907 Mon Sep 17 00:00:00 2001 From: f1ori Date: Tue, 21 Oct 2008 20:19:10 +0000 Subject: [PATCH] fix bug: data from plasmaParser is already converted to UTF-8 After removing the restrictions in the code, YaCy should be able to index Unicode characters! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5290 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/index/indexWord.java | 3 ++- source/de/anomic/plasma/plasmaCondenser.java | 23 ++++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/source/de/anomic/index/indexWord.java b/source/de/anomic/index/indexWord.java index f598c016c..082981003 100644 --- a/source/de/anomic/index/indexWord.java +++ b/source/de/anomic/index/indexWord.java @@ -28,6 +28,7 @@ package de.anomic.index; import java.util.HashSet; import java.util.Iterator; +import java.util.Locale; import java.util.Set; import java.util.TreeSet; @@ -77,7 +78,7 @@ public class indexWord { // create a word hash public static final String word2hash(final String word) { - return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength); + return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength); } public static final Set words2hashSet(final String[] words) { diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index b31ba6373..3183575d1 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -40,6 +40,7 @@ import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.TreeMap; @@ -234,13 +235,13 @@ public final class plasmaCondenser { indexWord wprop; sievedWordsEnum 
wordenum; try { - wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8"); + wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes())); } catch (final UnsupportedEncodingException e) { return; } int pip = 0; while (wordenum.hasMoreElements()) { - word = (new String(wordenum.nextElement())).toLowerCase(); + word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH); if (useForLanguageIdentification) languageIdentificator.add(word); if (word.length() < 3) continue; wprop = words.get(word); @@ -321,7 +322,7 @@ public final class plasmaCondenser { // read source final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset); while (wordenum.hasMoreElements()) { - word = (new String(wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? + word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars? if (languageIdentificator != null) languageIdentificator.add(word); if (word.length() < wordminsize) continue; //System.out.println("PARSED-WORD " + word); @@ -509,8 +510,8 @@ public final class plasmaCondenser { StringBuffer buffer = null; unsievedWordsEnum e; - public sievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException { - e = new unsievedWordsEnum(is, charset); + public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException { + e = new unsievedWordsEnum(is); buffer = nextElement0(); } @@ -555,8 +556,8 @@ public final class plasmaCondenser { sentencesFromInputStreamEnum e; StringBuffer s; - public unsievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException { - e = new sentencesFromInputStreamEnum(is, charset); + public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException { + e = new sentencesFromInputStreamEnum(is); s = new StringBuffer(20); buffer = nextElement0(); } @@ -616,9 +617,9 
@@ public final class plasmaCondenser { return sb; } - public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is, final String charset) { + public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is) { try { - return new sentencesFromInputStreamEnum(is, charset); + return new sentencesFromInputStreamEnum(is); } catch (final UnsupportedEncodingException e) { return null; } @@ -633,8 +634,8 @@ public final class plasmaCondenser { int counter = 0; boolean pre = false; - public sentencesFromInputStreamEnum(final InputStream is, final String charset) throws UnsupportedEncodingException { - raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset)); + public sentencesFromInputStreamEnum(final InputStream is) throws UnsupportedEncodingException { + raf = new BufferedReader(new InputStreamReader(is, "UTF-8")); buffer = nextElement0(); counter = 0; pre = false;