fix bug: data from plasmaParser is already converted to UTF-8

After removing the restrictions in the code, YaCy should be able to index Unicode characters!


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5290 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 16 years ago
parent 47f0c3b002
commit dd8441f102

@ -28,6 +28,7 @@ package de.anomic.index;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
@ -77,7 +78,7 @@ public class indexWord {
// create a word hash
public static final String word2hash(final String word) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength);
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength);
}
public static final Set<String> words2hashSet(final String[] words) {

@ -40,6 +40,7 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
@ -234,13 +235,13 @@ public final class plasmaCondenser {
indexWord wprop;
sievedWordsEnum wordenum;
try {
wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8");
wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()));
} catch (final UnsupportedEncodingException e) {
return;
}
int pip = 0;
while (wordenum.hasMoreElements()) {
word = (new String(wordenum.nextElement())).toLowerCase();
word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH);
if (useForLanguageIdentification) languageIdentificator.add(word);
if (word.length() < 3) continue;
wprop = words.get(word);
@ -321,7 +322,7 @@ public final class plasmaCondenser {
// read source
final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset);
while (wordenum.hasMoreElements()) {
word = (new String(wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
if (languageIdentificator != null) languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
//System.out.println("PARSED-WORD " + word);
@ -509,8 +510,8 @@ public final class plasmaCondenser {
StringBuffer buffer = null;
unsievedWordsEnum e;
public sievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
e = new unsievedWordsEnum(is, charset);
public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new unsievedWordsEnum(is);
buffer = nextElement0();
}
@ -555,8 +556,8 @@ public final class plasmaCondenser {
sentencesFromInputStreamEnum e;
StringBuffer s;
public unsievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is, charset);
public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is);
s = new StringBuffer(20);
buffer = nextElement0();
}
@ -616,9 +617,9 @@ public final class plasmaCondenser {
return sb;
}
public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is, final String charset) {
public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is) {
try {
return new sentencesFromInputStreamEnum(is, charset);
return new sentencesFromInputStreamEnum(is);
} catch (final UnsupportedEncodingException e) {
return null;
}
@ -633,8 +634,8 @@ public final class plasmaCondenser {
int counter = 0;
boolean pre = false;
public sentencesFromInputStreamEnum(final InputStream is, final String charset) throws UnsupportedEncodingException {
raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
public sentencesFromInputStreamEnum(final InputStream is) throws UnsupportedEncodingException {
raf = new BufferedReader(new InputStreamReader(is, "UTF-8"));
buffer = nextElement0();
counter = 0;
pre = false;

Loading…
Cancel
Save