fix bug: data from plasmaParser is already converted to UTF-8

After removing the restrictions in the code, YaCy should be able to index Unicode characters!


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5290 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 17 years ago
parent 47f0c3b002
commit dd8441f102

@ -28,6 +28,7 @@ package de.anomic.index;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Locale;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
@ -77,7 +78,7 @@ public class indexWord {
// create a word hash // create a word hash
public static final String word2hash(final String word) { public static final String word2hash(final String word) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength); return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength);
} }
public static final Set<String> words2hashSet(final String[] words) { public static final Set<String> words2hashSet(final String[] words) {

@ -40,6 +40,7 @@ import java.util.Enumeration;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Properties; import java.util.Properties;
import java.util.TreeMap; import java.util.TreeMap;
@ -234,13 +235,13 @@ public final class plasmaCondenser {
indexWord wprop; indexWord wprop;
sievedWordsEnum wordenum; sievedWordsEnum wordenum;
try { try {
wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8"); wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()));
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
return; return;
} }
int pip = 0; int pip = 0;
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = (new String(wordenum.nextElement())).toLowerCase(); word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH);
if (useForLanguageIdentification) languageIdentificator.add(word); if (useForLanguageIdentification) languageIdentificator.add(word);
if (word.length() < 3) continue; if (word.length() < 3) continue;
wprop = words.get(word); wprop = words.get(word);
@ -321,7 +322,7 @@ public final class plasmaCondenser {
// read source // read source
final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset); final sievedWordsEnum wordenum = new sievedWordsEnum(is, charset);
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = (new String(wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars? word = (new String(wordenum.nextElement())).toLowerCase(Locale.ENGLISH); // TODO: does toLowerCase work for non ISO-8859-1 chars?
if (languageIdentificator != null) languageIdentificator.add(word); if (languageIdentificator != null) languageIdentificator.add(word);
if (word.length() < wordminsize) continue; if (word.length() < wordminsize) continue;
//System.out.println("PARSED-WORD " + word); //System.out.println("PARSED-WORD " + word);
@ -509,8 +510,8 @@ public final class plasmaCondenser {
StringBuffer buffer = null; StringBuffer buffer = null;
unsievedWordsEnum e; unsievedWordsEnum e;
public sievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException { public sievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new unsievedWordsEnum(is, charset); e = new unsievedWordsEnum(is);
buffer = nextElement0(); buffer = nextElement0();
} }
@ -555,8 +556,8 @@ public final class plasmaCondenser {
sentencesFromInputStreamEnum e; sentencesFromInputStreamEnum e;
StringBuffer s; StringBuffer s;
public unsievedWordsEnum(final InputStream is, final String charset) throws UnsupportedEncodingException { public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
e = new sentencesFromInputStreamEnum(is, charset); e = new sentencesFromInputStreamEnum(is);
s = new StringBuffer(20); s = new StringBuffer(20);
buffer = nextElement0(); buffer = nextElement0();
} }
@ -616,9 +617,9 @@ public final class plasmaCondenser {
return sb; return sb;
} }
public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is, final String charset) { public static sentencesFromInputStreamEnum sentencesFromInputStream(final InputStream is) {
try { try {
return new sentencesFromInputStreamEnum(is, charset); return new sentencesFromInputStreamEnum(is);
} catch (final UnsupportedEncodingException e) { } catch (final UnsupportedEncodingException e) {
return null; return null;
} }
@ -633,8 +634,8 @@ public final class plasmaCondenser {
int counter = 0; int counter = 0;
boolean pre = false; boolean pre = false;
public sentencesFromInputStreamEnum(final InputStream is, final String charset) throws UnsupportedEncodingException { public sentencesFromInputStreamEnum(final InputStream is) throws UnsupportedEncodingException {
raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset)); raf = new BufferedReader(new InputStreamReader(is, "UTF-8"));
buffer = nextElement0(); buffer = nextElement0();
counter = 0; counter = 0;
pre = false; pre = false;

Loading…
Cancel
Save