diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
index f0c244a7b..0e7b8de14 100644
--- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java
@@ -74,7 +74,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
bc++;
r.append(' ');
} else if (c == rb) {
- bc --;
+ bc--;
} else if (bc <= 0) {
r.append(c);
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java
index 430df4c4a..14076b591 100644
--- a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java
+++ b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java
@@ -267,7 +267,7 @@ public class htmlFilterCharacterCoding {
}
// the entity is unknown, skip it
}
- return new String(sb);
+ return sb.toString();
}
public static void main(final String[] args) {
diff --git a/source/de/anomic/index/indexWord.java b/source/de/anomic/index/indexWord.java
index 082981003..01b5609f1 100644
--- a/source/de/anomic/index/indexWord.java
+++ b/source/de/anomic/index/indexWord.java
@@ -78,7 +78,9 @@ public class indexWord {
// create a word hash
public static final String word2hash(final String word) {
- return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength);
+ // the hash is the first commonHashLength characters of the encoded raw MD5 of the lower-cased word
+ final String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength);
+ return hash;
}
public static final Set words2hashSet(final String[] words) {
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 734e9ea76..9637bc11d 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -491,8 +491,7 @@ public final class plasmaCondenser {
}
public final static boolean invisible(final char c) {
- // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
- if ((c < ' ') || (c > 'z')) return true;
+ if ((c < ' ') || (c - ' ' >= invisibleChar.length)) return c < ' '; // outside the lookup table: control chars (would be a negative index) stay invisible, chars above the table count as visible
return invisibleChar[c - ' '];
}
@@ -521,17 +520,11 @@ public final class plasmaCondenser {
private StringBuffer nextElement0() {
StringBuffer s;
- char c;
loop: while (e.hasMoreElements()) {
s = e.nextElement();
if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
for (int i = 0; i < s.length(); i++) {
- c = s.charAt(i);
- // TODO: Bugfix needed for UTF-8
- if (((c < 'a') || (c > 'z')) &&
- ((c < 'A') || (c > 'Z')) &&
- ((c < '0') || (c > '9')))
- continue loop; // go to next while loop
+ if (invisible(s.charAt(i))) continue loop;
}
return s;
}