diff --git a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java index f0c244a7b..0e7b8de14 100644 --- a/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterAbstractScraper.java @@ -74,7 +74,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper { bc++; r.append(' '); } else if (c == rb) { - bc --; + bc--; } else if (bc <= 0) { r.append(c); } diff --git a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java index 430df4c4a..14076b591 100644 --- a/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java +++ b/source/de/anomic/htmlFilter/htmlFilterCharacterCoding.java @@ -267,7 +267,7 @@ public class htmlFilterCharacterCoding { } // the entity is unknown, skip it } - return new String(sb); + return sb.toString(); } public static void main(final String[] args) { diff --git a/source/de/anomic/index/indexWord.java b/source/de/anomic/index/indexWord.java index 082981003..01b5609f1 100644 --- a/source/de/anomic/index/indexWord.java +++ b/source/de/anomic/index/indexWord.java @@ -78,7 +78,11 @@ public class indexWord { // create a word hash public static final String word2hash(final String word) { - return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength); + String e = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength); + if (word.startsWith("hofbr")) { + System.out.println("*** DEBUG ENCODING: " + word + " -> " + e); + } + return e; } public static final Set words2hashSet(final String[] words) { diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 734e9ea76..9637bc11d 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -491,8 +491,7 @@ public final class plasmaCondenser { } public final static boolean invisible(final char c) { - // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars? - if ((c < ' ') || (c > 'z')) return true; + if (c - ' ' >= invisibleChar.length) return false; return invisibleChar[c - ' ']; } @@ -521,17 +520,11 @@ public final class plasmaCondenser { private StringBuffer nextElement0() { StringBuffer s; - char c; loop: while (e.hasMoreElements()) { s = e.nextElement(); if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s; for (int i = 0; i < s.length(); i++) { - c = s.charAt(i); - // TODO: Bugfix needed for UTF-8 - if (((c < 'a') || (c > 'z')) && - ((c < 'A') || (c > 'Z')) && - ((c < '0') || (c > '9'))) - continue loop; // go to next while loop + if (invisible(s.charAt(i))) continue loop; } return s; }