Some very small advances toward indexing UTF-8 text (not working yet); also inserted debugging code

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5298 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 2f49666908
commit b098522977

@ -74,7 +74,7 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
bc++;
r.append(' ');
} else if (c == rb) {
bc --;
bc--;
} else if (bc <= 0) {
r.append(c);
}

@ -267,7 +267,7 @@ public class htmlFilterCharacterCoding {
}
// the entity is unknown, skip it
}
return new String(sb);
return sb.toString();
}
public static void main(final String[] args) {

@ -78,7 +78,11 @@ public class indexWord {
// create a word hash
public static final String word2hash(final String word) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength);
String e = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase(Locale.ENGLISH))).substring(0, yacySeedDB.commonHashLength);
if (word.startsWith("hofbr")) {
System.out.println("*** DEBUG ENCODING: " + word + " -> " + e);
}
return e;
}
public static final Set<String> words2hashSet(final String[] words) {

@ -491,8 +491,7 @@ public final class plasmaCondenser {
}
public final static boolean invisible(final char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;
if (c - ' ' >= invisibleChar.length) return false;
return invisibleChar[c - ' '];
}
@ -521,17 +520,11 @@ public final class plasmaCondenser {
private StringBuffer nextElement0() {
StringBuffer s;
char c;
loop: while (e.hasMoreElements()) {
s = e.nextElement();
if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
// TODO: Bugfix needed for UTF-8
if (((c < 'a') || (c > 'z')) &&
((c < 'A') || (c > 'Z')) &&
((c < '0') || (c > '9')))
continue loop; // go to next while loop
if (invisible(s.charAt(i))) continue loop;
}
return s;
}

Loading…
Cancel
Save