diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 56bdee44d..ecf38c2f0 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -214,22 +214,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } String h; if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8 if (h.length() > 0) headlines[0].add(h); } if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8 if (h.length() > 0) headlines[1].add(h); } if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8 if (h.length() > 0) headlines[2].add(h); } if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { - h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8 if (h.length() > 0) headlines[3].add(h); } - if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); + if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8 } private static String cleanLine(String s) { diff --git a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java index 8a60a01ac..0a679f0ed 100644 --- a/source/de/anomic/htmlFilter/htmlFilterOutputStream.java +++ b/source/de/anomic/htmlFilter/htmlFilterOutputStream.java @@ -58,6 +58,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.util.Enumeration; import java.util.Properties; @@ -157,7 +158,13 @@ public final class htmlFilterOutputStream extends OutputStream { while (e.hasMoreElements()) { key = (String) e.nextElement(); bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '='); - bb = bb.append(quotechar).append(prop.getProperty(key).getBytes()).append(quotechar); + bb = bb.append(quotechar); + try { + bb.append(prop.getProperty(key).getBytes("UTF-8")); + } catch (UnsupportedEncodingException e1) { + bb.append(prop.getProperty(key).getBytes()); + } + bb.append(quotechar); } if (bb.length() > 0) return bb.getBytes(1); return bb.getBytes(); diff --git a/source/de/anomic/server/serverByteBuffer.java b/source/de/anomic/server/serverByteBuffer.java index 64a17898b..0943327df 100644 --- a/source/de/anomic/server/serverByteBuffer.java +++ b/source/de/anomic/server/serverByteBuffer.java @@ -45,6 +45,7 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; +import java.io.UnsupportedEncodingException; import java.util.Properties; public final class serverByteBuffer extends OutputStream { @@ -379,7 +380,11 @@ public final class serverByteBuffer extends OutputStream { start = pos; while ((pos < length) && (buffer[pos] != doublequote)) pos++; if (pos >= length) break; // this is the case if we found no parent doublequote - p.setProperty(key, new String(buffer, start, pos - start).trim()); + try { + p.setProperty(key, new String(buffer, start, pos - start,"UTF-8").trim()); + } catch (UnsupportedEncodingException e) { + p.setProperty(key, new String(buffer, start, pos - start).trim()); + } pos++; } else if (buffer[pos] == singlequote) { // search next singlequote