*) Some bug fixes for UTF-8-related problems

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2577 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent f4af607b79
commit e2f8339827

@ -214,22 +214,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
} }
String h; String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[0].add(h); if (h.length() > 0) headlines[0].add(h);
} }
if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) { if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[1].add(h); if (h.length() > 0) headlines[1].add(h);
} }
if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) { if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[2].add(h); if (h.length() > 0) headlines[2].add(h);
} }
if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) { if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[3].add(h); if (h.length() > 0) headlines[3].add(h);
} }
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
} }
private static String cleanLine(String s) { private static String cleanLine(String s) {

@ -58,6 +58,7 @@ import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.Properties; import java.util.Properties;
@ -157,7 +158,13 @@ public final class htmlFilterOutputStream extends OutputStream {
while (e.hasMoreElements()) { while (e.hasMoreElements()) {
key = (String) e.nextElement(); key = (String) e.nextElement();
bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '='); bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '=');
bb = bb.append(quotechar).append(prop.getProperty(key).getBytes()).append(quotechar); bb = bb.append(quotechar);
try {
bb.append(prop.getProperty(key).getBytes("UTF-8"));
} catch (UnsupportedEncodingException e1) {
bb.append(prop.getProperty(key).getBytes());
}
bb.append(quotechar);
} }
if (bb.length() > 0) return bb.getBytes(1); if (bb.length() > 0) return bb.getBytes(1);
return bb.getBytes(); return bb.getBytes();

@ -45,6 +45,7 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Properties; import java.util.Properties;
public final class serverByteBuffer extends OutputStream { public final class serverByteBuffer extends OutputStream {
@ -379,7 +380,11 @@ public final class serverByteBuffer extends OutputStream {
start = pos; start = pos;
while ((pos < length) && (buffer[pos] != doublequote)) pos++; while ((pos < length) && (buffer[pos] != doublequote)) pos++;
if (pos >= length) break; // this is the case if we found no parent doublequote if (pos >= length) break; // this is the case if we found no parent doublequote
p.setProperty(key, new String(buffer, start, pos - start).trim()); try {
p.setProperty(key, new String(buffer, start, pos - start,"UTF-8").trim());
} catch (UnsupportedEncodingException e) {
p.setProperty(key, new String(buffer, start, pos - start).trim());
}
pos++; pos++;
} else if (buffer[pos] == singlequote) { } else if (buffer[pos] == singlequote) {
// search next singlequote // search next singlequote

Loading…
Cancel
Save