*) some bugfixes for UTF-8 related problems

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2577 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent f4af607b79
commit e2f8339827

@ -214,22 +214,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[0].add(h);
}
if ((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[1].add(h);
}
if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[2].add(h);
}
if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if (h.length() > 0) headlines[3].add(h);
}
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
}
private static String cleanLine(String s) {

@ -58,6 +58,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Enumeration;
import java.util.Properties;
@ -157,7 +158,13 @@ public final class htmlFilterOutputStream extends OutputStream {
while (e.hasMoreElements()) {
key = (String) e.nextElement();
bb = bb.append((byte) 32).append(key.getBytes()).append((byte) '=');
bb = bb.append(quotechar).append(prop.getProperty(key).getBytes()).append(quotechar);
bb = bb.append(quotechar);
try {
bb.append(prop.getProperty(key).getBytes("UTF-8"));
} catch (UnsupportedEncodingException e1) {
bb.append(prop.getProperty(key).getBytes());
}
bb.append(quotechar);
}
if (bb.length() > 0) return bb.getBytes(1);
return bb.getBytes();

@ -45,6 +45,7 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.Properties;
public final class serverByteBuffer extends OutputStream {
@ -379,7 +380,11 @@ public final class serverByteBuffer extends OutputStream {
start = pos;
while ((pos < length) && (buffer[pos] != doublequote)) pos++;
if (pos >= length) break; // this is the case if we found no parent doublequote
try {
p.setProperty(key, new String(buffer, start, pos - start,"UTF-8").trim());
} catch (UnsupportedEncodingException e) {
p.setProperty(key, new String(buffer, start, pos - start).trim());
}
pos++;
} else if (buffer[pos] == singlequote) {
// search next singlequote

Loading…
Cancel
Save