*) htmlFilterContentScraper.java: using proper charset for document title

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2595 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 18 years ago
parent 5015e780c2
commit 06fa891152

@@ -211,7 +211,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
if (h.length() > 0) headlines[3].add(h);
}
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024))
title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString(this.charset));
}
private static String cleanLine(String s) {

@@ -725,11 +725,11 @@ public final class plasmaParser {
serverFileUtils.write(contentBytes, contentFile);
}
if ((args.length == 4)&&(args[2].equalsIgnoreCase("-m"))) {
if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) {
contentMimeType = args[3];
}
if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
if ((args.length >= 6)&&(args[4].equalsIgnoreCase("-c"))) {
charSet = args[5];
}
@@ -747,6 +747,9 @@ public final class plasmaParser {
// printing out all parsed sentences
if (document != null) {
System.out.print("Document titel: ");
System.out.println(document.getMainLongTitle());
// found text
String[] sentences = document.getSentences();
if (sentences != null) {

Loading…
Cancel
Save