*) htmlFilterContentScraper.java: using proper charset for document title

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2595 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 06fa891152
parent 5015e780c2
commit 06fa891152
2 changed files with 7 additions and 3 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -211,7 +211,8 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
            h = cleanLine(super.stripAll(new serverByteBuffer(text)).toString());
            if (h.length() > 0) headlines[3].add(h);
        }
-        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString()); // TODO: bugfix needed for UTF-8       
+        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) 
+            title = cleanLine(super.stripAll(new serverByteBuffer(text)).toString(this.charset));        
    }

    private static String cleanLine(String s) {
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -725,11 +725,11 @@ public final class plasmaParser {
                serverFileUtils.write(contentBytes, contentFile);
            }
            
-            if ((args.length == 4)&&(args[2].equalsIgnoreCase("-m"))) {
+            if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) {
                contentMimeType = args[3];
            }
            
-            if ((args.length == 6)&&(args[4].equalsIgnoreCase("-c"))) {
+            if ((args.length >= 6)&&(args[4].equalsIgnoreCase("-c"))) {
                charSet = args[5];
            }            
            
@@ -747,6 +747,9 @@ public final class plasmaParser {

            // printing out all parsed sentences
            if (document != null) {
+                System.out.print("Document titel: ");
+                System.out.println(document.getMainLongTitle());
+                
                // found text
                String[] sentences = document.getSentences();
                if (sentences != null) {