* improve encoding detection of http service

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5337 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 90e78b2cf6
parent 3246358485
commit 90e78b2cf6
4 changed files with 9 additions and 5 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -46,6 +46,7 @@ import javax.swing.event.EventListenerList;
 import de.anomic.crawler.HTTPLoader;
 import de.anomic.http.HttpClient;
 import de.anomic.http.httpRequestHeader;
 import de.anomic.plasma.plasmaParser;
 import de.anomic.server.serverCharBuffer;
 import de.anomic.server.serverFileUtils;
 import de.anomic.yacy.yacyURL;
@@ -487,7 +488,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
        // scrape document to look up charset
        final htmlFilterInputStream htmlFilter = new htmlFilterInputStream(new ByteArrayInputStream(page),"UTF-8",new yacyURL("http://localhost", null),null,false);
-        final String charset = htmlFilter.detectCharset();
+        final String charset = plasmaParser.patchCharsetEncoding(htmlFilter.detectCharset());
        // scrape content
        final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL("http://localhost", null));
--- a/source/de/anomic/http/httpd.java
+++ b/source/de/anomic/http/httpd.java
@@ -37,6 +37,7 @@ import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
 import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
@@ -1310,7 +1311,7 @@ public final class httpd implements serverHandler, Cloneable {
        if (contentType == null) 
            contentType = "text/html; charset=UTF-8";
        else if (contentType.startsWith("text/") && contentType.toLowerCase().indexOf("charset=")==-1)
-            contentType +="; charset=UTF-8";
+            contentType +="; charset=" + Charset.defaultCharset().name();
        headers.put(httpHeader.CONTENT_TYPE, contentType);  
        if (contentLength > 0)   headers.put(httpResponseHeader.CONTENT_LENGTH, Long.toString(contentLength));
        //if (cookie != null)      headers.put(httpResponseHeader.SET_COOKIE, cookie);
--- a/source/de/anomic/http/httpdFileHandler.java
+++ b/source/de/anomic/http/httpdFileHandler.java
@@ -569,6 +569,7 @@ public final class httpdFileHandler {
                String mimeType = mimeTable.getProperty(targetExt,"text/html");
                final boolean zipContent = requestHeader.acceptGzip() && httpd.shallTransportZipped("." + conProp.getProperty("EXT",""));
                if (path.endsWith("html") || 
                        path.endsWith("htm") || 
                        path.endsWith("xml") || 
                        path.endsWith("rdf") || 
                        path.endsWith("rss") || 
@@ -701,7 +702,7 @@ public final class httpdFileHandler {
                		fis.mark(1000);
                        // scrape document to look up charset
                        final htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fis,"UTF-8",new yacyURL("http://localhost", null),null,false);
-                        final String charset = htmlFilter.detectCharset();
+                        final String charset = plasmaParser.patchCharsetEncoding(htmlFilter.detectCharset());
                        // reset position
                        fis.reset();
                        if(charset != null)
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -350,8 +350,8 @@ public final class plasmaParser {
     */
    public static String patchCharsetEncoding(String encoding) {
-        // return a default encoding
+        // return the system default encoding
-    	if ((encoding == null) || (encoding.length() < 3)) return "ISO-8859-1";
+    	if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name();
    	// trim encoding string
    	encoding = encoding.trim();
@@ -362,6 +362,7 @@ public final class plasmaParser {
    	if (encoding.startsWith("BIG")) return "Big5";
    	// all other names but such with "windows" use uppercase
    	if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
    	if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
    	// fix wrong fill characters
    	encoding = encoding.replaceAll("_", "-");