* improve encoding detection of the HTTP service

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5337 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 16 years ago
parent 3246358485
commit 90e78b2cf6

@@ -46,6 +46,7 @@ import javax.swing.event.EventListenerList;
 import de.anomic.crawler.HTTPLoader;
 import de.anomic.http.HttpClient;
 import de.anomic.http.httpRequestHeader;
+import de.anomic.plasma.plasmaParser;
 import de.anomic.server.serverCharBuffer;
 import de.anomic.server.serverFileUtils;
 import de.anomic.yacy.yacyURL;
@@ -487,7 +488,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         // scrape document to look up charset
         final htmlFilterInputStream htmlFilter = new htmlFilterInputStream(new ByteArrayInputStream(page),"UTF-8",new yacyURL("http://localhost", null),null,false);
-        final String charset = htmlFilter.detectCharset();
+        final String charset = plasmaParser.patchCharsetEncoding(htmlFilter.detectCharset());
         // scrape content
         final htmlFilterContentScraper scraper = new htmlFilterContentScraper(new yacyURL("http://localhost", null));
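The fix routes the scraper's detected charset through plasmaParser.patchCharsetEncoding() before it is used. A minimal standalone sketch of that detect-then-normalize step, assuming a hypothetical safeCharset() helper (the validation via java.nio is this sketch's addition, not YaCy code):

import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;

// Hypothetical helper: HTML documents often declare noisy charset
// labels ("utf_8", odd case, too-short fragments), so the detected
// name is normalized and validated before it is handed to the JDK.
public class CharsetPatchDemo {

    static String safeCharset(String detected) {
        // empty or too-short labels fall back to the platform default
        if (detected == null || detected.trim().length() < 3) {
            return Charset.defaultCharset().name();
        }
        // fix wrong fill characters, as patchCharsetEncoding does
        String patched = detected.trim().replaceAll("_", "-");
        try {
            if (Charset.isSupported(patched)) return patched;
        } catch (IllegalCharsetNameException e) {
            // illegal name: fall through to the default
        }
        return Charset.defaultCharset().name();
    }

    public static void main(String[] args) {
        System.out.println(safeCharset("utf_8"));       // normalized to "utf-8"
        System.out.println(safeCharset("no-such-cs"));  // falls back to platform default
    }
}

Validating with Charset.isSupported() up front avoids an UnsupportedEncodingException later, when the stream is actually decoded with the detected name.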

@@ -37,6 +37,7 @@ import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
+import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
@@ -1310,7 +1311,7 @@ public final class httpd implements serverHandler, Cloneable {
         if (contentType == null)
             contentType = "text/html; charset=UTF-8";
         else if (contentType.startsWith("text/") && contentType.toLowerCase().indexOf("charset=")==-1)
-            contentType +="; charset=UTF-8";
+            contentType +="; charset=" + Charset.defaultCharset().name();
         headers.put(httpHeader.CONTENT_TYPE, contentType);
         if (contentLength > 0) headers.put(httpResponseHeader.CONTENT_LENGTH, Long.toString(contentLength));
         //if (cookie != null) headers.put(httpResponseHeader.SET_COOKIE, cookie);
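With this change, text/* responses that carry no charset parameter are stamped with the JVM's default charset instead of a hard-coded UTF-8, matching the encoding the response body was actually written in. A self-contained sketch of the header fix-up (withCharset() is an illustrative name, not the YaCy API):

import java.nio.charset.Charset;

// Illustrative only: mirrors the Content-Type fix-up above outside of httpd.
public class ContentTypeDemo {

    static String withCharset(String contentType) {
        if (contentType == null) {
            return "text/html; charset=UTF-8";
        }
        // only text/* types get a charset, and only if none is declared yet
        if (contentType.startsWith("text/")
                && contentType.toLowerCase().indexOf("charset=") == -1) {
            return contentType + "; charset=" + Charset.defaultCharset().name();
        }
        return contentType;
    }

    public static void main(String[] args) {
        System.out.println(withCharset(null));          // text/html; charset=UTF-8
        System.out.println(withCharset("text/plain"));  // text/plain; charset=<platform default>
        System.out.println(withCharset("image/png"));   // unchanged: binary types get no charset
    }
}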

@@ -569,6 +569,7 @@ public final class httpdFileHandler {
         String mimeType = mimeTable.getProperty(targetExt,"text/html");
         final boolean zipContent = requestHeader.acceptGzip() && httpd.shallTransportZipped("." + conProp.getProperty("EXT",""));
         if (path.endsWith("html") ||
+            path.endsWith("htm") ||
             path.endsWith("xml") ||
             path.endsWith("rdf") ||
             path.endsWith("rss") ||
@@ -701,7 +702,7 @@ public final class httpdFileHandler {
             fis.mark(1000);
             // scrape document to look up charset
             final htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fis,"UTF-8",new yacyURL("http://localhost", null),null,false);
-            final String charset = htmlFilter.detectCharset();
+            final String charset = plasmaParser.patchCharsetEncoding(htmlFilter.detectCharset());
             // reset position
             fis.reset();
             if(charset != null)
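The surrounding code depends on the stream supporting mark()/reset(): up to a kilobyte is sniffed for a charset declaration, then the stream is rewound so the parser reads the document from byte 0. A minimal sketch of that pattern (the 64-byte read stands in for htmlFilterInputStream.detectCharset()):

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

// Sketch of the mark/sniff/reset pattern used above: read a bounded
// prefix of the stream to guess the charset, then rewind so the full
// document can be parsed from the beginning.
public class MarkResetDemo {
    public static void main(String[] args) throws IOException {
        byte[] page = "<html><head></head><body>hi</body></html>"
                .getBytes(StandardCharsets.UTF_8);
        InputStream fis = new BufferedInputStream(new ByteArrayInputStream(page));

        fis.mark(1000);               // remember the current position; valid for reads up to 1000 bytes
        byte[] prefix = new byte[64];
        int n = fis.read(prefix);     // sniff a prefix (stand-in for detectCharset())
        fis.reset();                  // rewind; the next read starts at byte 0 again

        System.out.println("sniffed " + n + " bytes, stream rewound");
    }
}

This only works on streams where markSupported() is true (BufferedInputStream, ByteArrayInputStream), which is why the file stream is wrapped before sniffing.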

@@ -350,8 +350,8 @@ public final class plasmaParser {
      */
     public static String patchCharsetEncoding(String encoding) {
-        // return a default encoding
-        if ((encoding == null) || (encoding.length() < 3)) return "ISO-8859-1";
+        // return the system default encoding
+        if ((encoding == null) || (encoding.length() < 3)) return Charset.defaultCharset().name();
         // trim encoding string
         encoding = encoding.trim();
@@ -362,6 +362,7 @@ public final class plasmaParser {
         if (encoding.startsWith("BIG")) return "Big5";
         // all other names but such with "windows" use uppercase
         if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
+        if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
         // fix wrong fill characters
         encoding = encoding.replaceAll("_", "-");
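Taken together, these two hunks make patchCharsetEncoding() return the platform default for empty or too-short labels and map the "MACINTOSH" spelling onto the JDK's "MacRoman" name. A rough restatement of the rules visible in the diff (the uppercasing step is assumed from the surrounding comment; the real method handles more cases):

import java.nio.charset.Charset;

// Rough re-statement of the normalization rules shown in the diff;
// not the actual plasmaParser code.
public class PatchEncodingDemo {

    static String patch(String encoding) {
        // return the system default encoding for empty/garbage labels
        if (encoding == null || encoding.length() < 3) {
            return Charset.defaultCharset().name();
        }
        // all names except "windows-*" use uppercase (assumed from the diff's comment)
        encoding = encoding.trim().toUpperCase();
        if (encoding.startsWith("BIG")) return "Big5";
        if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
        if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
        // fix wrong fill characters
        return encoding.replaceAll("_", "-");
    }

    public static void main(String[] args) {
        System.out.println(patch("macintosh"));     // MacRoman
        System.out.println(patch("WINDOWS_1252"));  // windows-1252
        System.out.println(patch(""));              // platform default
    }
}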
