*) Trying to be more tolerant against wrong charset names

See: http://www.yacy-forum.de/viewtopic.php?p=26662

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2759 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 18 years ago
parent 7526c831a8
commit e9afe39cbb

@ -343,6 +343,18 @@ public final class plasmaParser {
} }
} }
public static String getRealCharsetEncoding(String encoding) {
if ((encoding == null) || (encoding.length() == 0)) return "ISO-8859-1";
if (encoding.toLowerCase().startsWith("windows") && encoding.length() > 7) {
char c = encoding.charAt(7);
if (c == '_') encoding = "windows-" + encoding.substring(8);
else if ((c >= '0') && (c <= '9')) encoding = "windows-" + encoding.substring(7);
}
return encoding;
}
public static String getRealMimeType(String mimeType) { public static String getRealMimeType(String mimeType) {
//if (mimeType == null) doMimeTypeAnalysis //if (mimeType == null) doMimeTypeAnalysis
if (mimeType == null) mimeType = "application/octet-stream"; if (mimeType == null) mimeType = "application/octet-stream";
@ -562,7 +574,7 @@ public final class plasmaParser {
// getting the charset of the document // getting the charset of the document
// TODO: do a charset detection here .... // TODO: do a charset detection here ....
String documentCharset = (theDocumentCharset == null) ? "ISO-8859-1" : theDocumentCharset; String documentCharset = getRealCharsetEncoding(theDocumentCharset);
// testing if parsing is supported for this resource // testing if parsing is supported for this resource
if (!plasmaParser.supportedContent(location,mimeType)) { if (!plasmaParser.supportedContent(location,mimeType)) {
@ -629,7 +641,10 @@ public final class plasmaParser {
String charset = htmlFilter.detectCharset(); String charset = htmlFilter.detectCharset();
if (charset == null) { if (charset == null) {
charset = documentCharset; charset = documentCharset;
} else {
charset = getRealCharsetEncoding(charset);
} }
if (!documentCharset.equalsIgnoreCase(charset)) { if (!documentCharset.equalsIgnoreCase(charset)) {
this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'"); this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'");
} }
@ -769,9 +784,11 @@ public final class plasmaParser {
public static void main(String[] args) { public static void main(String[] args) {
//javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
//java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out
httpc remote = null;
try { try {
Object content = null; Object content = null;
URL contentURL = null; URL contentURL = null;
long contentLength = -1;
String contentMimeType = "application/octet-stream"; String contentMimeType = "application/octet-stream";
String charSet = "UTF-8"; String charSet = "UTF-8";
@ -787,7 +804,22 @@ public final class plasmaParser {
contentURL = new URL(args[1]); contentURL = new URL(args[1]);
// downloading the document content // downloading the document content
content = httpc.singleGET(contentURL, contentURL.getHost(), 10000, null, null, null); remote = httpc.getInstance(
contentURL.getHost(),
contentURL.getHost(),
contentURL.getPort(),
5000,
contentURL.getProtocol().equalsIgnoreCase("https"));
httpc.response res = remote.GET(contentURL.getFile(), null);
if (res.statusCode != 200) {
System.err.println("Unable to download " + contentURL + ". " + res.status);
return;
}
content = res.getContentInputStream();
contentMimeType = res.responseHeader.mime();
charSet = res.responseHeader.getCharacterEncoding();
contentLength = res.responseHeader.contentLength();
} }
if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) { if ((args.length >= 4)&&(args[2].equalsIgnoreCase("-m"))) {
@ -813,6 +845,8 @@ public final class plasmaParser {
document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content); document = theParser.parseSource(contentURL, contentMimeType, charSet, (byte[])content);
} else if (content instanceof File) { } else if (content instanceof File) {
document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content); document = theParser.parseSource(contentURL, contentMimeType, charSet, (File)content);
} else if (content instanceof InputStream) {
document = theParser.parseSource(contentURL, contentMimeType, charSet, contentLength, (InputStream)content);
} }
// printing out all parsed sentences // printing out all parsed sentences
@ -842,6 +876,8 @@ public final class plasmaParser {
} }
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
} finally {
if (remote != null) try { httpc.returnInstance(remote); } catch (Exception e) {}
} }
} }

Loading…
Cancel
Save