*) Better charset encoding detection

*) New test class for charset encoding detection tests

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent c54058e64b
commit 92f774edd1

@ -346,6 +346,9 @@ public final class plasmaParser {
public static String getRealCharsetEncoding(String encoding) { public static String getRealCharsetEncoding(String encoding) {
if ((encoding == null) || (encoding.length() == 0)) return "ISO-8859-1"; if ((encoding == null) || (encoding.length() == 0)) return "ISO-8859-1";
// trim encoding string
encoding = encoding.trim();
if (encoding.toLowerCase().startsWith("windows") && encoding.length() > 7) { if (encoding.toLowerCase().startsWith("windows") && encoding.length() > 7) {
char c = encoding.charAt(7); char c = encoding.charAt(7);
if (c == '_') encoding = "windows-" + encoding.substring(8); if (c == '_') encoding = "windows-" + encoding.substring(8);
@ -364,6 +367,23 @@ public final class plasmaParser {
else if ((c >= '0') && (c <= '9')) encoding = encoding.substring(0,8) + "-" + encoding.substring(8); else if ((c >= '0') && (c <= '9')) encoding = encoding.substring(0,8) + "-" + encoding.substring(8);
} }
// converting cp125[0-8] (with an optional '_' or '-' separator) -> windows-125[0-8]
if (encoding.toLowerCase().matches("cp([_-])?125[0-8]")) {
char c = encoding.charAt(2);
if (c == '_' || c == '-') encoding = "windows-" + encoding.substring(3);
else if ((c >= '0') && (c <= '9')) encoding = "windows-" + encoding.substring(2);
}
if (encoding.toLowerCase().matches("gb[_-]?2312([-_]80)?")) {
encoding = "x-EUC-CN";
}
if (encoding.toLowerCase().matches(".*utf[-_]?8.*")) {
encoding = "UTF-8";
}
return encoding; return encoding;
} }

@ -0,0 +1,40 @@
package de.anomic.plasma;
import java.nio.charset.Charset;
import junit.framework.TestCase;
/**
 * JUnit test case for {@link plasmaParser}.
 */
public class plasmaParserTest extends TestCase {

    /**
     * Feeds a table of raw charset names into
     * {@link plasmaParser#getRealCharsetEncoding(String)} and verifies that each
     * one is converted to the expected charset name.
     *
     * The comparison is case-insensitive: both the expected value and the
     * conversion result are lower-cased before they are compared.
     */
    public void testGetRealCharsetEncoding() {
        // each row: { raw input, expected conversion result }
        String[][] testStrings = new String[][] {
            new String[]{null,"ISO-8859-1"},
            new String[]{"windows1250","windows-1250"},
            new String[]{"windows_1250","windows-1250"},
            new String[]{"ISO-8859-1","ISO-8859-1"},
            new String[]{"ISO8859-1","ISO-8859-1"},
            new String[]{"ISO-88591","ISO-8859-1"},
            new String[]{"ISO88591","ISO-8859-1"},
            new String[]{"iso_8859_1","ISO-8859-1"},
            new String[]{"cp-1252","windows-1252"},
            new String[]{"gb_2312","x-EUC-CN"},
            new String[]{"gb_2312-80","x-EUC-CN"},
            new String[]{"UTF-8;","UTF-8"}
        };

        for (int i = 0; i < testStrings.length; i++) {
            String input = testStrings[i][0];

            // desired conversion result
            String shouldBe = testStrings[i][1].toLowerCase();

            // conversion result
            String charset = plasmaParser.getRealCharsetEncoding(input).toLowerCase();

            // JUnit expects (message, expected, actual); naming the raw input in the
            // message identifies the failing table row in the failure report
            assertEquals("getRealCharsetEncoding(" + input + ")", shouldBe, charset);

            // informational only: whether this JVM supports the charset is printed, not asserted
            System.out.println("testGetRealCharsetEncoding: " + testStrings[i][0] + " -> " + charset + " | Supported: " + Charset.isSupported(charset));
        }
    }
}
Loading…
Cancel
Save