From 92f774edd111518a3be34cede1df4986fb80174d Mon Sep 17 00:00:00 2001 From: theli Date: Thu, 19 Oct 2006 07:02:18 +0000 Subject: [PATCH] *) Better charset encoding detection *) New testclass for charset encoding detection tests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2808 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaParser.java | 20 +++++++++++ test/de/anomic/plasma/plasmaParserTest.java | 40 +++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 test/de/anomic/plasma/plasmaParserTest.java diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index 0dd68265f..c59b6b566 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -346,6 +346,9 @@ public final class plasmaParser { public static String getRealCharsetEncoding(String encoding) { if ((encoding == null) || (encoding.length() == 0)) return "ISO-8859-1"; + // trim encoding string + encoding = encoding.trim(); + if (encoding.toLowerCase().startsWith("windows") && encoding.length() > 7) { char c = encoding.charAt(7); if (c == '_') encoding = "windows-" + encoding.substring(8); @@ -364,6 +367,23 @@ public final class plasmaParser { else if ((c >= '0') && (c <= '9')) encoding = encoding.substring(0,8) + "-" + encoding.substring(8); } + + // converting cp\d{4} -> windows-\d{4} + if (encoding.toLowerCase().matches("cp([_-])?125[0-8]")) { + char c = encoding.charAt(2); + if (c == '_' || c == '-') encoding = "windows-" + encoding.substring(3); + else if ((c >= '0') && (c <= '9')) encoding = "windows-" + encoding.substring(2); + } + + if (encoding.toLowerCase().matches("gb[_-]?2312([-_]80)?")) { + encoding = "x-EUC-CN"; + } + + if (encoding.toLowerCase().matches(".*utf[-_]?8.*")) { + encoding = "UTF-8"; + } + + return encoding; } diff --git a/test/de/anomic/plasma/plasmaParserTest.java b/test/de/anomic/plasma/plasmaParserTest.java new file mode 100644 index 
000000000..a35045f23
--- /dev/null
+++ b/test/de/anomic/plasma/plasmaParserTest.java
@@ -0,0 +1,40 @@
+package de.anomic.plasma;
+
+import java.nio.charset.Charset;
+import java.util.Locale;
+
+import junit.framework.TestCase;
+
+/** Tests the charset name normalization done by plasmaParser.getRealCharsetEncoding. */
+public class plasmaParserTest extends TestCase {
+    /** Checks each raw charset label against its expected normalized name, ignoring case. */
+    public void testGetRealCharsetEncoding() {
+        // each entry: { raw label passed to the parser, expected normalized name }
+        String[][] testStrings = new String[][] {
+            new String[]{null,"ISO-8859-1"},
+            new String[]{"windows1250","windows-1250"},
+            new String[]{"windows_1250","windows-1250"},
+            new String[]{"ISO-8859-1","ISO-8859-1"},
+            new String[]{"ISO8859-1","ISO-8859-1"},
+            new String[]{"ISO-88591","ISO-8859-1"},
+            new String[]{"ISO88591","ISO-8859-1"},
+            new String[]{"iso_8859_1","ISO-8859-1"},
+            new String[]{"cp-1252","windows-1252"},
+            new String[]{"gb_2312","x-EUC-CN"},
+            new String[]{"gb_2312-80","x-EUC-CN"},
+            new String[]{"UTF-8;","UTF-8"}
+        };
+
+        for (int i=0; i < testStrings.length; i++) {
+            // desired conversion result; fixed locale so lower-casing never leaves ASCII
+            String shouldBe = testStrings[i][1].toLowerCase(Locale.ENGLISH);
+
+            // conversion result
+            String charset = plasmaParser.getRealCharsetEncoding(testStrings[i][0]).toLowerCase(Locale.ENGLISH);
+
+            // JUnit order is (message, expected, actual), so a failure labels the values correctly
+            assertEquals("input: " + testStrings[i][0], shouldBe, charset);
+            System.out.println("testGetRealCharsetEncoding: " + testStrings[i][0] + " -> " + charset + " | Supported: " + Charset.isSupported(charset));
+        }
+    }
+}