Made mime type and extension normalization locale independent.

Previously, an upper-cased mime type was incorrectly normalized when the default locale was Turkish.
8 years ago · 286f3018bd
parent 319231a458
commit 286f3018bd
2 changed files with 69 additions and 4 deletions
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -29,6 +29,7 @@ import java.io.InputStream;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashSet;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
@ -128,6 +129,15 @@ public final class TextParser {
        for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
        return c;
    }
+    
+    /**
+     * @return a new set holding a copy of the mime2parser keys (all supported mime types); changes to it do not affect mime2parser
+     */
+    public static Set<String> supportedMimeTypes() {
+    	final Set<String> mimeTypes = new HashSet<>();
+    	mimeTypes.addAll(mime2parser.keySet());
+    	return mimeTypes;
+    }

    private static void initParser(final Parser parser) {
        String prototypeMime = null;
@ -145,7 +155,7 @@ public final class TextParser {
        }

        if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
-            ext = ext.toLowerCase();
+            ext = ext.toLowerCase(Locale.ROOT);
            final String s = ext2mime.get(ext);
            if (s != null && !s.equals(prototypeMime)) AbstractParser.log.info("Parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
            ext2mime.put(ext, prototypeMime);
@ -153,7 +163,7 @@ public final class TextParser {

        for (String ext: parser.supportedExtensions()) {
            // process the extensions
-            ext = ext.toLowerCase();
+            ext = ext.toLowerCase(Locale.ROOT);
            LinkedHashSet<Parser> p0 = ext2parser.get(ext);
            if (p0 == null) {
                p0 = new LinkedHashSet<Parser>();
@ -518,12 +528,12 @@ public final class TextParser {
    }

    public static String mimeOf(final String ext) {
-        return ext2mime.get(ext.toLowerCase());
+        return ext2mime.get(ext.toLowerCase(Locale.ROOT));
    }

    private static String normalizeMimeType(String mimeType) {
        if (mimeType == null) return "application/octet-stream";
-        mimeType = mimeType.toLowerCase();
+        mimeType = mimeType.toLowerCase(Locale.ROOT);
        final int pos = mimeType.indexOf(';');
        return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
    }
--- a/test/java/net/yacy/document/TextParserTest.java
+++ b/test/java/net/yacy/document/TextParserTest.java
@ -0,0 +1,55 @@
+// TextParserTest.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document;
+
+import static org.junit.Assert.*;
+
+import java.util.Locale;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for the {@link TextParser} class.
+ * 
+ * @author luccioman
+ *
+ */
+public class TextParserTest {
+
+	/**
+	 * Test the TextParser.supportsMime() consistency across every available default
+	 * locale. Possible failure case: with the Turkish ("tr") language, 'I' lower
+	 * cased does not become 'i' but '\u005Cu0131' (the latin small letter 'ı').
+	 * NOTE(review): the JVM default locale is changed here and never restored.
+	 */
+	@Test
+	public void testSupportsMimeLocaleConsistency() {
+		for (Locale locale : Locale.getAvailableLocales()) {
+			Locale.setDefault(locale);
+			for (String mimeType : TextParser.supportedMimeTypes()) {
+				assertNull(locale + " " + mimeType, TextParser.supportsMime(mimeType.toUpperCase(Locale.ROOT)));
+			}
+		}
+	}
+
+}