/*
 * Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
 *
 * Author: Oliver Hitz
 *
 * This file is part of GNU Libidn.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
 * USA
 */

/*
 * Provenance: added to YaCy as source/de/anomic/tools/Punycode.java
 * (package de.anomic.tools) in svn revision 4514, "added punycode class
 * from gnu idn library". Presented here as a standalone compilation unit.
 */

/**
 * Punycode (RFC 3492) encoder and decoder.
 *
 * <p>{@link #encode(String)} and {@link #decode(String)} convert between a
 * Unicode string and its bare Punycode form. Neither method adds or strips
 * the "xn--" ACE prefix, and neither splits a host name into labels: a
 * caller converting an internationalized domain name has to process each
 * dot-separated label on its own and handle the prefix itself.
 *
 * <p>Both directions operate on Unicode code points, so characters outside
 * the Basic Multilingual Plane (surrogate pairs in a Java string) are
 * handled as single code points.
 */
public class Punycode
{
  /* Punycode parameters (RFC 3492, section 5) */
  final static int TMIN = 1;
  final static int TMAX = 26;
  final static int BASE = 36;
  final static int INITIAL_N = 128;
  final static int INITIAL_BIAS = 72;
  final static int DAMP = 700;
  final static int SKEW = 38;
  final static char DELIMITER = '-';

  /**
   * Punycodes a unicode string.
   *
   * @param input Unicode string.
   * @return Punycoded string (without any "xn--" prefix).
   * @throws PunycodeException with message {@link PunycodeException#OVERFLOW}
   *         if an intermediate value does not fit into an int.
   */
  public static String encode(String input)
    throws PunycodeException
  {
    final int[] codePoints = toCodePoints(input);
    final int length = codePoints.length;

    int n = INITIAL_N;
    int delta = 0;
    int bias = INITIAL_BIAS;
    StringBuilder output = new StringBuilder();

    // Copy all basic code points to the output
    int b = 0;
    for (int i = 0; i < length; i++) {
      if (codePoints[i] < INITIAL_N) {
        output.append((char) codePoints[i]);
        b++;
      }
    }

    // Append delimiter
    if (b > 0) {
      output.append(DELIMITER);
    }

    // h counts the code points that have been handled so far
    int h = b;
    while (h < length) {
      int m = Integer.MAX_VALUE;

      // Find the minimum code point >= n
      for (int i = 0; i < length; i++) {
        int c = codePoints[i];
        if (c >= n && c < m) {
          m = c;
        }
      }

      if (m - n > (Integer.MAX_VALUE - delta) / (h + 1)) {
        throw new PunycodeException(PunycodeException.OVERFLOW);
      }
      delta = delta + (m - n) * (h + 1);
      n = m;

      for (int j = 0; j < length; j++) {
        int c = codePoints[j];
        if (c < n) {
          // Check before incrementing: a Java int wraps to a negative
          // value on overflow, so comparing the result with 0 never fires.
          if (delta == Integer.MAX_VALUE) {
            throw new PunycodeException(PunycodeException.OVERFLOW);
          }
          delta++;
        }
        if (c == n) {
          appendDelta(output, delta, bias);
          bias = adapt(delta, h + 1, h == b);
          delta = 0;
          h++;
        }
      }

      delta++;
      n++;
    }

    return output.toString();
  }

  /**
   * Decode a punycoded string.
   *
   * @param input Punycode string (without any "xn--" prefix). Everything
   *        before the last delimiter is copied literally; everything after
   *        it must consist of Punycode digits (letters in either case, or
   *        decimal digits).
   * @return Unicode string.
   * @throws PunycodeException with message {@link PunycodeException#BAD_INPUT}
   *         if the input contains a non-basic character, an invalid digit,
   *         ends in the middle of a number, or decodes to a value above the
   *         largest Unicode code point; with message
   *         {@link PunycodeException#OVERFLOW} if an intermediate value
   *         does not fit into an int.
   */
  public static String decode(String input)
    throws PunycodeException
  {
    final int inputLength = input.length();

    int n = INITIAL_N;
    int i = 0;
    int bias = INITIAL_BIAS;

    // Every input character yields at most one output code point, so the
    // input length is an upper bound for the number of decoded code points.
    int[] output = new int[inputLength];
    int outLength = 0;

    int d = input.lastIndexOf(DELIMITER);
    if (d > 0) {
      for (int j = 0; j < d; j++) {
        char c = input.charAt(j);
        if (!isBasic(c)) {
          throw new PunycodeException(PunycodeException.BAD_INPUT);
        }
        output[outLength++] = c;
      }
      d++;
    } else {
      d = 0;
    }

    while (d < inputLength) {
      int oldi = i;
      int w = 1;

      // Decode one generalized variable-length integer into i
      for (int k = BASE; ; k += BASE) {
        if (d == inputLength) {
          throw new PunycodeException(PunycodeException.BAD_INPUT);
        }
        int digit = codepoint2digit(input.charAt(d++));
        if (digit > (Integer.MAX_VALUE - i) / w) {
          throw new PunycodeException(PunycodeException.OVERFLOW);
        }

        i = i + digit * w;

        int t = threshold(k, bias);
        if (digit < t) {
          break;
        }
        if (w > Integer.MAX_VALUE / (BASE - t)) {
          throw new PunycodeException(PunycodeException.OVERFLOW);
        }
        w = w * (BASE - t);
      }

      bias = adapt(i - oldi, outLength + 1, oldi == 0);

      if (i / (outLength + 1) > Integer.MAX_VALUE - n) {
        throw new PunycodeException(PunycodeException.OVERFLOW);
      }

      n = n + i / (outLength + 1);
      i = i % (outLength + 1);

      if (n > Character.MAX_CODE_POINT) {
        throw new PunycodeException(PunycodeException.BAD_INPUT);
      }

      // Insert code point n at position i
      System.arraycopy(output, i, output, i + 1, outLength - i);
      output[i] = n;
      outLength++;
      i++;
    }

    return new String(output, 0, outLength);
  }

  /**
   * Bias adaptation function of the Punycode algorithm.
   *
   * @param delta the delta that has just been encoded or decoded.
   * @param numpoints number of code points handled so far, including the
   *        current one.
   * @param first true for the first delta of a string; it is then damped
   *        by DAMP instead of being halved.
   * @return the new bias.
   */
  public final static int adapt(int delta, int numpoints, boolean first)
  {
    if (first) {
      delta = delta / DAMP;
    } else {
      delta = delta / 2;
    }

    delta = delta + (delta / numpoints);

    int k = 0;
    while (delta > ((BASE - TMIN) * TMAX) / 2) {
      delta = delta / (BASE - TMIN);
      k = k + BASE;
    }

    return k + ((BASE - TMIN + 1) * delta) / (delta + SKEW);
  }

  /**
   * Tells whether a character is a basic code point, i.e. below 0x80.
   *
   * @param c character to test.
   * @return true if c is below 0x80.
   */
  public final static boolean isBasic(char c)
  {
    return c < 0x80;
  }

  // the following method has been added by Michael Christen
  /**
   * Tells whether a string consists of basic code points only.
   *
   * @param input string to test.
   * @return true if every character of input is below 0x80 (also true for
   *         the empty string).
   */
  public static boolean isBasic(String input) {
    for (int j = 0; j < input.length(); j++) {
      if (!isBasic(input.charAt(j))) return false;
    }
    return true;
  }

  /**
   * Maps a Punycode digit to the character that represents it.
   *
   * @param d digit value, 0..35.
   * @return 'a'..'z' for 0..25 and '0'..'9' for 26..35.
   * @throws PunycodeException with message {@link PunycodeException#BAD_INPUT}
   *         if d is outside 0..35.
   */
  public final static int digit2codepoint(int d)
    throws PunycodeException
  {
    if (d < 0) {
      throw new PunycodeException(PunycodeException.BAD_INPUT);
    } else if (d < 26) {
      // 0..25 : 'a'..'z'
      return d + 'a';
    } else if (d < 36) {
      // 26..35 : '0'..'9';
      return d - 26 + '0';
    } else {
      throw new PunycodeException(PunycodeException.BAD_INPUT);
    }
  }

  /**
   * Maps a character to the Punycode digit it represents.
   *
   * @param c code point of the character.
   * @return 26..35 for '0'..'9' and 0..25 for 'a'..'z' or 'A'..'Z'.
   * @throws PunycodeException with message {@link PunycodeException#BAD_INPUT}
   *         if c is not a Punycode digit.
   */
  public final static int codepoint2digit(int c)
    throws PunycodeException
  {
    // Explicit range checks: with signed ints a plain "c - '0' < 10" would
    // also accept every character below '0'.
    if (c >= '0' && c <= '9') {
      // '0'..'9' : 26..35
      return c - '0' + 26;
    } else if (c >= 'a' && c <= 'z') {
      // 'a'..'z' : 0..25
      return c - 'a';
    } else if (c >= 'A' && c <= 'Z') {
      // 'A'..'Z' : 0..25, decoders have to accept both cases
      return c - 'A';
    } else {
      throw new PunycodeException(PunycodeException.BAD_INPUT);
    }
  }

  /** Splits a string into its Unicode code points. */
  private static int[] toCodePoints(String s)
  {
    int[] codePoints = new int[s.codePointCount(0, s.length())];
    int count = 0;
    for (int i = 0; i < s.length(); ) {
      int cp = s.codePointAt(i);
      codePoints[count++] = cp;
      i += Character.charCount(cp);
    }
    return codePoints;
  }

  /** Computes the digit threshold t for position k under the given bias. */
  private static int threshold(int k, int bias)
  {
    if (k <= bias) {
      return TMIN;
    } else if (k >= bias + TMAX) {
      return TMAX;
    } else {
      return k - bias;
    }
  }

  /** Appends delta to output as a generalized variable-length integer. */
  private static void appendDelta(StringBuilder output, int delta, int bias)
    throws PunycodeException
  {
    int q = delta;
    for (int k = BASE; ; k += BASE) {
      int t = threshold(k, bias);
      if (q < t) {
        break;
      }
      output.append((char) digit2codepoint(t + (q - t) % (BASE - t)));
      q = (q - t) / (BASE - t);
    }
    output.append((char) digit2codepoint(q));
  }

  /**
   * Signals that a string could not be encoded or decoded. The message is
   * either {@link #OVERFLOW} or {@link #BAD_INPUT}.
   */
  public static class PunycodeException
    extends Exception
  {
    private static final long serialVersionUID = 1L;
    public static final String OVERFLOW = "Overflow.";
    public static final String BAD_INPUT = "Bad input.";

    /**
     * Creates a new PunycodeException.
     *
     * @param m message.
     */
    public PunycodeException(String m)
    {
      super(m);
    }
  }
}