From 2ef8ffdb60bbf0ce64ee700c8baafdd8eebe274c Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 15 Mar 2015 06:02:45 +0100 Subject: [PATCH] apply UTF-8 encoding copied from escape() --- .../cora/document/id/MultiProtocolURL.java | 26 ++++++++++++++----- .../document/id/MultiProtocolURLTest.java | 5 ++-- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 6442eddd7..782ce93ce 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -75,8 +75,8 @@ public class MultiProtocolURL implements Serializable, Comparable 255 or 2-byte chars here ?) - if (UNRESERVED_PATH.get(b)) { - ptmp.append((char) b); - } else { - ptmp.append(hex[b]); + int ch = this.path.charAt(i); + if (ch <= 0x7F) { + if (UNRESERVED_PATH.get(ch)) { + ptmp.append((char) ch); + } else { + ptmp.append(hex[ch]); + modified = true; + } + } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF + ptmp.append(hex[0xc0 | (ch >> 6)]); + ptmp.append(hex[0x80 | (ch & 0x3F)]); + modified = true; + } else { // 0x7FF < ch <= 0xFFFF + ptmp.append(hex[0xe0 | (ch >> 12)]); + ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]); + ptmp.append(hex[0x80 | (ch & 0x3F)]); modified = true; } } diff --git a/test/net/yacy/cora/document/id/MultiProtocolURLTest.java b/test/net/yacy/cora/document/id/MultiProtocolURLTest.java index e4ccb9abd..790aaf64d 100644 --- a/test/net/yacy/cora/document/id/MultiProtocolURLTest.java +++ b/test/net/yacy/cora/document/id/MultiProtocolURLTest.java @@ -151,10 +151,9 @@ public class MultiProtocolURLTest { String[][] testStrings = new String[][]{ // teststring , expectedresult new String[]{"http://www.heise.de/newsticker/thema/%23saukontrovers", "http://www.heise.de/newsticker/thema/%23saukontrovers"}, // http://mantis.tokeek.de/view.php?id=519 - new String[]{"http://www.heise.de/newsticker/thema/#saukontrovers", "http://www.heise.de/newsticker/thema/"}, + new String[]{"http://www.heise.de/newsticker/thema/#saukontrovers", "http://www.heise.de/newsticker/thema/"}, // anchor fragment new String[]{"http://www.liferay.com/community/wiki/-/wiki/Main/Wiki+Portlet", "http://www.liferay.com/community/wiki/-/wiki/Main/Wiki+Portlet"}, // http://mantis.tokeek.de/view.php?id=559 - new String[]{"http://de.wikipedia.org/wiki/Philippe_Ariès", "http://de.wikipedia.org/wiki/Philippe_Ari%E8s"} // utf-8 2 byte char - // new String[]{"http://de.wikipedia.org/wiki/Philippe_Ariès", "http://de.wikipedia.org/wiki/Philippe_Ari%C3%A8s"} // above formal correct for utf8 codepage + new String[]{"http://de.wikipedia.org/wiki/Philippe_Ariès", "http://de.wikipedia.org/wiki/Philippe_Ari%C3%A8s"} // UTF-8 2 byte char }; for (String[] testString : testStrings) {