From 61c337f29a90d28728235d1cec11ea5958f82706 Mon Sep 17 00:00:00 2001
From: luccioman
Date: Thu, 4 Oct 2018 09:33:58 +0200
Subject: [PATCH] Decode blacklist entries for easier editing of non-ASCII chars

Not using the JDK URLDecoder.decode() function, as it strips '+'
characters when they occur after '?' (both characters having regular
expression semantics when used in blacklist path patterns)
---
 htroot/Blacklist_p.java                       | 17 +++-
 .../cora/document/id/MultiProtocolURL.java    | 52 +++++++++++
 source/net/yacy/repository/Blacklist.java     | 86 ++++++++++++-------
 .../document/id/MultiProtocolURLTest.java     | 31 +++++++
 4 files changed, 153 insertions(+), 33 deletions(-)

NOTE(review): this patch was rebuilt from a copy whose line breaks,
indentation and <...> spans (generic type arguments, Javadoc tags, author
e-mail) had been stripped. Generic types and Javadoc markup were restored from
usage. The three leading context lines of the unescapePath() hunk and the
summary sentence of its Javadoc could not be recovered: that hunk now starts
directly on the added lines and its summary sentence was rewritten from the
code. Two defects in the added code were fixed (unescapePath() appended an
invalid '%' sequence before the pending decoded bytes; removePatternFromMap()
removed the wrong map key) and a regression test case was added, so hunk
headers and the diffstat were recomputed. Blob ids on the "index" lines,
indentation and the position of blank context lines are unverified: check
against the repository before applying.

diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java
index 0bb4d9990..acbe3ceb9 100644
--- a/htroot/Blacklist_p.java
+++ b/htroot/Blacklist_p.java
@@ -419,10 +419,21 @@ public class Blacklist_p {
                 }
 
                 for (int j = offset; j < to; ++j){
-                    final String nextEntry = sortedlist[j];
+                    String nextEntry = sortedlist[j];
 
-                    if (nextEntry.isEmpty()) continue;
-                    if (nextEntry.charAt(0) == '#') continue;
+                    if (nextEntry.isEmpty()) {
+                        continue;
+                    }
+                    if (nextEntry.charAt(0) == '#') {
+                        continue;
+                    }
+
+                    /* Decode only the path part (after the first '/') for easier reading of non ascii characters */
+                    final int slashPos = nextEntry.indexOf('/', 0);
+                    if (slashPos > 0) {
+                        nextEntry = nextEntry.substring(0, slashPos + 1) + MultiProtocolURL.unescapePath(nextEntry.substring(slashPos + 1));
+                    }
+
                     prop.put(DISABLED + EDIT + "Itemlist_" + entryCount + "_dark", dark ? "1" : "0");
                     dark = !dark;
                     /* We do not use here putHTML as we don't want '+' characters to be interpreted as application/x-www-form-urlencoded encoding */
diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index bf2f083fa..7f4bf9a2c 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException;
 import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.net.URLDecoder;
+import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.util.BitSet;
 import java.util.LinkedHashMap;
@@ -709,3 +710,54 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolURL>
+    /**
+     * Decode the UTF-8 percent-encoded characters found in the given path.
+     * <br>
+     * Differences with {@link URLDecoder#decode(String, String)} :
+     * <ul>
+     * <li>the '+' character is not decoded to space character</li>
+     * <li>no exception is thrown when invalid hexadecimal digits are found after a '%' character</li>
+     * </ul>
+     *
+     * @param escaped an URL path, possibly percent-encoded
+     * @return the unescaped path, the given string itself when nothing was decoded, or null when escaped is null.
+     */
+    public static final String unescapePath(final String escaped) {
+        if (escaped == null) {
+            return escaped;
+        }
+        boolean modified = false;
+        final int len = escaped.length();
+        final StringBuilder unescaped = new StringBuilder(len > 500 ? len / 2 : len);
+        ByteBuffer utf8Bytes = null;
+        int i = 0;
+        while (i < len) {
+            final char ch = escaped.charAt(i);
+            if (ch == '%' && (i + 2) < len && isHexDigit(escaped.charAt(i + 1))
+                    && isHexDigit(escaped.charAt(i + 2))) {
+                if (utf8Bytes == null) {
+                    utf8Bytes = ByteBuffer.allocate((len - i) / 3);
+                }
+                /* Percent-encoded character UTF-8 byte : buffered until the whole bytes sequence is known */
+                utf8Bytes.put((byte) Integer.parseInt(escaped.substring(i + 1, i + 3), 16));
+                modified = true;
+                i += 2;
+            } else {
+                /*
+                 * Literal character, or '%' not starting a valid percent-encoded sequence : it is
+                 * appended as is, after flushing the pending decoded bytes to keep characters order
+                 */
+                if (utf8Bytes != null && utf8Bytes.position() > 0) {
+                    unescaped.append(new String(utf8Bytes.array(), 0, utf8Bytes.position(), StandardCharsets.UTF_8));
+                    utf8Bytes.position(0);
+                }
+                unescaped.append(ch);
+            }
+            i++;
+        }
+        if (utf8Bytes != null && utf8Bytes.position() > 0) {
+            unescaped.append(new String(utf8Bytes.array(), 0, utf8Bytes.position(), StandardCharsets.UTF_8));
+        }
+
+        return modified ? unescaped.toString() : escaped;
+    }
 
     /**
      * @param character a character to test
diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java
index 27b197a47..08dcf5daf 100644
--- a/source/net/yacy/repository/Blacklist.java
+++ b/source/net/yacy/repository/Blacklist.java
@@ -275,46 +275,33 @@ public class Blacklist {
     public final void remove(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) {
 
         final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, true);
-        Set<Pattern> hostList = blacklistMap.get(host);
-        if (hostList != null) {
-            // remove pattern from list (by comparing patternstring with path, remove(path) will not match path)
-            for (Pattern hp : hostList) {
-                String hpxs = hp.pattern();
-                if (hpxs.equals(path)) {
-                    hostList.remove(hp);
-                    break;
-                }
-            }
-            if (hostList.isEmpty()) {
-                blacklistMap.remove(host);
-            }
-        }
+        removePatternFromMap(host, path, blacklistMap);
 
         final Map<String, Set<Pattern>> blacklistMapNotMatch = getBlacklistMap(blacklistType, false);
-        hostList = blacklistMapNotMatch.get(host);
-        if (hostList != null) {
-            // remove pattern from list
-            for (Pattern hp : hostList) {
-                String hpxs = hp.pattern();
-                if (hpxs.equals(path)) {
-                    hostList.remove(hp);
-                    break;
-                }
-            }
-            if (hostList.isEmpty()) {
-                blacklistMapNotMatch.remove(host);
-            }
-        }
+        removePatternFromMap(host, path, blacklistMapNotMatch);
 
         //TODO: check if delete from blacklist is desired, on reload entry will not be available in any blacklist
         //      even if remove (above) from internal maps (at runtime) is only done for given blacklistType
         // load blacklist data from file
         final List<String> list = FileUtils.getListArray(new File(ListManager.listsPath, blacklistToUse));
 
-        // delete the old entry from file
+        /* delete the old entry from file, in any combination of normalized or non-normalized host and path */
+        final Set<String> entriesToDelete = new HashSet<>();
+        final String normalizedPathPattern = MultiProtocolURL.escapePathPattern(path);
+        entriesToDelete.add(host + "/" + path);
+        entriesToDelete.add(host + "/" + normalizedPathPattern);
+        if (!Punycode.isBasic(host)) {
+            try {
+                final String normalizedHost = MultiProtocolURL.toPunycode(host);
+                entriesToDelete.add(normalizedHost + "/" + path);
+                entriesToDelete.add(normalizedHost + "/" + normalizedPathPattern);
+            } catch (final PunycodeException ignored) {
+                /* We continue even if a punycode flavor can not be produced */
+            }
+        }
         if (list != null) {
             for (final String e : list) {
-                if (e.equals(host + "/" + path)) {
+                if (entriesToDelete.contains(e)) {
                     list.remove(e);
                     break;
                 }
@@ -322,6 +309,45 @@ public class Blacklist {
             FileUtils.writeList(new File(ListManager.listsPath, blacklistToUse), list.toArray(new String[list.size()]));
         }
     }
+
+    /**
+     * Remove the (host, pathPattern) entries possibly found in the given
+     * blacklist map, looking under both the given host and its punycode form.
+     *
+     * @param host         the host part of the entry to remove
+     * @param pathPattern  the path pattern part of the entry to remove
+     * @param blacklistMap a blacklist map to update
+     */
+    private void removePatternFromMap(final String host, final String pathPattern,
+            final Map<String, Set<Pattern>> blacklistMap) {
+        final String normalizedPathPattern = MultiProtocolURL.escapePathPattern(pathPattern);
+        final Set<String> hosts = new HashSet<>();
+        hosts.add(host);
+        if (!Punycode.isBasic(host)) {
+            try {
+                hosts.add(MultiProtocolURL.toPunycode(host));
+            } catch (final PunycodeException ignored) {
+                /* We continue even if a punycode flavor can not be produced */
+            }
+        }
+        for (final String hostKey : hosts) {
+            final Set<Pattern> hostList = blacklistMap.get(hostKey);
+            if (hostList != null) {
+                // remove pattern from list (by comparing patternstring with path, remove(path)
+                // will not match path) ; the loop is left right after the removal
+                for (Pattern hp : hostList) {
+                    String hpxs = hp.pattern();
+                    if (hpxs.equals(pathPattern) || hpxs.equals(normalizedPathPattern)) {
+                        hostList.remove(hp);
+                        break;
+                    }
+                }
+                if (hostList.isEmpty()) {
+                    blacklistMap.remove(hostKey);
+                }
+            }
+        }
+    }
 
     /**
      * Adds entries to a given blacklist internal data and updates the source
diff --git a/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java b/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java
index 370279269..c189cb67f 100644
--- a/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java
+++ b/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java
@@ -391,6 +391,37 @@ public class MultiProtocolURLTest {
         }
     }
 
+    /**
+     * Unit tests for {@link MultiProtocolURL#unescapePath(String)}
+     */
+    @Test
+    public void testUnescapePath() {
+        String[][] testStrings = new String[][] {
+                // "test string", "expected unescaped result"
+                new String[] { "", "" }, new String[] { "/", "/" }, new String[] { "/ascii/path", "/ascii/path" },
+                new String[] { "/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9",
+                        "/latin/chars/àäâéèïîôöù" },
+                new String[] { "/wiki/%25", "/wiki/%" },
+                new String[] { "/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97",
+                        "/logograms/正體字/繁體字" },
+                new String[] { "/bad/hexaDigits/%GH%-1%èà/file", "/bad/hexaDigits/%GH%-1%èà/file" },
+                new String[] { "/escaped/then/invalid/%C3%A0%GH", "/escaped/then/invalid/à%GH" },
+                new String[] { "/missing/hexaDigit/%2", "/missing/hexaDigit/%2" },
+                new String[] { "/missing/hexaDigits/%", "/missing/hexaDigits/%" },
+                new String[] { "/unescaped/logograms/正體字/繁體字", "/unescaped/logograms/正體字/繁體字" },
+                new String[] { "/unescaped/rfc3986/unreserved/path/chars/-._~",
+                        "/unescaped/rfc3986/unreserved/path/chars/-._~" },
+                new String[] { "/unescaped/rfc3986/subdelims/!$&'()*+,;=", "/unescaped/rfc3986/subdelims/!$&'()*+,;=" },
+                new String[] { "/unescaped/rfc3986/pchar/additional/:@", "/unescaped/rfc3986/pchar/additional/:@" },
+                new String[] { "/unescaped/regex/metacharacters/<([{\\^-=$!|]})?*+.>",
+                        "/unescaped/regex/metacharacters/<([{\\^-=$!|]})?*+.>" } };
+        for (int i = 0; i < testStrings.length; i++) {
+            String[] testString = testStrings[i];
+            final String decoded = MultiProtocolURL.unescapePath(testString[0]);
+            assertEquals(testString[1], decoded);
+        }
+    }
+
     /**
      * Unit tests for {@link MultiProtocolURL#escapePathPattern(String)}
      */