Decode blacklist entries for easier editing of non-ASCII characters

Not using the JDK URLDecoder.decode() function, as it strips '+' characters when they occur after '?' (both characters having regular expression semantics when used in blacklist path patterns)
6 years ago · 61c337f29a
parent ed93221fa1
commit 61c337f29a
4 changed files with 155 additions and 33 deletions
--- a/htroot/Blacklist_p.java
+++ b/htroot/Blacklist_p.java
@ -419,10 +419,21 @@ public class Blacklist_p {
            }
            for (int j = offset; j < to; ++j){
-                final String nextEntry = sortedlist[j];
+                String nextEntry = sortedlist[j];
                if (nextEntry.isEmpty()) {
                	continue;
                }
                if (nextEntry.charAt(0) == '#') {
                	continue;
                }
                /** Decode the entry for easier reading of paths with non ascii characters */
        		final int slashPos = nextEntry.indexOf('/', 0);
        		if(slashPos > 0) {
        			nextEntry = nextEntry.substring(0, slashPos + 1) + MultiProtocolURL.unescapePath(nextEntry.substring(slashPos + 1));
        		}
                if (nextEntry.isEmpty()) continue;
                if (nextEntry.charAt(0) == '#') continue;
                prop.put(DISABLED + EDIT + "Itemlist_" + entryCount + "_dark", dark ? "1" : "0");
                dark = !dark;
                /* We do not use here putHTML as we don't want '+' characters to be interpreted as application/x-www-form-urlencoded encoding */
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException;
 import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.net.URLDecoder;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.util.BitSet;
 import java.util.LinkedHashMap;
@ -707,6 +708,60 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
 		return pathToEscape;
 	}
 	/**
 	 * Decode the UTF-8 percent-encoded characters possibly found in the given path.
 	 * <p>
 	 * Differences with {@link URLDecoder#decode(String, String)}:
 	 * </p>
 	 * <ul>
 	 * <li>the '+' character is not decoded to a space character</li>
 	 * <li>no exception is thrown when invalid hexadecimal digits are found after a
 	 * '%' character: the invalid sequence is copied to the result as is</li>
 	 * </ul>
 	 * <p>
 	 * Consecutive percent-encoded bytes are accumulated and decoded together as one
 	 * UTF-8 sequence, so that multi-byte characters are properly restored. Bytes
 	 * that do not form valid UTF-8 are replaced by the decoder replacement
 	 * character (standard behavior of the {@link String} constructor used).
 	 * </p>
 	 *
 	 * @param escaped a URL path, possibly percent-encoded. May be null.
 	 * @return the unescaped path, the very same instance when nothing had to be
 	 *         decoded, or null when the parameter is null.
 	 */
 	public static final String unescapePath(final String escaped) {
 		if (escaped == null) {
 			return escaped;
 		}
 		boolean modified = false;
 		final int len = escaped.length();
 		final StringBuilder unescaped = new StringBuilder(len > 500 ? len / 2 : len);
 		ByteBuffer utf8Bytes = null;
 		int i = 0;
 		while (i < len) {
 			final char ch = escaped.charAt(i);
 			/* Value of the percent-encoded byte starting at i, or -1 when there is none */
 			int byteValue = -1;
 			if (ch == '%' && (i + 2) < len) {
 				final int high = hexDigitValue(escaped.charAt(i + 1));
 				final int low = hexDigitValue(escaped.charAt(i + 2));
 				if (high >= 0 && low >= 0) {
 					byteValue = (high << 4) | low;
 				}
 			}
 			if (byteValue >= 0) {
 				if (utf8Bytes == null) {
 					/* At most one encoded byte per 3 remaining characters */
 					utf8Bytes = ByteBuffer.allocate((len - i) / 3);
 				}
 				utf8Bytes.put((byte) byteValue);
 				modified = true;
 				i += 3;
 			} else {
 				/*
 				 * Regular character, or '%' not starting a valid percent-encoded byte:
 				 * pending decoded bytes must be flushed first to keep the original
 				 * ordering, then the character is appended as is
 				 */
 				appendPendingBytes(utf8Bytes, unescaped);
 				unescaped.append(ch);
 				i++;
 			}
 		}
 		appendPendingBytes(utf8Bytes, unescaped);
 		return modified ? unescaped.toString() : escaped;
 	}

 	/**
 	 * Decode as UTF-8 the bytes accumulated so far in the buffer, append the result
 	 * to the target and reset the buffer for reuse.
 	 *
 	 * @param pending the buffer of percent-decoded bytes. May be null or empty.
 	 * @param target  the builder receiving the decoded characters
 	 */
 	private static void appendPendingBytes(final ByteBuffer pending, final StringBuilder target) {
 		if (pending != null && pending.position() > 0) {
 			target.append(new String(pending.array(), 0, pending.position(), StandardCharsets.UTF_8));
 			pending.position(0);
 		}
 	}

 	/**
 	 * @param character a character to evaluate
 	 * @return the numeric value (0 to 15) of an ASCII hexadecimal digit, or -1 when
 	 *         the character is not an ASCII hexadecimal digit
 	 */
 	private static int hexDigitValue(final char character) {
 		if (character >= '0' && character <= '9') {
 			return character - '0';
 		}
 		if (character >= 'a' && character <= 'f') {
 			return character - 'a' + 10;
 		}
 		if (character >= 'A' && character <= 'F') {
 			return character - 'A' + 10;
 		}
 		return -1;
 	}
 	/**
 	 * @param character a character to test
 	 * @return true when the character is a valid hexadecimal digit
--- a/source/net/yacy/repository/Blacklist.java
+++ b/source/net/yacy/repository/Blacklist.java
@ -275,51 +275,77 @@ public class Blacklist {
    public final void remove(final BlacklistType blacklistType, final String blacklistToUse, final String host, final String path) {
        final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, true);
-        Set<Pattern> hostList = blacklistMap.get(host);
+		removePatternFromMap(host, path, blacklistMap);
-        if (hostList != null) {
+
-            // remove pattern from list (by comparing patternstring with path, remove(path) will not match path)
+        final Map<String, Set<Pattern>> blacklistMapNotMatch = getBlacklistMap(blacklistType, false);
-            for (Pattern hp : hostList) {
+        removePatternFromMap(host, path, blacklistMapNotMatch);
-                String hpxs = hp.pattern();
+
-                if (hpxs.equals(path)) {
+        //TODO: check if delete from blacklist is desired, on reload entry will not be available in any blacklist
-                    hostList.remove(hp);
+        //      even if remove (above) from internal maps (at runtime) is only done for given blacklistType
        // load blacklist data from file
        final List<String> list = FileUtils.getListArray(new File(ListManager.listsPath, blacklistToUse));
        /* delete the old entry from file, in any normalized or not normalized possible combinations */
 		final Set<String> entriesToDelete = new HashSet<>();
 		final String normalizedPathPattern = MultiProtocolURL.escapePathPattern(path);
 		entriesToDelete.add(host + "/" + path);
 		entriesToDelete.add(host + "/" + normalizedPathPattern);
 		if (!Punycode.isBasic(host)) {
 			try {
 				final String normalizedHost = MultiProtocolURL.toPunycode(host);
 				entriesToDelete.add(normalizedHost + "/" + path);
 				entriesToDelete.add(normalizedHost + "/" + normalizedPathPattern);
 			} catch (final PunycodeException ignored) {
 				/* We continue even if a punycode flavor can not be produced */
 			}
 		}
        if (list != null) {
            for (final String e : list) {
                if (entriesToDelete.contains(e)) {
                    list.remove(e);
                    break;
                }
            }
-            if (hostList.isEmpty()) {
+            FileUtils.writeList(new File(ListManager.listsPath, blacklistToUse), list.toArray(new String[list.size()]));
                blacklistMap.remove(host);
        }
    }
-        final Map<String, Set<Pattern>> blacklistMapNotMatch = getBlacklistMap(blacklistType, false);
+	/**
-        hostList = blacklistMapNotMatch.get(host);
+	 * Remove the (host, pathPattern) entries eventually found in the given
 	 * blacklist map.
 	 * 
 	 * @param host         the host part of the entry to remove
 	 * @param pathPattern  the path pattern part of the entry to remove
 	 * @param blacklistMap a blacklist map to update
 	 */
 	private void removePatternFromMap(final String host, final String pathPattern,
 			final Map<String, Set<Pattern>> blacklistMap) {
 		final String normalizedPathPattern = MultiProtocolURL.escapePathPattern(pathPattern);
 		final Set<String> hosts = new HashSet<>();
 		hosts.add(host);
 		if (!Punycode.isBasic(host)) {
 			try {
 				hosts.add(MultiProtocolURL.toPunycode(host));
 			} catch (final PunycodeException ignored) {
 				/* We continue even if a punycode flavor can not be produced */
 			}
 		}
 		for (final String hostKey : hosts) {
 			final Set<Pattern> hostList = blacklistMap.get(hostKey);
 			if (hostList != null) {
-            // remove pattern from list
+				// remove pattern from list (by comparing patternstring with path, remove(path)
 				// will not match path)
 				for (Pattern hp : hostList) {
 					String hpxs = hp.pattern();
-                if (hpxs.equals(path)) {
+					if (hpxs.equals(pathPattern) || hpxs.equals(normalizedPathPattern)) {
 						hostList.remove(hp);
 						break;
 					}
 				}
 				if (hostList.isEmpty()) {
-                blacklistMapNotMatch.remove(host);
+					blacklistMap.remove(host);
            }
        }
        //TODO: check if delete from blacklist is desired, on reload entry will not be available in any blacklist
        //      even if remove (above) from internal maps (at runtime) is only done for given blacklistType
        // load blacklist data from file
        final List<String> list = FileUtils.getListArray(new File(ListManager.listsPath, blacklistToUse));
        // delete the old entry from file
        if (list != null) {
            for (final String e : list) {
                if (e.equals(host + "/" + path)) {
                    list.remove(e);
                    break;
 				}
 			}
            FileUtils.writeList(new File(ListManager.listsPath, blacklistToUse), list.toArray(new String[list.size()]));
 		}
 	}
--- a/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java
+++ b/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java
@ -391,6 +391,36 @@ public class MultiProtocolURLTest {
 		}
 	}
 	/**
 	 * Checks {@link MultiProtocolURL#unescapePath(String)} against a table of
 	 * (escaped path, expected unescaped path) pairs.
 	 */
 	@Test
 	public void testUnescapePath() {
 		/* Each row : { path to unescape, expected result } */
 		final String[][] samples = {
 				{ "", "" },
 				{ "/", "/" },
 				{ "/ascii/path", "/ascii/path" },
 				{ "/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9",
 						"/latin/chars/àäâéèïîôöù" },
 				{ "/wiki/%25", "/wiki/%" },
 				{ "/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97",
 						"/logograms/正體字/繁體字" },
 				/* Invalid or truncated percent sequences are expected to be left untouched */
 				{ "/bad/hexaDigits/%GH%-1%èà/file", "/bad/hexaDigits/%GH%-1%èà/file" },
 				{ "/missing/hexaDigit/%2", "/missing/hexaDigit/%2" },
 				{ "/missing/hexaDigits/%", "/missing/hexaDigits/%" },
 				/* Paths without any percent-encoding are expected to be returned unchanged */
 				{ "/unescaped/logograms/正體字/繁體字", "/unescaped/logograms/正體字/繁體字" },
 				{ "/unescaped/rfc3986/unreserved/path/chars/-._~",
 						"/unescaped/rfc3986/unreserved/path/chars/-._~" },
 				{ "/unescaped/rfc3986/subdelims/!$&'()*+,;=", "/unescaped/rfc3986/subdelims/!$&'()*+,;=" },
 				{ "/unescaped/rfc3986/pchar/additional/:@", "/unescaped/rfc3986/pchar/additional/:@" },
 				{ "/unescaped/regex/metacharacters/<([{\\^-=$!|]})?*+.>",
 						"/unescaped/regex/metacharacters/<([{\\^-=$!|]})?*+.>" } };
 		for (final String[] sample : samples) {
 			final String escapedPath = sample[0];
 			final String expectedPath = sample[1];
 			assertEquals(expectedPath, MultiProtocolURL.unescapePath(escapedPath));
 		}
 	}
 	/**
 	 * Unit tests for {@link MultiProtocolURL#escapePathPattern(String)}
 	 */