Improved normalization of blacklist path patterns containing non-ASCII characters

Normalize blacklist path patterns using percent-encoding, both when a pattern is edited in the web interface and when patterns are loaded from configuration files. Fixes issue #237.
6 years ago · ed93221fa1
parent d42f079c2d
commit ed93221fa1
5 changed files with 269 additions and 41 deletions
--- a/htroot/Blacklist_p.java
+++ b/htroot/Blacklist_p.java
@ -37,6 +37,9 @@ import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.document.id.Punycode;
 import net.yacy.cora.document.id.Punycode.PunycodeException;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.data.ListManager;
@ -313,7 +316,10 @@ public class Blacklist_p {
                    for (final Entry<String, String> selectedEntry : selectedBlacklistEntries.entrySet()) {
                    	final String editedEntryValue = editedBlacklistEntries.get(selectedEntry.getKey().replace("selectedBlacklistEntry.", "editedBlacklistEntry."));
-                        if (!selectedEntry.getValue().equals(editedEntryValue)) {
+                    	
                    	final String preparedNewEntry = prepareNormalizedEntry(editedEntryValue);
                        if (!normalizeEntry(selectedEntry.getValue()).equals(preparedNewEntry)) {
                        	/* Add first, to detect any eventual syntax errors before removing the old entry */
                            if (!BlacklistHelper.addBlacklistEntry(blacklistToUse, editedEntryValue, header)) {
@ -540,4 +546,33 @@ public class Blacklist_p {
        return prop;
    }
 	/**
 	 * Prepare a blacklist entry with {@link BlacklistHelper#prepareEntry(String)},
 	 * then normalize the prepared result with {@link #normalizeEntry(String)}.
 	 * 
 	 * @param entry a blacklist entry. Must not be null.
 	 * @return the prepared and normalized entry, as done internally in
 	 *         BlacklistHelper.addBlacklistEntry()
 	 */
 	private static String prepareNormalizedEntry(final String entry) {
 		final String preparedEntry = BlacklistHelper.prepareEntry(entry);
 		return normalizeEntry(preparedEntry);
 	}
 	/**
 	 * Normalize a blacklist entry : the host part (everything before the first
 	 * '/') is punycode encoded when it contains non basic characters, and the
 	 * path part (everything after the first '/') is percent-encoded as a path
 	 * pattern.
 	 * 
 	 * @param entry a blacklist entry. Must not be null.
 	 * @return a normalized entry (punycode encoded host and percent-encoded path)
 	 *         as done internally in BlacklistHelper.addBlacklistEntry(). When the
 	 *         entry contains no '/' separator, the whole entry is processed as a
 	 *         host and returned without any path part.
 	 */
 	private static String normalizeEntry(final String entry) {
 		final int slashPos = entry.indexOf('/');
 		/*
 		 * Without any '/' the whole entry is the host : calling substring(0, -1)
 		 * here would throw a StringIndexOutOfBoundsException
 		 */
 		final boolean hasPath = slashPos >= 0;
 		String host = hasPath ? entry.substring(0, slashPos) : entry;
 		try {
 			if (!Punycode.isBasic(host)) {
 				host = MultiProtocolURL.toPunycode(host);
 			}
 		} catch (final PunycodeException ignored) {
 			/*
 			 * Punycode encoding error will be handled in
 			 * BlacklistHelper.addBlacklistEntry() : the host is kept unchanged here
 			 */
 		}
 		if (!hasPath) {
 			return host;
 		}
 		final String path = MultiProtocolURL.escapePathPattern(entry.substring(slashPos + 1));
 		return host + "/" + path;
 	}
 }
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -81,8 +81,22 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
    private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?");
    //private static final Pattern patternSpace = Pattern.compile("%20");
-    private final static BitSet UNRESERVED_RFC1738 = new BitSet(128); // register unreserved chars (never escaped in url)
+    /** Register unreserved chars (never escaped in url) */
-    private final static BitSet UNRESERVED_PATH    = new BitSet(128); // register unreserved chars for path part (not escaped in path)
+    private final static BitSet UNRESERVED_RFC1738 = new BitSet(128);
    /** Register unreserved chars for path part (not escaped in path) */
    private final static BitSet UNRESERVED_PATH    = new BitSet(128);
 	/**
 	 * Registry of the regular expression metacharacters used by the
 	 * {@link Pattern} class. The set is filled in the static initializer of this
 	 * class, and characters registered here are appended unchanged (not
 	 * percent-encoded) when a path is escaped as a pattern.
 	 * 
 	 * @see <a href=
 	 *      "https://docs.oracle.com/javase/tutorial/essential/regex/literals.html">Regular
 	 *      expressions string literals documentation</a>
 	 */
 	private static final BitSet PATTERN_METACHARACTERS = new BitSet(128);
    static {
        // unreserved characters (chars not to escape in url)
        for (int i = 'A'; i <= 'Z'; i++) { // hialpha RFC1738 Section 5
@ -119,6 +133,27 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        UNRESERVED_PATH.set('@');
        UNRESERVED_PATH.set('&');
        UNRESERVED_PATH.set('=');
        /* Pattern metacharacters : <([{\^-=$!|]})?*+.> */
        PATTERN_METACHARACTERS.set('<');
        PATTERN_METACHARACTERS.set('(');
        PATTERN_METACHARACTERS.set('[');
        PATTERN_METACHARACTERS.set('{');
        PATTERN_METACHARACTERS.set('\\');
        PATTERN_METACHARACTERS.set('^');
        PATTERN_METACHARACTERS.set('-');
        PATTERN_METACHARACTERS.set('=');
        PATTERN_METACHARACTERS.set('$');
        PATTERN_METACHARACTERS.set('!');
        PATTERN_METACHARACTERS.set('|');
        PATTERN_METACHARACTERS.set(']');
        PATTERN_METACHARACTERS.set('}');
        PATTERN_METACHARACTERS.set(')');
        PATTERN_METACHARACTERS.set('?');
        PATTERN_METACHARACTERS.set('*');
        PATTERN_METACHARACTERS.set('+');
        PATTERN_METACHARACTERS.set('.');
        PATTERN_METACHARACTERS.set('>');
    }
    // session id handling
@ -552,45 +587,135 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
     * </ul>
     */
    private void escape() {
-        if (this.path != null && this.path.indexOf('%') == -1) escapePath();
+        if (this.path != null && this.path.indexOf('%') == -1) {
        	this.path = escapePath(this.path);
        }
        if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
        if (this.anchor != null) this.anchor = escape(this.anchor).toString();
    }
-    /**
+	/**
-     * Url encode/escape the path part according to the allowed characters
+	 * <p>Percent-encode/escape an URL path part according to the allowed characters
-     * (RFC1738 & RFC2396)
+	 * (see RFC3986, and formerly RFC1738 & RFC2396). Uses UTF-8 character codes for
-     * uses UTF-8 character codes for non-ASCII
+	 * non-ASCII.</p>
-     */
+	 * <p>Important : already percent-encoded characters are not re-encoded</p>
-    private void escapePath() {
+	 * 
-        final StringBuilder ptmp = new StringBuilder(this.path.length() + 10);
+	 * @param pathToEscape the path part to escape.
-        boolean modified = false;
+	 * @return an escaped path with only ASCII characters, or null when pathToEscape
-        final int len = this.path.length();
+	 *         is null.
-        for (int i = 0; i < len; i++) {
+	 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
-            int ch = this.path.charAt(i);
+	 *      percent-encoding section</a>
-            if (ch <= 0x7F) {
+	 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
-                if (UNRESERVED_PATH.get(ch)) {
+	 *      definition</a>
-                    ptmp.append((char) ch);
+	 */
-                } else {
+    public static String escapePath(final String pathToEscape) {
-                    ptmp.append(hex[ch]);
+    	return escapePath(pathToEscape, false);
                    modified = true;
                }
            } else if (ch <= 0x07FF) {              // non-ASCII <= 0x7FF
                ptmp.append(hex[0xc0 | (ch >> 6)]);
                ptmp.append(hex[0x80 | (ch & 0x3F)]);
                modified = true;
            } else {                                // 0x7FF < ch <= 0xFFFF
                ptmp.append(hex[0xe0 | (ch >> 12)]);
                ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
                ptmp.append(hex[0x80 | (ch & 0x3F)]);
                modified = true;
            }
        }
        if (modified) {
            this.path = ptmp.toString();
        }
    }
 	/**
 	 * <p>Percent-encode/escape a regular expression matching URL paths : characters
 	 * allowed in an URL path (see RFC3986) and metacharacters of the
 	 * {@link Pattern} regular expressions are kept as is. Uses UTF-8 character
 	 * codes for non-ASCII.</p>
 	 * <p>Important : already percent-encoded characters are not re-encoded</p>
 	 * 
 	 * @param pathPattern the URL path regular expression to escape.
 	 * @return an escaped path regular expression with only allowed ASCII
 	 *         characters, or null when pathPattern is null.
 	 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 	 *      percent-encoding section</a>
 	 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 	 *      definition</a>
 	 */
 	public static String escapePathPattern(final String pathPattern) {
 		/* Process the string as a pattern : regex metacharacters must be preserved */
 		final boolean processAsPattern = true;
 		return escapePath(pathPattern, processAsPattern);
 	}
 	/**
 	 * <p>
 	 * Percent-encode/escape an URL path part according to the allowed characters
 	 * specified in RFC3986 (formerly RFC1738 and RFC2396). Uses UTF-8 character
 	 * codes for non-ASCII, including characters outside the Basic Multilingual
 	 * Plane (stored as a surrogate pair in the string), which are encoded as a
 	 * single four bytes UTF-8 sequence.
 	 * </p>
 	 * <p>
 	 * When isPattern is true, the string is processed as a regular expression, and
 	 * therefore meta-characters used by the {@link Pattern} class are not
 	 * percent-encoded.
 	 * </p>
 	 * <p>
 	 * Already percent-encoded characters are not re-encoded, but their hexadecimal
 	 * digits are normalized to upper case. A '%' that is not followed by two
 	 * hexadecimal digits is itself percent-encoded.
 	 * </p>
 	 * 
 	 * @param pathToEscape the path part to escape.
 	 * @param isPattern    when true, regular expression meta-characters are not
 	 *                     escaped
 	 * @return an escaped path with only allowed ASCII characters (the same
 	 *         instance as pathToEscape when nothing had to be modified), or null
 	 *         when pathToEscape is null.
 	 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 	 *      percent-encoding section</a>
 	 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 	 *      definition</a>
 	 */
 	private static String escapePath(final String pathToEscape, final boolean isPattern) {
 		if (pathToEscape == null) {
 			return pathToEscape;
 		}
 		final StringBuilder ptmp = new StringBuilder(pathToEscape.length() + 10);
 		boolean modified = false;
 		final int len = pathToEscape.length();
 		int i = 0;
 		while (i < len) {
 			/*
 			 * Read a full code point and not a single UTF-16 code unit : otherwise each
 			 * half of a surrogate pair would be encoded separately, producing an invalid
 			 * UTF-8 sequence
 			 */
 			final int ch = pathToEscape.codePointAt(i);
 			if (ch == '%' && (i + 2) < len) {
 				final char digit1 = pathToEscape.charAt(i + 1);
 				final char digit2 = pathToEscape.charAt(i + 2);
 				if (isHexDigit(digit1) && isHexDigit(digit2)) {
 					/* Already percent-encoded character */
 					ptmp.append('%');
 					/* Normalize hexadecimal digits to upper case */
 					if (Character.isLowerCase(digit1) || Character.isLowerCase(digit2)) {
 						modified = true;
 					}
 					ptmp.append(Character.toUpperCase(digit1));
 					ptmp.append(Character.toUpperCase(digit2));
 					/* Skip the two hexadecimal digits that have just been consumed */
 					i += 2;
 				} else {
 					/* Not a valid percent-encoded character : we encode it now */
 					ptmp.append(hex[ch]);
 					modified = true;
 				}
 			} else if (isPattern && PATTERN_METACHARACTERS.get(ch)) {
 				/* Regular expression meta-character : must be kept as is */
 				ptmp.append((char) ch);
 			} else if (ch <= 0x7F) {
 				if (UNRESERVED_PATH.get(ch)) {
 					ptmp.append((char) ch);
 				} else {
 					ptmp.append(hex[ch]);
 					modified = true;
 				}
 			} else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF : two bytes
 				ptmp.append(hex[0xc0 | (ch >> 6)]);
 				ptmp.append(hex[0x80 | (ch & 0x3F)]);
 				modified = true;
 			} else if (ch <= 0xFFFF) { // 0x7FF < ch <= 0xFFFF : three bytes
 				ptmp.append(hex[0xe0 | (ch >> 12)]);
 				ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
 				ptmp.append(hex[0x80 | (ch & 0x3F)]);
 				modified = true;
 			} else { // 0xFFFF < ch <= 0x10FFFF : four bytes
 				/* NOTE(review): assumes the hex table covers every byte value up to 0xFF - confirm */
 				ptmp.append(hex[0xf0 | (ch >> 18)]);
 				ptmp.append(hex[0x80 | ((ch >> 12) & 0x3F)]);
 				ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
 				ptmp.append(hex[0x80 | (ch & 0x3F)]);
 				modified = true;
 			}
 			/* Advance by one or two chars depending on the code point just processed */
 			i += Character.charCount(ch);
 		}
 		if (modified) {
 			return ptmp.toString();
 		}
 		return pathToEscape;
 	}
 	/**
 	 * Check whether a character is one of the ASCII hexadecimal digits
 	 * (0-9, a-f or A-F).
 	 * 
 	 * @param character a character to test
 	 * @return true when the character is a valid hexadecimal digit
 	 */
 	private static boolean isHexDigit(final int character) {
 		if (character >= '0' && character <= '9') {
 			return true;
 		}
 		if (character >= 'a' && character <= 'f') {
 			return true;
 		}
 		return character >= 'A' && character <= 'F';
 	}
    private void escapeSearchpart() {
        final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
        for (final Map.Entry<String, String> element: getAttributes().entrySet()) {
--- a/source/net/yacy/repository/Blacklist.java
+++ b/source/net/yacy/repository/Blacklist.java
@ -236,7 +236,10 @@ public class Blacklist {
                    	log.warn("ignored blacklist path to prevent 'Dangling meta character' exception: " + a);
                        continue;
                    }
-                    loadedPathsPattern.add(Pattern.compile(a, Pattern.CASE_INSENSITIVE)); // add case insesitive regex
+                    /* We ensure now that any necessary percent-encoding is applied, as the blacklist file may have been manually edited.
                     * (when using the web interface, encoding should already have been applied in the add() function) */
                    final String normalizedPattern = MultiProtocolURL.escapePathPattern(a);
                    loadedPathsPattern.add(Pattern.compile(normalizedPattern, Pattern.CASE_INSENSITIVE)); // add case insesitive regex
                }
                // create new entry if host mask unknown, otherwise merge
@ -348,8 +351,9 @@ public class Blacklist {
 					final String host = itemToAdd.getHost();
 					final String path = itemToAdd.getPath();
 					final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
 					final String safePath = MultiProtocolURL.escapePathPattern(path);
-					if (contains(blacklistType, safeHost, path)) {
+					if (contains(blacklistType, safeHost, safePath)) {
 						/* Continue to the next item */
 						continue;
 					}
@ -364,7 +368,7 @@ public class Blacklist {
 						continue;
 					}
-					String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
+					String p = (!safePath.isEmpty() && safePath.charAt(0) == '/') ? safePath.substring(1) : safePath;
 					final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
 					// avoid PatternSyntaxException e
@ -376,7 +380,7 @@ public class Blacklist {
 					Set<Pattern> hostList;
 					if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
-						blacklistMap.put(h, (hostList = new HashSet<Pattern>()));
+						blacklistMap.put(h, (hostList = new HashSet<>()));
 					}
 					Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
@ -438,6 +442,7 @@ public class Blacklist {
        }
        String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
        p = MultiProtocolURL.escapePathPattern(p);
        // avoid PatternSyntaxException e
        String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
--- a/source/net/yacy/repository/BlacklistHelper.java
+++ b/source/net/yacy/repository/BlacklistHelper.java
@ -29,7 +29,7 @@ public final class BlacklistHelper {
 	 * @param entry a blacklist entry. Must not be null.
 	 * @return the entry eventually modified to be ready to use by the Blacklist engine
 	 */
-	protected static String prepareEntry(final String entry) {
+	public static String prepareEntry(final String entry) {
 		String newEntry = entry;
    	/* Remove the eventual unnecessary Regex line beginning char '^' and URL scheme (protocol) part */
    	Matcher schemeMatcher = URL_SCHEME_PATTERN.matcher(newEntry);
--- a/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java
+++ b/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java
@ -363,6 +363,69 @@ public class MultiProtocolURLTest {
 		}
 	}
 	/**
 	 * Unit tests for {@link MultiProtocolURL#escapePath(String)} : each sample
 	 * path must be encoded to the expected ASCII-only string.
 	 */
 	@Test
 	public void testEscapePath() {
 		/* Each row holds : { path to escape, expected escaped result } */
 		final String[][] samples = {
 				{ "", "" },
 				{ "/", "/" },
 				{ "/ascii/path", "/ascii/path" },
 				{ "/latin/chars/àäâéèïîôöù",
 						"/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9" },
 				{ "/with%char", "/with%25char" },
 				{ "/wiki/%", "/wiki/%25" },
 				{ "/already/percent-encoded/%C3%9f", "/already/percent-encoded/%C3%9F" },
 				{ "/logograms/正體字/繁體字",
 						"/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97" },
 				{ "/rfc3986/unreserved/path/chars/-._~", "/rfc3986/unreserved/path/chars/-._~" },
 				{ "/rfc3986/subdelims/!$&'()*+,;=", "/rfc3986/subdelims/!$&'()*+,;=" },
 				{ "/rfc3986/pchar/additional/:@", "/rfc3986/pchar/additional/:@" },
 				{ "/regex/metacharacters/<([{\\^-=$!|]})?*+.>",
 						"/regex/metacharacters/%3C(%5B%7B%5C%5E-=$!%7C%5D%7D)%3F*+.%3E" } };
 		for (final String[] sample : samples) {
 			final String expected = sample[1];
 			final String actual = MultiProtocolURL.escapePath(sample[0]);
 			assertTrue("Encoded string contains only ascii chars",
 					StandardCharsets.US_ASCII.newEncoder().canEncode(actual));
 			assertEquals(expected, actual);
 		}
 	}
 	/**
 	 * Unit tests for {@link MultiProtocolURL#escapePathPattern(String)} : each
 	 * sample pattern must be encoded to the expected ASCII-only string, with
 	 * regular expression constructs left untouched.
 	 */
 	@Test
 	public void testEscapePathPattern() {
 		/* Each row holds : { pattern to escape, expected escaped result } */
 		final String[][] samples = {
 				{ "", "" },
 				{ "/", "/" },
 				{ "/ascii/path", "/ascii/path" },
 				{ "/latin/chars/àäâéèïîôöù",
 						"/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9" },
 				{ "/with%char", "/with%25char" },
 				{ "/wiki/%", "/wiki/%25" },
 				{ "/already/percent-encoded/%C3%9f", "/already/percent-encoded/%C3%9F" },
 				{ "/logograms/正體字/繁體字",
 						"/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97" },
 				{ "/rfc3986/unreserved/path/chars/-._~", "/rfc3986/unreserved/path/chars/-._~" },
 				{ "/rfc3986/subdelims/!$&'()*+,;=", "/rfc3986/subdelims/!$&'()*+,;=" },
 				{ "/rfc3986/pchar/additional/:@", "/rfc3986/pchar/additional/:@" },
 				{ "/regex/metacharacters/<([{\\^-=$!|]})?*+.>",
 						"/regex/metacharacters/<([{\\^-=$!|]})?*+.>" },
 				{ "/regex/char/classes/[abc]/[^abc]/[a-zA-Z]/[a-d[m-p]]/[a-z&&[def]]/[a-z&&[^bc]]/[a-z&&[^m-p]]",
 						"/regex/char/classes/[abc]/[^abc]/[a-zA-Z]/[a-d[m-p]]/[a-z&&[def]]/[a-z&&[^bc]]/[a-z&&[^m-p]]" },
 				{ "/regex/predefined/char/class/.\\d\\D\\h\\H\\s\\S\\v\\V\\w\\W",
 						"/regex/predefined/char/class/.\\d\\D\\h\\H\\s\\S\\v\\V\\w\\W" },
 				{ "/regex/boundary/matchers/^$\\b\\B\\A\\G\\Z\\z",
 						"/regex/boundary/matchers/^$\\b\\B\\A\\G\\Z\\z" } };
 		for (final String[] sample : samples) {
 			final String expected = sample[1];
 			final String actual = MultiProtocolURL.escapePathPattern(sample[0]);
 			assertTrue("Encoded string contains only ascii chars",
 					StandardCharsets.US_ASCII.newEncoder().canEncode(actual));
 			assertEquals(expected, actual);
 		}
 	}
 	/**
 	 * Unit tests for {@link MultiProtocolURL#unescape(String)}
 	 */