diff --git a/htroot/Blacklist_p.java b/htroot/Blacklist_p.java index 2f2c35cec..0bb4d9990 100644 --- a/htroot/Blacklist_p.java +++ b/htroot/Blacklist_p.java @@ -37,6 +37,9 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; +import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.document.id.Punycode; +import net.yacy.cora.document.id.Punycode.PunycodeException; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.ListManager; @@ -313,7 +316,10 @@ public class Blacklist_p { for (final Entry selectedEntry : selectedBlacklistEntries.entrySet()) { final String editedEntryValue = editedBlacklistEntries.get(selectedEntry.getKey().replace("selectedBlacklistEntry.", "editedBlacklistEntry.")); - if (!selectedEntry.getValue().equals(editedEntryValue)) { + + final String preparedNewEntry = prepareNormalizedEntry(editedEntryValue); + + if (!normalizeEntry(selectedEntry.getValue()).equals(preparedNewEntry)) { /* Add first, to detect any eventual syntax errors before removing the old entry */ if (!BlacklistHelper.addBlacklistEntry(blacklistToUse, editedEntryValue, header)) { @@ -540,4 +546,33 @@ public class Blacklist_p { return prop; } + /** + * @param entry a blacklist entry. Must not be null. + * @return a prepared and normalized entry as done internally in + * BlacklistHelper.addBlacklistEntry() + */ + private static String prepareNormalizedEntry(final String entry) { + return normalizeEntry(BlacklistHelper.prepareEntry(entry)); + } + + /** + * @param entry a blacklist entry. Must not be null. + * @return a normalized entry (punycode encoded host and percent-encoded path) + * as done internally in BlacklistHelper.addBlacklistEntry() + */ + private static String normalizeEntry(final String entry) { + final int slashPos = entry.indexOf('/', 0); + String host = entry.substring(0, slashPos); + try { + host = Punycode.isBasic(host) ? 
host : MultiProtocolURL.toPunycode(host); + } catch (final PunycodeException ignored) { + /* + * Punycode encoding error will be handled in + * BlacklistHelper.addBlacklistEntry() + */ + } + String path = MultiProtocolURL.escapePathPattern(entry.substring(slashPos + 1)); + return host + "/" + path; + } + } diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index d681b7f21..bf2f083fa 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -81,8 +81,22 @@ public class MultiProtocolURL implements Serializable, ComparableRegular + * expressions string literals documentation + */ + private static final BitSet PATTERN_METACHARACTERS = new BitSet(128); + static { // unreserved characters (chars not to escape in url) for (int i = 'A'; i <= 'Z'; i++) { // hialpha RFC1738 Section 5 @@ -119,6 +133,27 @@ public class MultiProtocolURL implements Serializable, Comparable'); } // session id handling @@ -552,45 +587,135 @@ public class MultiProtocolURL implements Serializable, Comparable */ private void escape() { - if (this.path != null && this.path.indexOf('%') == -1) escapePath(); + if (this.path != null && this.path.indexOf('%') == -1) { + this.path = escapePath(this.path); + } if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart(); if (this.anchor != null) this.anchor = escape(this.anchor).toString(); } - /** - * Url encode/escape the path part according to the allowed characters - * (RFC1738 & RFC2396) - * uses UTF-8 character codes for non-ASCII - */ - private void escapePath() { - final StringBuilder ptmp = new StringBuilder(this.path.length() + 10); - boolean modified = false; - final int len = this.path.length(); - for (int i = 0; i < len; i++) { - int ch = this.path.charAt(i); - if (ch <= 0x7F) { - if (UNRESERVED_PATH.get(ch)) { - ptmp.append((char) ch); - } else { - ptmp.append(hex[ch]); - 
modified = true; - } - } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF - ptmp.append(hex[0xc0 | (ch >> 6)]); - ptmp.append(hex[0x80 | (ch & 0x3F)]); - modified = true; - } else { // 0x7FF < ch <= 0xFFFF - ptmp.append(hex[0xe0 | (ch >> 12)]); - ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]); - ptmp.append(hex[0x80 | (ch & 0x3F)]); - modified = true; - } - } - if (modified) { - this.path = ptmp.toString(); - } + /** + *

Percent-encode/escape a URL path part according to the allowed characters + * (see RFC3986, and formerly RFC1738 & RFC2396). Uses UTF-8 character codes for + * non-ASCII.

+ *

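Important: already percent-encoded characters (a '%' followed by two hexadecimal digits) are not re-encoded; their hexadecimal digits are only normalized to upper case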

+ * + * @param pathToEscape the path part to escape. + * @return an escaped path with only ASCII characters, or null when pathToEscape + * is null. + * @see RFC3986 + * percent-encoding section + * @see RFC3986 path + * definition + */ + public static String escapePath(final String pathToEscape) { + return escapePath(pathToEscape, false); } + + /** + *

Percent-encode/escape a URL path regular expression according to the allowed + * characters in a URL path (see RFC3986) and in {@link Pattern} regular + * expressions. Uses UTF-8 character codes for non-ASCII.

+ *

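Important: already percent-encoded characters (a '%' followed by two hexadecimal digits) are not re-encoded; their hexadecimal digits are only normalized to upper case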

+ * + * @param pathPattern the URL path regular expression to escape. + * @return an escaped path regular expression with only allowed ASCII + * characters, or null when pathPattern is null. + * @see RFC3986 + * percent-encoding section + * @see RFC3986 path + * definition + */ + public static String escapePathPattern(final String pathPattern) { + return escapePath(pathPattern, true); + } + + /** + *

+ * Percent-encode/escape a URL path part according to the allowed characters + * specified in RFC3986 (formerly RFC1738 and RFC2396). Uses UTF-8 character + * codes for non-ASCII. + *

+ *

+ * When isPattern is true, the string is processed as a regular expression, and + * therefore meta-characters used by the {@link Pattern} class are not + * percent-encoded. + *

+ * + * @param pathToEscape the path part to escape. + * @param isPattern when true, regular meta-characters are not escaped + * @return an escaped path regular expression with only allowed ASCII + * characters, or null when pathPattern is null. + * @see RFC3986 + * percent-encoding section + * @see RFC3986 path + * definition + */ + private static String escapePath(final String pathToEscape, final boolean isPattern) { + if (pathToEscape == null) { + return pathToEscape; + } + final StringBuilder ptmp = new StringBuilder(pathToEscape.length() + 10); + boolean modified = false; + final int len = pathToEscape.length(); + int i = 0; + while (i < len) { + int ch = pathToEscape.charAt(i); + if (ch == '%' && (i + 2) < len) { + final char digit1 = pathToEscape.charAt(i + 1); + final char digit2 = pathToEscape.charAt(i + 2); + if (isHexDigit(digit1) && isHexDigit(digit2)) { + /* Already percent-encoded character */ + ptmp.append((char) ch); + /* Normalize hexadecimal digits to upper case */ + if (Character.isLowerCase(digit1) || Character.isLowerCase(digit2)) { + modified = true; + } + ptmp.append(Character.toUpperCase(digit1)); + ptmp.append(Character.toUpperCase(digit2)); + i += 2; + } else { + /* Not a valid percent-encoded character : we encode it now */ + ptmp.append(hex[ch]); + modified = true; + } + } else if (isPattern && PATTERN_METACHARACTERS.get(ch)) { + ptmp.append((char) ch); + } else if (ch <= 0x7F) { + if (UNRESERVED_PATH.get(ch)) { + ptmp.append((char) ch); + } else { + ptmp.append(hex[ch]); + modified = true; + } + } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF + ptmp.append(hex[0xc0 | (ch >> 6)]); + ptmp.append(hex[0x80 | (ch & 0x3F)]); + modified = true; + } else { // 0x7FF < ch <= 0xFFFF + ptmp.append(hex[0xe0 | (ch >> 12)]); + ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]); + ptmp.append(hex[0x80 | (ch & 0x3F)]); + modified = true; + } + i++; + } + if (modified) { + return ptmp.toString(); + } + return pathToEscape; + } + + /** + * @param character a 
character to test + * @return true when the character is a valid hexadecimal digit + */ + private static boolean isHexDigit(final int character) { + return (character >= '0' && character <= '9') || (character >= 'a' && character <= 'f') + || (character >= 'A' && character <= 'F'); + } + private void escapeSearchpart() { final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10); for (final Map.Entry element: getAttributes().entrySet()) { diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index d7c796044..27b197a47 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -236,7 +236,10 @@ public class Blacklist { log.warn("ignored blacklist path to prevent 'Dangling meta character' exception: " + a); continue; } - loadedPathsPattern.add(Pattern.compile(a, Pattern.CASE_INSENSITIVE)); // add case insesitive regex + /* We ensure now that any necessary percent-encoding is applied, as the blacklist file may have been manually edited. + * (when using the web interface, encoding should already have been applied in the add() function) */ + final String normalizedPattern = MultiProtocolURL.escapePathPattern(a); + loadedPathsPattern.add(Pattern.compile(normalizedPattern, Pattern.CASE_INSENSITIVE)); // add case insesitive regex } // create new entry if host mask unknown, otherwise merge @@ -348,8 +351,9 @@ public class Blacklist { final String host = itemToAdd.getHost(); final String path = itemToAdd.getPath(); final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host); + final String safePath = MultiProtocolURL.escapePathPattern(path); - if (contains(blacklistType, safeHost, path)) { + if (contains(blacklistType, safeHost, safePath)) { /* Continue to the next item */ continue; } @@ -364,7 +368,7 @@ public class Blacklist { continue; } - String p = (!path.isEmpty() && path.charAt(0) == '/') ? 
path.substring(1) : path; + String p = (!safePath.isEmpty() && safePath.charAt(0) == '/') ? safePath.substring(1) : safePath; final Map> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host)); // avoid PatternSyntaxException e @@ -376,7 +380,7 @@ public class Blacklist { Set hostList; if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) { - blacklistMap.put(h, (hostList = new HashSet())); + blacklistMap.put(h, (hostList = new HashSet<>())); } Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE); @@ -438,6 +442,7 @@ public class Blacklist { } String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path; + p = MultiProtocolURL.escapePathPattern(p); // avoid PatternSyntaxException e String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT); diff --git a/source/net/yacy/repository/BlacklistHelper.java b/source/net/yacy/repository/BlacklistHelper.java index 393cda868..fa5a15cef 100644 --- a/source/net/yacy/repository/BlacklistHelper.java +++ b/source/net/yacy/repository/BlacklistHelper.java @@ -29,7 +29,7 @@ public final class BlacklistHelper { * @param entry a blacklist entry. Must not be null. 
* @return the entry eventually modified to be ready to use by the Blacklist engine */ - protected static String prepareEntry(final String entry) { + public static String prepareEntry(final String entry) { String newEntry = entry; /* Remove the eventual unnecessary Regex line beginning char '^' and URL scheme (protocol) part */ Matcher schemeMatcher = URL_SCHEME_PATTERN.matcher(newEntry); diff --git a/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java b/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java index 64f7d49c8..370279269 100644 --- a/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java +++ b/test/java/net/yacy/cora/document/id/MultiProtocolURLTest.java @@ -363,6 +363,69 @@ public class MultiProtocolURLTest { } } + /** + * Unit tests for {@link MultiProtocolURL#escapePath(String)} + */ + @Test + public void testEscapePath() { + String[][] testStrings = new String[][] { + // "test string" , "expected escaped result" + new String[] { "", "" }, new String[] { "/", "/" }, new String[] { "/ascii/path", "/ascii/path" }, + new String[] { "/latin/chars/àäâéèïîôöù", + "/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9" }, + new String[] { "/with%char", "/with%25char" }, new String[] { "/wiki/%", "/wiki/%25" }, + new String[] { "/already/percent-encoded/%C3%9f", "/already/percent-encoded/%C3%9F" }, + new String[] { "/logograms/正體字/繁體字", + "/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97" }, + new String[] { "/rfc3986/unreserved/path/chars/-._~", "/rfc3986/unreserved/path/chars/-._~" }, + new String[] { "/rfc3986/subdelims/!$&'()*+,;=", "/rfc3986/subdelims/!$&'()*+,;=" }, + new String[] { "/rfc3986/pchar/additional/:@", "/rfc3986/pchar/additional/:@" }, + new String[] { "/regex/metacharacters/<([{\\^-=$!|]})?*+.>", + "/regex/metacharacters/%3C(%5B%7B%5C%5E-=$!%7C%5D%7D)%3F*+.%3E" } }; + for (int i = 0; i < testStrings.length; i++) { + String[] testString = testStrings[i]; + final String encoded = 
MultiProtocolURL.escapePath(testString[0]); + assertTrue("Encoded string contains only ascii chars", + StandardCharsets.US_ASCII.newEncoder().canEncode(encoded)); + assertEquals(testString[1], encoded); + } + } + + /** + * Unit tests for {@link MultiProtocolURL#escapePathPattern(String)} + */ + @Test + public void testEscapePathPattern() { + String[][] testStrings = new String[][] { + // "test string" , "expected escaped result" + new String[] { "", "" }, new String[] { "/", "/" }, new String[] { "/ascii/path", "/ascii/path" }, + new String[] { "/latin/chars/àäâéèïîôöù", + "/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9" }, + new String[] { "/with%char", "/with%25char" }, new String[] { "/wiki/%", "/wiki/%25" }, + new String[] { "/already/percent-encoded/%C3%9f", "/already/percent-encoded/%C3%9F" }, + new String[] { "/logograms/正體字/繁體字", + "/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97" }, + new String[] { "/rfc3986/unreserved/path/chars/-._~", "/rfc3986/unreserved/path/chars/-._~" }, + new String[] { "/rfc3986/subdelims/!$&'()*+,;=", "/rfc3986/subdelims/!$&'()*+,;=" }, + new String[] { "/rfc3986/pchar/additional/:@", "/rfc3986/pchar/additional/:@" }, + new String[] { "/regex/metacharacters/<([{\\^-=$!|]})?*+.>", + "/regex/metacharacters/<([{\\^-=$!|]})?*+.>" }, + new String[] { + "/regex/char/classes/[abc]/[^abc]/[a-zA-Z]/[a-d[m-p]]/[a-z&&[def]]/[a-z&&[^bc]]/[a-z&&[^m-p]]", + "/regex/char/classes/[abc]/[^abc]/[a-zA-Z]/[a-d[m-p]]/[a-z&&[def]]/[a-z&&[^bc]]/[a-z&&[^m-p]]" }, + new String[] { "/regex/predefined/char/class/.\\d\\D\\h\\H\\s\\S\\v\\V\\w\\W", + "/regex/predefined/char/class/.\\d\\D\\h\\H\\s\\S\\v\\V\\w\\W" }, + new String[] { "/regex/boundary/matchers/^$\\b\\B\\A\\G\\Z\\z", + "/regex/boundary/matchers/^$\\b\\B\\A\\G\\Z\\z" } }; + for (int i = 0; i < testStrings.length; i++) { + String[] testString = testStrings[i]; + final String encoded = MultiProtocolURL.escapePathPattern(testString[0]); + 
assertTrue("Encoded string contains only ascii chars", + StandardCharsets.US_ASCII.newEncoder().canEncode(encoded)); + assertEquals(testString[1], encoded); + } + } + /** * Unit tests for {@link MultiProtocolURL#unescape(String)} */