Improved normalization of blacklist path patterns containing non-ASCII characters

Normalize blacklist path patterns using percent-encoding, both when a pattern
is edited in the web interface and when patterns are loaded from configuration files.

Fixes issue #237
pull/250/head
luccioman 6 years ago
parent d42f079c2d
commit ed93221fa1

@ -37,6 +37,9 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.document.id.Punycode;
import net.yacy.cora.document.id.Punycode.PunycodeException;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.ListManager; import net.yacy.data.ListManager;
@ -313,7 +316,10 @@ public class Blacklist_p {
for (final Entry<String, String> selectedEntry : selectedBlacklistEntries.entrySet()) { for (final Entry<String, String> selectedEntry : selectedBlacklistEntries.entrySet()) {
final String editedEntryValue = editedBlacklistEntries.get(selectedEntry.getKey().replace("selectedBlacklistEntry.", "editedBlacklistEntry.")); final String editedEntryValue = editedBlacklistEntries.get(selectedEntry.getKey().replace("selectedBlacklistEntry.", "editedBlacklistEntry."));
if (!selectedEntry.getValue().equals(editedEntryValue)) {
final String preparedNewEntry = prepareNormalizedEntry(editedEntryValue);
if (!normalizeEntry(selectedEntry.getValue()).equals(preparedNewEntry)) {
/* Add first, to detect any eventual syntax errors before removing the old entry */ /* Add first, to detect any eventual syntax errors before removing the old entry */
if (!BlacklistHelper.addBlacklistEntry(blacklistToUse, editedEntryValue, header)) { if (!BlacklistHelper.addBlacklistEntry(blacklistToUse, editedEntryValue, header)) {
@ -540,4 +546,33 @@ public class Blacklist_p {
return prop; return prop;
} }
/**
 * Runs an entry through {@link BlacklistHelper#prepareEntry(String)} and then
 * through {@link #normalizeEntry(String)}.
 *
 * @param entry a blacklist entry. Must not be null.
 * @return the prepared, then normalized entry, as done internally in
 *         BlacklistHelper.addBlacklistEntry()
 */
private static String prepareNormalizedEntry(final String entry) {
    final String preparedEntry = BlacklistHelper.prepareEntry(entry);
    return normalizeEntry(preparedEntry);
}
/**
 * Normalizes a blacklist entry: the host part (everything before the first
 * '/') is punycode encoded when it is not made of basic characters only, and
 * the path pattern (everything after the first '/') is percent-encoded with
 * {@link MultiProtocolURL#escapePathPattern(String)}.
 *
 * @param entry a blacklist entry. Must not be null.
 * @return a normalized entry (punycode encoded host and percent-encoded path)
 *         as done internally in BlacklistHelper.addBlacklistEntry(). When the
 *         entry contains no '/' separator, the whole entry is handled as a
 *         host and returned without any path part, instead of failing with a
 *         StringIndexOutOfBoundsException.
 */
private static String normalizeEntry(final String entry) {
    final int slashPos = entry.indexOf('/');
    /* indexOf() returns -1 when there is no separator : substring(0, -1) would then throw */
    final boolean hasPath = slashPos >= 0;
    String host = hasPath ? entry.substring(0, slashPos) : entry;
    try {
        if (!Punycode.isBasic(host)) {
            host = MultiProtocolURL.toPunycode(host);
        }
    } catch (final PunycodeException ignored) {
        /*
         * Punycode encoding error will be handled in
         * BlacklistHelper.addBlacklistEntry() : the host is kept as is here
         */
    }
    if (!hasPath) {
        return host;
    }
    final String path = MultiProtocolURL.escapePathPattern(entry.substring(slashPos + 1));
    return host + "/" + path;
}
} }

@ -81,8 +81,22 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?"); private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?");
//private static final Pattern patternSpace = Pattern.compile("%20"); //private static final Pattern patternSpace = Pattern.compile("%20");
private final static BitSet UNRESERVED_RFC1738 = new BitSet(128); // register unreserved chars (never escaped in url) /** Register unreserved chars (never escaped in url) */
private final static BitSet UNRESERVED_PATH = new BitSet(128); // register unreserved chars for path part (not escaped in path) private final static BitSet UNRESERVED_RFC1738 = new BitSet(128);
/** Register unreserved chars for path part (not escaped in path) */
private final static BitSet UNRESERVED_PATH = new BitSet(128);
/**
* Register regular expressions metacharacters used by the {@link Pattern}
* class.
*
* @see <a href=
* "https://docs.oracle.com/javase/tutorial/essential/regex/literals.html">Regular
* expressions string literals documentation</a>
*/
private static final BitSet PATTERN_METACHARACTERS = new BitSet(128);
static { static {
// unreserved characters (chars not to escape in url) // unreserved characters (chars not to escape in url)
for (int i = 'A'; i <= 'Z'; i++) { // hialpha RFC1738 Section 5 for (int i = 'A'; i <= 'Z'; i++) { // hialpha RFC1738 Section 5
@ -119,6 +133,27 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
UNRESERVED_PATH.set('@'); UNRESERVED_PATH.set('@');
UNRESERVED_PATH.set('&'); UNRESERVED_PATH.set('&');
UNRESERVED_PATH.set('='); UNRESERVED_PATH.set('=');
/* Pattern metacharacters : <([{\^-=$!|]})?*+.> */
PATTERN_METACHARACTERS.set('<');
PATTERN_METACHARACTERS.set('(');
PATTERN_METACHARACTERS.set('[');
PATTERN_METACHARACTERS.set('{');
PATTERN_METACHARACTERS.set('\\');
PATTERN_METACHARACTERS.set('^');
PATTERN_METACHARACTERS.set('-');
PATTERN_METACHARACTERS.set('=');
PATTERN_METACHARACTERS.set('$');
PATTERN_METACHARACTERS.set('!');
PATTERN_METACHARACTERS.set('|');
PATTERN_METACHARACTERS.set(']');
PATTERN_METACHARACTERS.set('}');
PATTERN_METACHARACTERS.set(')');
PATTERN_METACHARACTERS.set('?');
PATTERN_METACHARACTERS.set('*');
PATTERN_METACHARACTERS.set('+');
PATTERN_METACHARACTERS.set('.');
PATTERN_METACHARACTERS.set('>');
} }
// session id handling // session id handling
@ -552,45 +587,135 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
* </ul> * </ul>
*/ */
private void escape() { private void escape() {
if (this.path != null && this.path.indexOf('%') == -1) escapePath(); if (this.path != null && this.path.indexOf('%') == -1) {
this.path = escapePath(this.path);
}
if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart(); if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
if (this.anchor != null) this.anchor = escape(this.anchor).toString(); if (this.anchor != null) this.anchor = escape(this.anchor).toString();
} }
/** /**
* Url encode/escape the path part according to the allowed characters * <p>Percent-encode/escape an URL path part according to the allowed characters
* (RFC1738 & RFC2396) * (see RFC3986, and formerly RFC1738 & RFC2396). Uses UTF-8 character codes for
* uses UTF-8 character codes for non-ASCII * non-ASCII.</p>
*/ * <p>Important : already percent-encoded characters are not re-encoded</p>
private void escapePath() { *
final StringBuilder ptmp = new StringBuilder(this.path.length() + 10); * @param pathToEscape the path part to escape.
boolean modified = false; * @return an escaped path with only ASCII characters, or null when pathToEscape
final int len = this.path.length(); * is null.
for (int i = 0; i < len; i++) { * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
int ch = this.path.charAt(i); * percent-encoding section</a>
if (ch <= 0x7F) { * @see <z href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
if (UNRESERVED_PATH.get(ch)) { * definition</a>
ptmp.append((char) ch); */
} else { public static String escapePath(final String pathToEscape) {
ptmp.append(hex[ch]); return escapePath(pathToEscape, false);
modified = true;
}
} else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
ptmp.append(hex[0xc0 | (ch >> 6)]);
ptmp.append(hex[0x80 | (ch & 0x3F)]);
modified = true;
} else { // 0x7FF < ch <= 0xFFFF
ptmp.append(hex[0xe0 | (ch >> 12)]);
ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
ptmp.append(hex[0x80 | (ch & 0x3F)]);
modified = true;
}
}
if (modified) {
this.path = ptmp.toString();
}
} }
/**
 * <p>Percent-encode/escape a URL path regular expression, keeping both the
 * characters allowed in a URL path (see RFC3986) and the meta-characters of
 * {@link Pattern} regular expressions. Non-ASCII characters are encoded from
 * their UTF-8 character codes.</p>
 * <p>Important : already percent-encoded characters are not re-encoded</p>
 *
 * @param pathPattern the URL path regular expression to escape.
 * @return an escaped path regular expression with only allowed ASCII
 *         characters, or null when pathPattern is null.
 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 *      percent-encoding section</a>
 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 *      definition</a>
 */
public static String escapePathPattern(final String pathPattern) {
    /* Pattern mode : regular expression meta-characters are left untouched */
    final boolean keepPatternMetacharacters = true;
    return escapePath(pathPattern, keepPatternMetacharacters);
}
/**
 * <p>
 * Percent-encode/escape a URL path part according to the allowed characters
 * specified in RFC3986 (formerly RFC1738 and RFC2396). Non-ASCII characters
 * are encoded from their UTF-8 byte sequence; a valid surrogate pair is read
 * as a single supplementary code point and encoded as one 4-byte sequence.
 * </p>
 * <p>
 * When isPattern is true, the string is processed as a regular expression, and
 * therefore meta-characters used by the {@link Pattern} class are not
 * percent-encoded.
 * </p>
 * <p>
 * Already percent-encoded characters ('%' followed by two hexadecimal digits)
 * are not re-encoded; their hexadecimal digits are only normalized to upper
 * case. Any other '%' is itself percent-encoded.
 * </p>
 *
 * @param pathToEscape the path part to escape.
 * @param isPattern    when true, regular expression meta-characters are not
 *                     escaped
 * @return an escaped path with only allowed ASCII characters (the same
 *         instance as pathToEscape when nothing had to be changed), or null
 *         when pathToEscape is null.
 * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986
 *      percent-encoding section</a>
 * @see <a href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path
 *      definition</a>
 */
private static String escapePath(final String pathToEscape, final boolean isPattern) {
    if (pathToEscape == null) {
        return pathToEscape;
    }
    final int len = pathToEscape.length();
    final StringBuilder ptmp = new StringBuilder(len + 10);
    boolean modified = false;
    int i = 0;
    while (i < len) {
        /*
         * Read a whole code point : a valid surrogate pair must be encoded as one
         * 4-byte UTF-8 sequence, not as two separately encoded surrogates
         */
        final int ch = pathToEscape.codePointAt(i);
        final int charCount = Character.charCount(ch);
        if (ch == '%' && (i + 2) < len) {
            final char digit1 = pathToEscape.charAt(i + 1);
            final char digit2 = pathToEscape.charAt(i + 2);
            if (isHexDigit(digit1) && isHexDigit(digit2)) {
                /* Already percent-encoded character */
                ptmp.append((char) ch);
                /* Normalize hexadecimal digits to upper case */
                if (Character.isLowerCase(digit1) || Character.isLowerCase(digit2)) {
                    modified = true;
                }
                ptmp.append(Character.toUpperCase(digit1));
                ptmp.append(Character.toUpperCase(digit2));
                /* Skip the two digits : the '%' itself is skipped at the end of the loop */
                i += 2;
            } else {
                /* Not a valid percent-encoded character : we encode it now */
                ptmp.append(hex[ch]);
                modified = true;
            }
        } else if (isPattern && PATTERN_METACHARACTERS.get(ch)) {
            ptmp.append((char) ch);
        } else if (ch <= 0x7F) {
            if (UNRESERVED_PATH.get(ch)) {
                ptmp.append((char) ch);
            } else {
                ptmp.append(hex[ch]);
                modified = true;
            }
        } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF : 2 bytes
            ptmp.append(hex[0xc0 | (ch >> 6)]);
            ptmp.append(hex[0x80 | (ch & 0x3F)]);
            modified = true;
        } else if (ch <= 0xFFFF) { // 0x7FF < ch <= 0xFFFF : 3 bytes (includes unpaired surrogates, as before)
            ptmp.append(hex[0xe0 | (ch >> 12)]);
            ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
            ptmp.append(hex[0x80 | (ch & 0x3F)]);
            modified = true;
        } else { // supplementary code point, 0xFFFF < ch <= 0x10FFFF : 4 bytes
            /* NOTE(review): assumes the hex table holds all 256 byte values - confirm */
            ptmp.append(hex[0xf0 | (ch >> 18)]);
            ptmp.append(hex[0x80 | ((ch >> 12) & 0x3F)]);
            ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
            ptmp.append(hex[0x80 | (ch & 0x3F)]);
            modified = true;
        }
        i += charCount;
    }
    if (modified) {
        return ptmp.toString();
    }
    return pathToEscape;
}
/**
 * Tells whether a character code is an ASCII hexadecimal digit
 * ('0' to '9', 'a' to 'f' or 'A' to 'F').
 *
 * @param character a character to test
 * @return true when the character is a valid hexadecimal digit
 */
private static boolean isHexDigit(final int character) {
    if (character < '0') {
        return false;
    }
    if (character <= '9') {
        return true;
    }
    if (character >= 'A' && character <= 'F') {
        return true;
    }
    return character >= 'a' && character <= 'f';
}
private void escapeSearchpart() { private void escapeSearchpart() {
final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10); final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
for (final Map.Entry<String, String> element: getAttributes().entrySet()) { for (final Map.Entry<String, String> element: getAttributes().entrySet()) {

@ -236,7 +236,10 @@ public class Blacklist {
log.warn("ignored blacklist path to prevent 'Dangling meta character' exception: " + a); log.warn("ignored blacklist path to prevent 'Dangling meta character' exception: " + a);
continue; continue;
} }
loadedPathsPattern.add(Pattern.compile(a, Pattern.CASE_INSENSITIVE)); // add case insesitive regex /* We ensure now that any necessary percent-encoding is applied, as the blacklist file may have been manually edited.
* (when using the web interface, encoding should already have been applied in the add() function) */
final String normalizedPattern = MultiProtocolURL.escapePathPattern(a);
loadedPathsPattern.add(Pattern.compile(normalizedPattern, Pattern.CASE_INSENSITIVE)); // add case insesitive regex
} }
// create new entry if host mask unknown, otherwise merge // create new entry if host mask unknown, otherwise merge
@ -348,8 +351,9 @@ public class Blacklist {
final String host = itemToAdd.getHost(); final String host = itemToAdd.getHost();
final String path = itemToAdd.getPath(); final String path = itemToAdd.getPath();
final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host); final String safeHost = Punycode.isBasic(host) ? host : MultiProtocolURL.toPunycode(host);
final String safePath = MultiProtocolURL.escapePathPattern(path);
if (contains(blacklistType, safeHost, path)) { if (contains(blacklistType, safeHost, safePath)) {
/* Continue to the next item */ /* Continue to the next item */
continue; continue;
} }
@ -364,7 +368,7 @@ public class Blacklist {
continue; continue;
} }
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path; String p = (!safePath.isEmpty() && safePath.charAt(0) == '/') ? safePath.substring(1) : safePath;
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host)); final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
// avoid PatternSyntaxException e // avoid PatternSyntaxException e
@ -376,7 +380,7 @@ public class Blacklist {
Set<Pattern> hostList; Set<Pattern> hostList;
if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) { if (!(blacklistMap.containsKey(h) && ((hostList = blacklistMap.get(h)) != null))) {
blacklistMap.put(h, (hostList = new HashSet<Pattern>())); blacklistMap.put(h, (hostList = new HashSet<>()));
} }
Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE); Pattern pattern = Pattern.compile(p, Pattern.CASE_INSENSITIVE);
@ -438,6 +442,7 @@ public class Blacklist {
} }
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path; String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
p = MultiProtocolURL.escapePathPattern(p);
// avoid PatternSyntaxException e // avoid PatternSyntaxException e
String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT); String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);

@ -29,7 +29,7 @@ public final class BlacklistHelper {
* @param entry a blacklist entry. Must not be null. * @param entry a blacklist entry. Must not be null.
* @return the entry eventually modified to be ready to use by the Blacklist engine * @return the entry eventually modified to be ready to use by the Blacklist engine
*/ */
protected static String prepareEntry(final String entry) { public static String prepareEntry(final String entry) {
String newEntry = entry; String newEntry = entry;
/* Remove the eventual unnecessary Regex line beginning char '^' and URL scheme (protocol) part */ /* Remove the eventual unnecessary Regex line beginning char '^' and URL scheme (protocol) part */
Matcher schemeMatcher = URL_SCHEME_PATTERN.matcher(newEntry); Matcher schemeMatcher = URL_SCHEME_PATTERN.matcher(newEntry);

@ -363,6 +363,69 @@ public class MultiProtocolURLTest {
} }
} }
/**
 * Unit tests for {@link MultiProtocolURL#escapePath(String)} : each case pairs
 * a raw path with its expected percent-encoded form.
 */
@Test
public void testEscapePath() {
    final String[][] testCases = {
            // { "test string", "expected escaped result" }
            { "", "" },
            { "/", "/" },
            { "/ascii/path", "/ascii/path" },
            { "/latin/chars/àäâéèïîôöù",
                    "/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9" },
            { "/with%char", "/with%25char" },
            { "/wiki/%", "/wiki/%25" },
            { "/already/percent-encoded/%C3%9f", "/already/percent-encoded/%C3%9F" },
            { "/logograms/正體字/繁體字",
                    "/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97" },
            { "/rfc3986/unreserved/path/chars/-._~", "/rfc3986/unreserved/path/chars/-._~" },
            { "/rfc3986/subdelims/!$&'()*+,;=", "/rfc3986/subdelims/!$&'()*+,;=" },
            { "/rfc3986/pchar/additional/:@", "/rfc3986/pchar/additional/:@" },
            { "/regex/metacharacters/<([{\\^-=$!|]})?*+.>",
                    "/regex/metacharacters/%3C(%5B%7B%5C%5E-=$!%7C%5D%7D)%3F*+.%3E" } };
    for (final String[] testCase : testCases) {
        final String input = testCase[0];
        final String expected = testCase[1];
        final String encoded = MultiProtocolURL.escapePath(input);
        /* The escaped form must be pure ASCII, whatever the input was */
        assertTrue("Encoded string contains only ascii chars",
                StandardCharsets.US_ASCII.newEncoder().canEncode(encoded));
        assertEquals(expected, encoded);
    }
}
/**
 * Unit tests for {@link MultiProtocolURL#escapePathPattern(String)} : each case
 * pairs a raw path pattern with its expected percent-encoded form, regular
 * expression constructs being expected to stay unchanged.
 */
@Test
public void testEscapePathPattern() {
    final String[][] testCases = {
            // { "test string", "expected escaped result" }
            { "", "" },
            { "/", "/" },
            { "/ascii/path", "/ascii/path" },
            { "/latin/chars/àäâéèïîôöù",
                    "/latin/chars/%C3%A0%C3%A4%C3%A2%C3%A9%C3%A8%C3%AF%C3%AE%C3%B4%C3%B6%C3%B9" },
            { "/with%char", "/with%25char" },
            { "/wiki/%", "/wiki/%25" },
            { "/already/percent-encoded/%C3%9f", "/already/percent-encoded/%C3%9F" },
            { "/logograms/正體字/繁體字",
                    "/logograms/%E6%AD%A3%E9%AB%94%E5%AD%97/%E7%B9%81%E9%AB%94%E5%AD%97" },
            { "/rfc3986/unreserved/path/chars/-._~", "/rfc3986/unreserved/path/chars/-._~" },
            { "/rfc3986/subdelims/!$&'()*+,;=", "/rfc3986/subdelims/!$&'()*+,;=" },
            { "/rfc3986/pchar/additional/:@", "/rfc3986/pchar/additional/:@" },
            { "/regex/metacharacters/<([{\\^-=$!|]})?*+.>",
                    "/regex/metacharacters/<([{\\^-=$!|]})?*+.>" },
            { "/regex/char/classes/[abc]/[^abc]/[a-zA-Z]/[a-d[m-p]]/[a-z&&[def]]/[a-z&&[^bc]]/[a-z&&[^m-p]]",
                    "/regex/char/classes/[abc]/[^abc]/[a-zA-Z]/[a-d[m-p]]/[a-z&&[def]]/[a-z&&[^bc]]/[a-z&&[^m-p]]" },
            { "/regex/predefined/char/class/.\\d\\D\\h\\H\\s\\S\\v\\V\\w\\W",
                    "/regex/predefined/char/class/.\\d\\D\\h\\H\\s\\S\\v\\V\\w\\W" },
            { "/regex/boundary/matchers/^$\\b\\B\\A\\G\\Z\\z",
                    "/regex/boundary/matchers/^$\\b\\B\\A\\G\\Z\\z" } };
    for (final String[] testCase : testCases) {
        final String input = testCase[0];
        final String expected = testCase[1];
        final String encoded = MultiProtocolURL.escapePathPattern(input);
        /* The escaped form must be pure ASCII, whatever the input was */
        assertTrue("Encoded string contains only ascii chars",
                StandardCharsets.US_ASCII.newEncoder().canEncode(encoded));
        assertEquals(expected, encoded);
    }
}
/** /**
* Unit tests for {@link MultiProtocolURL#unescape(String)} * Unit tests for {@link MultiProtocolURL#unescape(String)}
*/ */

Loading…
Cancel
Save