fix for bad URL decoding

pull/1/head
Michael Peter Christen 11 years ago
parent 1092e798a5
commit f3a6b6e21e

@ -59,7 +59,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.parser.html.CharacterCoding;
/**
* MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@ -225,13 +224,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (h.startsWith("///")) { //absolute local file path
// no host given
this.path = h.substring(2); // "/path" or "/c:/path"
} else { // "//host/path" or "//host/c:/path"
} else if (h.startsWith("//")) { // "//host/path" or "//host/c:/path"
int q = url.indexOf('/', p + 3);
if (q < 0) {
this.path = "/";
} else {
this.path = url.substring(q);
}
} else if (h.startsWith("/")) { // "/host/path" or "/host/c:/path"
this.path = h;
}
this.userInfo = null;
this.port = -1;
@ -418,7 +419,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
private void escape() {
if (this.path != null && this.path.indexOf('%') == -1) escapePath();
if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
if (this.anchor != null && this.anchor.indexOf('%') == -1) escapeAnchor();
if (this.anchor != null) this.anchor = escape(this.anchor).toString();
}
private void escapePath() {
@ -431,10 +432,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
this.path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
}
private void escapeAnchor() {
this.anchor = escape(this.anchor).toString();
}
private void escapeSearchpart() {
final String[] questp = CommonPattern.AMP.split(this.searchpart, -1);
final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
@ -517,24 +514,39 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
final StringBuilder sbuf = new StringBuilder(len + 10);
for (int i = 0; i < len; i++) {
final int ch = s.charAt(i);
if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '%') {
if (i < len - 2 && s.charAt(i + 1) >= '0' && s.charAt(i + 1) <= '9' && s.charAt(i + 2) >= '0' && s.charAt(i + 2) <= '9') {
sbuf.append((char)ch); // lets consider this is used for encoding, leave it that way
} else {
sbuf.append("%23"); // RFC 1738 2.2 unsafe char shall be encoded
}
} else if (ch == '&') {
if (i < len - 6 && "amp;".equals(s.substring(i + 1, i + 5).toLowerCase())) {
sbuf.append((char)ch); // leave it that way, it is used the right way
} else {
sbuf.append("&amp;"); // this must be urlencoded
}
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
} else if (ch == '#') { // RFC 1738 2.2 unsafe char is _not_ encoded because it may already be used for encoding
sbuf.append((char)ch);
} else if (ch == ' ') { // space
sbuf.append("%20");
} else if (ch == '&' || ch == ':' // unreserved
} else if (ch == '!' || ch == ':' // unreserved
|| ch == '-' || ch == '_'
|| ch == '.' || ch == '!'
|| ch == '~' || ch == '*'
|| ch == '\'' || ch == '('
|| ch == ')' || ch == ';'
|| ch == ',' || ch == '=') { // RFC 1738 2.2 special char (may be used unencoded)
|| ch == '.' || ch == '~'
|| ch == '*' || ch == '\''
|| ch == '(' || ch == ')'
|| ch == '{' || ch == '}'
|| ch == ';' || ch == ',' || ch == '=') { // RFC 1738 2.2 unsafe char (may be used unencoded)
sbuf.append((char)ch);
} else if ('0' <= ch && ch <= '9') { // '0'..'9'
sbuf.append((char)ch);
} else if (ch == '/') { // reserved, but may appear in post part where it should not be replaced
sbuf.append((char)ch);
} else if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
sbuf.append((char)ch);
} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
sbuf.append((char)ch);
} else if (ch <= 0x007f) { // other ASCII
sbuf.append(hex[ch]);
} else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
@ -647,11 +659,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
} else {
this.searchpart = this.path.substring(r + 1);
// strip &amp;
/*
Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
while (matcher.find()) {
int from = 0;
while (matcher.find(from)) {
from = matcher.start() + 1;
this.searchpart = matcher.replaceAll("&");
matcher.reset(this.searchpart);
}
*/
this.path = this.path.substring(0, r);
}
}
@ -934,7 +950,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
String urlPath = this.getFile(excludeAnchor, removeSessionID);
String h = getHost();
final StringBuilder u = new StringBuilder(20 + urlPath.length() + ((h == null) ? 0 : h.length()));
final StringBuilder u = new StringBuilder(20 + (urlPath == null ? 0 : urlPath.length()) + ((h == null) ? 0 : h.length()));
u.append(this.protocol);
u.append("://");
if (h != null) {
@ -2179,10 +2195,11 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
*/
public static void main(final String[] args) {
final String[][] test = new String[][]{
new String[]{null, "https://www.example.com/shoe/?p=2&ps=75#t={%22san_NaviPaging%22:2}"}, // ugly strange pagination link
new String[]{null, "C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:///bin/yacy2"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
new String[]{null, "file:C:WINDOWS\\CMD.EXE"},
new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"},
new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"},

@ -312,6 +312,13 @@ public final class CharacterCoding {
}
s = text.substring(p, q + 1);
p = q + 1;
// check if another ampersand is in between
int pp;
while ((pp = s.indexOf('&', 1)) >= 0) {
// we skip the first ampersand
sb.append(s.substring(0, pp));
s = s.substring(pp);
}
if (s.equals(AMP_HTML)) {
sb.append(AMP_UNICODE);
continue;
@ -340,7 +347,8 @@ public final class CharacterCoding {
} catch (final NumberFormatException e) { }
continue;
}
// the entity is unknown, skip it
// the entity is unknown, copy it
sb.append(s);
}
return sb.toString();
}

Loading…
Cancel
Save