fix for bad URL decoding

11 years ago · f3a6b6e21e
parent 1092e798a5
commit f3a6b6e21e
2 changed files with 48 additions and 23 deletions
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -59,7 +59,6 @@ import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.retrieval.Response;
-import net.yacy.document.parser.html.CharacterCoding;

 /**
 * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file
@ -225,13 +224,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
                if (h.startsWith("///")) { //absolute local file path
                    // no host given
                    this.path = h.substring(2); // "/path"  or "/c:/path"
-                } else { // "//host/path" or "//host/c:/path"
+                } else if (h.startsWith("//")) { // "//host/path" or "//host/c:/path"
                    int q = url.indexOf('/', p + 3);
                    if (q < 0) {
                        this.path = "/";
                    } else {
                        this.path = url.substring(q);
                    }
+                } else if (h.startsWith("/")) { // "/host/path" or "/host/c:/path"
+                    this.path = h;
                }
                this.userInfo = null;
                this.port = -1;
@ -418,7 +419,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
    private void escape() {
        if (this.path != null && this.path.indexOf('%') == -1) escapePath();
        if (this.searchpart != null && this.searchpart.indexOf('%') == -1) escapeSearchpart();
-        if (this.anchor != null && this.anchor.indexOf('%') == -1) escapeAnchor();
+        if (this.anchor != null) this.anchor = escape(this.anchor).toString();
    }

    private void escapePath() {
@ -431,10 +432,6 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        this.path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
    }

-    private void escapeAnchor() {
-        this.anchor = escape(this.anchor).toString();
-    }
-
    private void escapeSearchpart() {
        final String[] questp = CommonPattern.AMP.split(this.searchpart, -1);
        final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10);
@ -517,24 +514,39 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        final StringBuilder sbuf = new StringBuilder(len + 10);
        for (int i = 0; i < len; i++) {
            final int ch = s.charAt(i);
-            if ('A' <= ch && ch <= 'Z') {           // 'A'..'Z'
-                sbuf.append((char)ch);
-            } else if ('a' <= ch && ch <= 'z') {    // 'a'..'z'
+            if (ch == ' ') {                 // space
+                sbuf.append("%20");
+            } else if (ch == '%') {
+                if (i < len - 2 && s.charAt(i + 1) >= '0' && s.charAt(i + 1) <= '9' && s.charAt(i + 2) >= '0' && s.charAt(i + 2) <= '9') {
+                    sbuf.append((char)ch);   // lets consider this is used for encoding, leave it that way
+                } else {
+                    sbuf.append("%23");      // RFC 1738 2.2 unsafe char shall be encoded
+                }
+            } else if (ch == '&') { 
+                if (i < len - 6 && "amp;".equals(s.substring(i + 1, i + 5).toLowerCase())) {
+                    sbuf.append((char)ch);   // leave it that way, it is used the right way
+                } else {
+                    sbuf.append("&amp;");    // this must be urlencoded
+                }
                sbuf.append((char)ch);
-            } else if ('0' <= ch && ch <= '9') {    // '0'..'9'
+            } else if (ch == '#') {          // RFC 1738 2.2 unsafe char is _not_ encoded because it may already be used for encoding 
                sbuf.append((char)ch);
-            } else if (ch == ' ') {                 // space
-                sbuf.append("%20");
-            } else if (ch == '&' || ch == ':'       // unreserved
+            } else if (ch == '!' || ch == ':'   // unreserved
                    || ch == '-' || ch == '_'
-                    || ch == '.' || ch == '!'
-                    || ch == '~' || ch == '*'
-                    || ch == '\'' || ch == '('
-                    || ch == ')' || ch == ';' 
-                    || ch == ',' || ch == '=') { // RFC 1738 2.2 special char (may be used unencoded)
+                    || ch == '.' || ch == '~' 
+                    || ch == '*' || ch == '\''
+                    || ch == '(' || ch == ')'
+                    || ch == '{' || ch == '}'
+                    || ch == ';' || ch == ',' || ch == '=') {    // RFC 1738 2.2 unsafe char (may be used unencoded)
+                sbuf.append((char)ch);
+            } else if ('0' <= ch && ch <= '9') {    // '0'..'9'
                sbuf.append((char)ch);
            } else if (ch == '/') {                 // reserved, but may appear in post part where it should not be replaced
                sbuf.append((char)ch);
+            } else if ('A' <= ch && ch <= 'Z') {    // 'A'..'Z'
+                sbuf.append((char)ch);
+            } else if ('a' <= ch && ch <= 'z') {    // 'a'..'z'
+                sbuf.append((char)ch);
            } else if (ch <= 0x007f) {              // other ASCII
                sbuf.append(hex[ch]);
            } else if (ch <= 0x07FF) {              // non-ASCII <= 0x7FF
@ -647,11 +659,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        } else {
            this.searchpart = this.path.substring(r + 1);
            // strip &amp;
+            /*
            Matcher matcher = CharacterCoding.ampPattern.matcher(this.searchpart);
-            while (matcher.find()) {
+            int from = 0;
+            while (matcher.find(from)) {
+                from = matcher.start() + 1;
                this.searchpart = matcher.replaceAll("&");
                matcher.reset(this.searchpart);
            }
+            */
            this.path = this.path.substring(0, r);
        }
    }
@ -934,7 +950,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        }
        String urlPath = this.getFile(excludeAnchor, removeSessionID);
        String h = getHost();
-        final StringBuilder u = new StringBuilder(20 + urlPath.length() + ((h == null) ? 0 : h.length()));
+        final StringBuilder u = new StringBuilder(20 + (urlPath == null ? 0 : urlPath.length()) + ((h == null) ? 0 : h.length()));
        u.append(this.protocol);
        u.append("://");
        if (h != null) {
@ -2179,10 +2195,11 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
 */
    public static void main(final String[] args) {
        final String[][] test = new String[][]{
+          new String[]{null, "https://www.example.com/shoe/?p=2&ps=75#t={%22san_NaviPaging%22:2}"}, // ugly strange pagination link
          new String[]{null, "C:WINDOWS\\CMD0.EXE"},
          new String[]{null, "file://C:WINDOWS\\CMD0.EXE"},
-          new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
          new String[]{null, "file:///bin/yacy2"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
+          new String[]{null, "file:/bin/yacy1"}, // file://<host>/<path> may have many '/' if the host is omitted and the path starts with '/'
          new String[]{null, "file:C:WINDOWS\\CMD.EXE"},
          new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"},
          new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"},
--- a/source/net/yacy/document/parser/html/CharacterCoding.java
+++ b/source/net/yacy/document/parser/html/CharacterCoding.java
@ -312,6 +312,13 @@ public final class CharacterCoding {
            }
            s = text.substring(p, q + 1);
            p = q + 1;
+            // check if another ampersand is in between
+            int pp;
+            while ((pp = s.indexOf('&', 1)) >= 0) {
+                // we skip the first ampersand
+                sb.append(s.substring(0, pp));
+                s = s.substring(pp);
+            }
            if (s.equals(AMP_HTML)) {
                sb.append(AMP_UNICODE);
                continue;
@ -340,7 +347,8 @@ public final class CharacterCoding {
                } catch (final NumberFormatException e) { }
                continue;
            }
-            // the entity is unknown, skip it
+            // the entity is unknown, copy it
+            sb.append(s);
        }
        return sb.toString();
    }