From f3a6b6e21e1e4acb57dec36dc5c9d44db2026696 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 10 Jul 2014 01:59:29 +0200 Subject: [PATCH] fix for bad URL decoding --- .../cora/document/id/MultiProtocolURL.java | 61 ++++++++++++------- .../document/parser/html/CharacterCoding.java | 10 ++- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 7752f4f30..ed3f45596 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -59,7 +59,6 @@ import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Response; -import net.yacy.document.parser.html.CharacterCoding; /** * MultiProtocolURI provides a URL object for multiple protocols like http, https, ftp, smb and file @@ -225,13 +224,15 @@ public class MultiProtocolURL implements Serializable, Comparable 0) ? 1 : 0); } - private void escapeAnchor() { - this.anchor = escape(this.anchor).toString(); - } - private void escapeSearchpart() { final String[] questp = CommonPattern.AMP.split(this.searchpart, -1); final StringBuilder qtmp = new StringBuilder(this.searchpart.length() + 10); @@ -517,24 +514,39 @@ public class MultiProtocolURL implements Serializable, Comparable= '0' && s.charAt(i + 1) <= '9' && s.charAt(i + 2) >= '0' && s.charAt(i + 2) <= '9') { + sbuf.append((char)ch); // lets consider this is used for encoding, leave it that way + } else { + sbuf.append("%23"); // RFC 1738 2.2 unsafe char shall be encoded + } + } else if (ch == '&') { + if (i < len - 6 && "amp;".equals(s.substring(i + 1, i + 5).toLowerCase())) { + sbuf.append((char)ch); // leave it that way, it is used the right way + } else { + sbuf.append("&"); // this must be urlencoded + } sbuf.append((char)ch); - } else if ('0' <= ch && ch <= '9') { // '0'..'9' + } else if (ch == '#') { // RFC 1738 2.2 unsafe char is _not_ encoded because it may already be used for encoding sbuf.append((char)ch); - } else if (ch == ' ') { // space - sbuf.append("%20"); - } else if (ch == '&' || ch == ':' // unreserved + } else if (ch == '!' || ch == ':' // unreserved || ch == '-' || ch == '_' - || ch == '.' || ch == '!' - || ch == '~' || ch == '*' - || ch == '\'' || ch == '(' - || ch == ')' || ch == ';' - || ch == ',' || ch == '=') { // RFC 1738 2.2 special char (may be used unencoded) + || ch == '.' || ch == '~' + || ch == '*' || ch == '\'' + || ch == '(' || ch == ')' + || ch == '{' || ch == '}' + || ch == ';' || ch == ',' || ch == '=') { // RFC 1738 2.2 unsafe char (may be used unencoded) + sbuf.append((char)ch); + } else if ('0' <= ch && ch <= '9') { // '0'..'9' sbuf.append((char)ch); } else if (ch == '/') { // reserved, but may appear in post part where it should not be replaced sbuf.append((char)ch); + } else if ('A' <= ch && ch <= 'Z') { // 'A'..'Z' + sbuf.append((char)ch); + } else if ('a' <= ch && ch <= 'z') { // 'a'..'z' + sbuf.append((char)ch); } else if (ch <= 0x007f) { // other ASCII sbuf.append(hex[ch]); } else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF @@ -647,11 +659,15 @@ public class MultiProtocolURL implements Serializable, Comparable/ may have many '/' if the host is omitted and the path starts with '/' new String[]{null, "file:///bin/yacy2"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' + new String[]{null, "file:/bin/yacy1"}, // file:/// may have many '/' if the host is omitted and the path starts with '/' new String[]{null, "file:C:WINDOWS\\CMD.EXE"}, new String[]{null, "file:///C:WINDOWS\\CMD1.EXE"}, new String[]{null, "file:///C|WINDOWS\\CMD2.EXE"}, diff --git a/source/net/yacy/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java index f93300cbd..7541e22e1 100644 --- a/source/net/yacy/document/parser/html/CharacterCoding.java +++ b/source/net/yacy/document/parser/html/CharacterCoding.java @@ -312,6 +312,13 @@ public final class CharacterCoding { } s = text.substring(p, q + 1); p = q + 1; + // check if another ampersand is in between + int pp; + while ((pp = s.indexOf('&', 1)) >= 0) { + // we skip the first ampersand + sb.append(s.substring(0, pp)); + s = s.substring(pp); + } if (s.equals(AMP_HTML)) { sb.append(AMP_UNICODE); continue; @@ -340,7 +347,8 @@ public final class CharacterCoding { } catch (final NumberFormatException e) { } continue; } - // the entity is unknown, skip it + // the entity is unknown, copy it + sb.append(s); } return sb.toString(); }