normalization of url using urlencoding/decoding

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8017 6c8d7289-2bf4-0310-a012-ef5d649a1542
author orbiter
parent e58438c01c
commit 37e35f2741

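The pattern repeated across the hunks below: the checked-exception dance around java.net.URLDecoder is replaced by a call to a new helper, UTF8.decodeURL, which throws no checked exception. A sketch of the shape of the change (not a verbatim hunk):

    // before: UTF-8 is hardcoded, yet the checked exception must still be handled
    try {
        s = URLDecoder.decode(s, "UTF-8");
    } catch (final UnsupportedEncodingException e) {
        Log.logException(e);
    }
    // after: one unchecked call
    s = UTF8.decodeURL(s);
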
@@ -29,16 +29,14 @@
 //if the shell's current path is HTROOT
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
-import java.net.URLDecoder;
 import java.util.Date;
+import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
 import net.yacy.search.index.Segments;
@@ -98,11 +96,7 @@ public class QuickCrawlLink_p {
 // get the URL
 String crawlingStart = post.get("url",null);
-try {
-    crawlingStart = URLDecoder.decode(crawlingStart, "UTF-8");
-} catch (final UnsupportedEncodingException e) {
-    Log.logException(e);
-}
+crawlingStart = UTF8.decodeURL(crawlingStart);
 // get the browser title
 final String title = post.get("title",null);

@@ -29,9 +29,7 @@
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
-import java.net.URLDecoder;
 import java.util.Collection;
 import java.util.Enumeration;
 import java.util.Iterator;
@@ -358,14 +356,12 @@ public class ViewFile {
 if (words.length() > 1 && words.charAt(0) == '[' && words.charAt(words.length() - 1) == ']') {
     words = words.substring(1, words.length() - 1);
 }
-try {
-    words = URLDecoder.decode(words, "UTF-8");
+words = UTF8.decodeURL(words);
 if (words.indexOf(' ') >= 0) return words.split(" ");
 if (words.indexOf(',') >= 0) return words.split(",");
 if (words.indexOf('+') >= 0) return words.split("\\+");
 w = new String[1];
 w[0] = words;
-} catch (final UnsupportedEncodingException e) {}
 return w;
 }
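
To see what the rewritten branch does end to end, a small trace with assumed inputs (not from the commit):

    String words = "[swiss%20cheese]";
    words = words.substring(1, words.length() - 1); // strip the [ ] wrapper -> "swiss%20cheese"
    words = UTF8.decodeURL(words);                  // -> "swiss cheese"
    // contains a space, so the method returns words.split(" ") == {"swiss", "cheese"}

Note that decodeURL already turns '+' into a space, so after this change the split on '+' can only fire for inputs carrying a literal '+' (i.e. an encoded %2B).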

@@ -35,11 +35,12 @@ import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Set;
 import java.util.regex.Pattern;
+import net.yacy.cora.document.UTF8;

 /*
  * A class for parsing robots.txt files.
  * So far it only parses the Deny part.
@@ -129,10 +130,10 @@ public final class RobotsTxtParser {
 // parse sitemap; if there are several sitemaps then take the first url
 // TODO: support for multiple sitemaps
-if (lineUpper.startsWith(ROBOTS_SITEMAP) && (sitemap == null || sitemap.length() == 0)) {
+if (lineUpper.startsWith(ROBOTS_SITEMAP) && (this.sitemap == null || this.sitemap.length() == 0)) {
     pos = line.indexOf(' ');
     if (pos != -1) {
-        sitemap = line.substring(pos).trim();
+        this.sitemap = line.substring(pos).trim();
     }
     continue lineparser;
 }
@@ -151,7 +152,7 @@ public final class RobotsTxtParser {
 inBlock = false;
 isRule4AllAgents = false;
 isRule4ThisAgents = false;
-crawlDelayMillis = 0; // each block has a separate delay
+this.crawlDelayMillis = 0; // each block has a separate delay
 }
 // cutting off comments at the line end
@@ -163,7 +164,7 @@ public final class RobotsTxtParser {
 if (pos != -1) {
     final String userAgent = line.substring(pos).trim();
     isRule4AllAgents |= userAgent.equals("*");
-    for (String agent: this.myNames) {
+    for (final String agent: this.myNames) {
         if (userAgent.toLowerCase().equals(agent)) {
             this.agentName = agent;
             isRule4ThisAgents = true;
@@ -183,7 +184,7 @@ public final class RobotsTxtParser {
 if (pos != -1) {
     try {
         // the crawl delay can be a float and is given in seconds
-        crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
+        this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
     } catch (final NumberFormatException e) {
         // invalid crawling delay
     }
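
(Worked example: a robots.txt line "Crawl-delay: 1.5" parses to (long) (1000.0 * 1.5) = 1500 milliseconds.)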
@@ -213,7 +214,7 @@ public final class RobotsTxtParser {
 // decoding all special chars
 try {
-    path = URLDecoder.decode(path, "UTF-8");
+    path = UTF8.decodeURL(path);
 } catch (final Exception e) {
     /*
      * url decoding failed. E.g. because of
@@ -239,8 +240,8 @@ public final class RobotsTxtParser {
     }
 } catch (final IOException e) {}
-allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
-denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
+this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
+this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
 }

 /**

@@ -69,13 +69,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.StringWriter;
-import java.io.UnsupportedEncodingException;
 import java.lang.ref.SoftReference;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.net.URLDecoder;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -266,13 +264,7 @@ public final class HTTPDFileHandler {
     return;
 }
 // url decoding of path
-try {
-    path = URLDecoder.decode(path, "UTF-8");
-} catch (final UnsupportedEncodingException e) {
-    // This should never occur
-    assert(false) : "UnsupportedEncodingException: " + e.getMessage();
-}
+path = UTF8.decodeURL(path);
 // check against hack attacks in path
 if (path.indexOf("..") >= 0) {
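
Note the ordering: the path is percent-decoded before the ".." check, so an encoded traversal attempt such as %2e%2e is normalized first and caught by the same guard.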
@@ -538,8 +530,8 @@ public final class HTTPDFileHandler {
 // implement proxy via url (not in a servlet, because we need binary access on the outputStream)
 if (path.equals("/proxy.html")) {
     final List<Pattern> urlProxyAccess = Domains.makePatterns(sb.getConfig("proxyURL.access", "127.0.0.1"));
-    UserDB.Entry user = sb.userDB.getUser(requestHeader);
-    boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
+    final UserDB.Entry user = sb.userDB.getUser(requestHeader);
+    final boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
     if (sb.getConfigBool("proxyURL", false) && user_may_see_proxyurl) {
         doURLProxy(args, conProp, requestHeader, out);
         return;
@@ -1308,7 +1300,7 @@ public final class HTTPDFileHandler {
  * not in a separate servlet, because we need access to the binary output stream
  * @throws IOException
  */
-private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, OutputStream out) throws IOException {
+private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream out) throws IOException {
     final String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
     URL proxyurl = null;
@@ -1430,7 +1422,7 @@ public final class HTTPDFileHandler {
 } else if (url.startsWith("//")) {
     // absolute URL with the same protocol, of the form href="//domain.com/path"
-    String complete_url = proxyurl.getProtocol() + ":" + url;
+    final String complete_url = proxyurl.getProtocol() + ":" + url;
     if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
         if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(complete_url)) != null) {
             continue;
@@ -1455,7 +1447,7 @@ public final class HTTPDFileHandler {
     newurl = newurl.replaceAll("\\$","\\\\\\$");
     m.appendReplacement(result, newurl);
 }
-catch (MalformedURLException e) {}
+catch (final MalformedURLException e) {}
 }
 }
@@ -1466,7 +1458,7 @@ public final class HTTPDFileHandler {
 if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
     HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);
-    ChunkedOutputStream cos = new ChunkedOutputStream(out);
+    final ChunkedOutputStream cos = new ChunkedOutputStream(out);
     cos.write(sbb);
     cos.finish();

@@ -127,6 +127,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolURI> {
 // identify protocol
 assert (url != null);
 url = url.trim();
+url = UTF8.decodeURL(url); // normalization here
 //url = patternSpace.matcher(url).replaceAll(" ");
 if (url.startsWith("\\\\")) {
     url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/");

@@ -154,4 +154,51 @@ public class UTF8 {
 return s.getBytes(charset);
 }

+/**
+ * Decodes an <code>application/x-www-form-urlencoded</code> string using UTF-8.
+ */
+public static String decodeURL(final String s) {
+    boolean needToChange = false;
+    final int numChars = s.length();
+    final StringBuffer sb = new StringBuffer(numChars > 500 ? numChars / 2 : numChars);
+    int i = 0;
+    char c;
+    byte[] bytes = null;
+    while (i < numChars) {
+        c = s.charAt(i);
+        switch (c) {
+        case '+':
+            sb.append(' ');
+            i++;
+            needToChange = true;
+            break;
+        case '%':
+            try {
+                if (bytes == null) bytes = new byte[(numChars-i)/3];
+                int pos = 0;
+                while (((i+2) < numChars) && (c=='%')) {
+                    final int v = Integer.parseInt(s.substring(i+1,i+3),16);
+                    if (v < 0) throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value");
+                    bytes[pos++] = (byte) v;
+                    i+= 3;
+                    if (i < numChars) c = s.charAt(i);
+                }
+                if ((i < numChars) && (c=='%')) throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern");
+                sb.append(new String(bytes, 0, pos, charset));
+            } catch (final NumberFormatException e) {
+                throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage());
+            }
+            needToChange = true;
+            break;
+        default:
+            sb.append(c);
+            i++;
+            break;
+        }
+    }
+    return (needToChange ? sb.toString() : s);
+}
 }
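
A hedged usage sketch of the new helper (example strings are mine, not from the commit). Unlike java.net.URLDecoder.decode(s, "UTF-8"), decodeURL declares no checked exception, returns the original String when nothing needed decoding, and reports malformed escapes with an unchecked IllegalArgumentException:

    UTF8.decodeURL("hello%20world"); // -> "hello world"
    UTF8.decodeURL("a+b");           // -> "a b" ('+' decodes to a space)
    UTF8.decodeURL("plain");         // -> same instance, needToChange stays false
    try {
        UTF8.decodeURL("bad%2");     // incomplete trailing escape
    } catch (final IllegalArgumentException e) {
        // "URLDecoder: Incomplete trailing escape (%) pattern"
    }

Decode failures thus become unchecked programming-level errors, which is what lets the call sites above drop their try/catch blocks.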
