normalization of URLs using URL encoding/decoding

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8017 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent e58438c01c
commit 37e35f2741
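In short: call sites that previously wrapped java.net.URLDecoder.decode in a try/catch now call the new UTF8.decodeURL helper introduced at the bottom of this commit, and MultiProtocolURI applies it once to every incoming URL. A minimal before/after sketch (the DecodeSketch class and its methods are illustrative, not part of the commit):

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import net.yacy.cora.document.UTF8;

final class DecodeSketch {
    // Before: the checked exception forced a try/catch at every call site.
    static String decodeOld(final String s) {
        try {
            return URLDecoder.decode(s, "UTF-8");
        } catch (final UnsupportedEncodingException e) {
            return s; // unreachable for "UTF-8", but must be handled
        }
    }
    // After: the new helper throws no checked exception.
    static String decodeNew(final String s) {
        return UTF8.decodeURL(s);
    }
}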

@@ -29,16 +29,14 @@
//if the shell's current path is HTROOT
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
@@ -98,11 +96,7 @@ public class QuickCrawlLink_p {
// get the URL
String crawlingStart = post.get("url",null);
try {
crawlingStart = URLDecoder.decode(crawlingStart, "UTF-8");
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
}
crawlingStart = UTF8.decodeURL(crawlingStart);
// get the browser title
final String title = post.get("title",null);

@@ -29,9 +29,7 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.Enumeration;
import java.util.Iterator;
@@ -358,14 +356,12 @@ public class ViewFile {
if (words.length() > 1 && words.charAt(0) == '[' && words.charAt(words.length() - 1) == ']') {
words = words.substring(1, words.length() - 1);
}
try {
words = URLDecoder.decode(words, "UTF-8");
if (words.indexOf(' ') >= 0) return words.split(" ");
if (words.indexOf(',') >= 0) return words.split(",");
if (words.indexOf('+') >= 0) return words.split("\\+");
w = new String[1];
w[0] = words;
} catch (final UnsupportedEncodingException e) {}
words = UTF8.decodeURL(words);
if (words.indexOf(' ') >= 0) return words.split(" ");
if (words.indexOf(',') >= 0) return words.split(",");
if (words.indexOf('+') >= 0) return words.split("\\+");
w = new String[1];
w[0] = words;
return w;
}
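The method above decodes the keyword list once and then splits on the first separator it finds. Note that UTF8.decodeURL already turns an encoded '+' into a space, so the '+' branch only fires for a literal plus that survived decoding. A standalone sketch of the splitting order (sample input is hypothetical):

import net.yacy.cora.document.UTF8;

public class SplitSketch {
    public static void main(final String[] args) {
        final String words = UTF8.decodeURL("foo+bar,baz"); // '+' -> ' ': "foo bar,baz"
        String[] w;
        if (words.indexOf(' ') >= 0) w = words.split(" ");        // ["foo", "bar,baz"]
        else if (words.indexOf(',') >= 0) w = words.split(",");
        else if (words.indexOf('+') >= 0) w = words.split("\\+");
        else w = new String[] { words };
        System.out.println(java.util.Arrays.toString(w));
    }
}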

@@ -1,24 +1,24 @@
/*
robotsParser.java
-------------------------------------
part of YaCy
(C) 2005, 2006 by Alexander Schier
Martin Thelian
last change: $LastChangedDate$
by $LastChangedBy: orbiter $
Revision: $LastChangedRevision$
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@@ -35,48 +35,49 @@ import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
/*
* A class for parsing robots.txt files.
* So far, it only parses the Deny part.
*
* Robots RFC
* http://www.robotstxt.org/wc/norobots-rfc.html
*
* TODO:
* - If a request attempt results in a temporary failure, a robot
* should defer visits to the site until such time as the resource
* can be retrieved.
*
* - Extended Standard for Robot Exclusion
* See: http://www.conman.org/people/spc/robots2.html
*
* - Robot Exclusion Standard Revisited
* See: http://www.kollar.com/robots.html
*/
public final class RobotsTxtParser {
private static final Pattern patternTab = Pattern.compile("\t");
private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
private static final String ROBOTS_COMMENT = "#";
private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
private final ArrayList<String> allowList;
private final ArrayList<String> denyList;
private String sitemap;
private long crawlDelayMillis;
private final Set<String> myNames; // the set of this peer's own agent names
private String agentName; // the name of the agent that was used to return the result
protected RobotsTxtParser(final byte[] robotsTxt, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
@@ -90,26 +91,26 @@ public final class RobotsTxtParser {
parse(reader);
}
}
private void parse(final BufferedReader reader) {
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();
final ArrayList<String> allow4AllAgents = new ArrayList<String>();
final ArrayList<String> allow4ThisAgents = new ArrayList<String>();
int pos;
String line = null, lineUpper = null;
boolean isRule4AllAgents = false,
isRule4ThisAgents = false,
rule4ThisAgentsFound = false,
inBlock = false;
try {
lineparser: while ((line = reader.readLine()) != null) {
// replacing all tabs with spaces
line = patternTab.matcher(line).replaceAll(" ").trim();
lineUpper = line.toUpperCase();
// parse empty line
if (line.length() == 0) {
// we have reached the end of the rule block
@@ -120,26 +121,26 @@ public final class RobotsTxtParser {
}
continue lineparser;
}
// parse comment
if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
continue lineparser;
}
// parse sitemap; if there are several sitemaps then take the first url
// TODO: support for multiple sitemaps
if (lineUpper.startsWith(ROBOTS_SITEMAP) && (sitemap == null || sitemap.length() == 0)) {
if (lineUpper.startsWith(ROBOTS_SITEMAP) && (this.sitemap == null || this.sitemap.length() == 0)) {
pos = line.indexOf(' ');
if (pos != -1) {
sitemap = line.substring(pos).trim();
this.sitemap = line.substring(pos).trim();
}
continue lineparser;
}
// parse user agent
if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
// we have detected the start of a new block
if (rule4ThisAgentsFound) {
@@ -147,23 +148,23 @@ public final class RobotsTxtParser {
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
}
inBlock = false;
isRule4AllAgents = false;
isRule4ThisAgents = false;
crawlDelayMillis = 0; // each block has a separate delay
this.crawlDelayMillis = 0; // each block has a separate delay
}
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// getting out the robots name
pos = line.indexOf(' ');
if (pos != -1) {
final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals("*");
for (String agent: this.myNames) {
for (final String agent: this.myNames) {
if (userAgent.toLowerCase().equals(agent)) {
this.agentName = agent;
isRule4ThisAgents = true;
@@ -174,7 +175,7 @@ public final class RobotsTxtParser {
}
continue lineparser;
}
// parse crawl delay
if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
inBlock = true;
@@ -183,7 +184,7 @@ public final class RobotsTxtParser {
if (pos != -1) {
try {
// the crawl delay can be a float number and means number of seconds
crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
} catch (final NumberFormatException e) {
// invalid crawling delay
}
@@ -191,39 +192,39 @@ public final class RobotsTxtParser {
}
continue lineparser;
}
// parse disallow
if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRule4ThisAgents || isRule4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// cut off trailing *
if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// parse the path
pos = line.indexOf(' ');
if (pos >= 0) {
// getting the path
String path = line.substring(pos).trim();
// decoding all special chars
try {
path = URLDecoder.decode(path, "UTF-8");
path = UTF8.decodeURL(path);
} catch (final Exception e) {
/*
* url decoding failed. E.g. because of
* "Incomplete trailing escape (%) pattern"
*/
}
// escaping all occurrences of ';' because this char is used as a special char in the Robots DB
path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
// adding it to the pathlist
if (isDisallowRule) {
if (isRule4AllAgents) deny4AllAgents.add(path);
@@ -238,11 +239,11 @@ public final class RobotsTxtParser {
}
}
} catch (final IOException e) {}
allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
}
/**
* a crawl delay can be assigned to every agent or for all agents
* a special case is where the user agent of this YaCy peer is given explicitly
@@ -253,7 +254,7 @@ public final class RobotsTxtParser {
protected long crawlDelayMillis() {
return this.crawlDelayMillis;
}
/**
* the user agent that was applied to get the crawl properties is recorded
* because it is possible that this robots.txt parser applies to several user agents
@@ -264,15 +265,15 @@ public final class RobotsTxtParser {
protected String agentName() {
return this.agentName;
}
protected String sitemap() {
return this.sitemap;
}
protected ArrayList<String> allowList() {
return this.allowList;
}
protected ArrayList<String> denyList() {
return this.denyList;
}
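The Crawl-delay value in the parser above is read as a float number of seconds and stored as milliseconds. A small standalone check of that conversion, mirroring the parsing line in the diff (the sample robots.txt line is hypothetical):

public class CrawlDelaySketch {
    public static void main(final String[] args) {
        final String line = "Crawl-delay: 2.5";
        long crawlDelayMillis = 0;
        final int pos = line.indexOf(' ');
        if (pos != -1) {
            try {
                // seconds (possibly fractional) -> milliseconds
                crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
            } catch (final NumberFormatException e) {
                // invalid crawl delay: keep 0, i.e. no delay
            }
        }
        System.out.println(crawlDelayMillis); // prints 2500
    }
}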

@@ -69,13 +69,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.lang.ref.SoftReference;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@@ -266,13 +264,7 @@ public final class HTTPDFileHandler {
return;
}
// url decoding of path
try {
path = URLDecoder.decode(path, "UTF-8");
} catch (final UnsupportedEncodingException e) {
// This should never occur
assert(false) : "UnsupportedEncodingException: " + e.getMessage();
}
path = UTF8.decodeURL(path);
// check against hack attacks in path
if (path.indexOf("..") >= 0) {
@@ -538,8 +530,8 @@ public final class HTTPDFileHandler {
// implement proxy via url (not in a servlet, because we need binary access to the output stream)
if (path.equals("/proxy.html")) {
final List<Pattern> urlProxyAccess = Domains.makePatterns(sb.getConfig("proxyURL.access", "127.0.0.1"));
UserDB.Entry user = sb.userDB.getUser(requestHeader);
boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
final UserDB.Entry user = sb.userDB.getUser(requestHeader);
final boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
if (sb.getConfigBool("proxyURL", false) && user_may_see_proxyurl) {
doURLProxy(args, conProp, requestHeader, out);
return;
@@ -1308,7 +1300,7 @@ public final class HTTPDFileHandler {
* not in a separate servlet, because we need access to the binary output stream
* @throws IOException
*/
private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, OutputStream out) throws IOException {
private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream out) throws IOException {
final String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
URL proxyurl = null;
@@ -1325,7 +1317,7 @@ public final class HTTPDFileHandler {
}
String host = proxyurl.getHost();
if (proxyurl.getPort() != -1) {
host += ":" + proxyurl.getPort();
}
// set properties for proxy connection
@@ -1430,7 +1422,7 @@ public final class HTTPDFileHandler {
} else if (url.startsWith("//")) {
// absolute url with the same protocol, of the form href="//domain.com/path"
String complete_url = proxyurl.getProtocol() + ":" + url;
final String complete_url = proxyurl.getProtocol() + ":" + url;
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(complete_url)) != null) {
continue;
@@ -1455,7 +1447,7 @@ public final class HTTPDFileHandler {
newurl = newurl.replaceAll("\\$","\\\\\\$");
m.appendReplacement(result, newurl);
}
catch (MalformedURLException e) {}
catch (final MalformedURLException e) {}
}
}
@@ -1466,7 +1458,7 @@ public final class HTTPDFileHandler {
if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);
ChunkedOutputStream cos = new ChunkedOutputStream(out);
final ChunkedOutputStream cos = new ChunkedOutputStream(out);
cos.write(sbb);
cos.finish();
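One subtlety in the URL-rewriting loop above: Matcher.appendReplacement treats '$' in the replacement string as a group reference, so literal dollar signs in the proxied URL must be escaped first; that is what the replaceAll("\\$", "\\\\\\$") line does. A standalone illustration (the pattern, input, and URL are hypothetical sample values):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class EscapeSketch {
    public static void main(final String[] args) {
        String newurl = "/proxy.html?url=http://host/page?p=$v"; // contains a literal '$'
        newurl = newurl.replaceAll("\\$", "\\\\\\$");            // '$' -> '\$'
        final StringBuffer result = new StringBuffer();
        final Matcher m = Pattern.compile("href=\"([^\"]*)\"").matcher("<a href=\"old\">");
        while (m.find()) {
            // without the escape above, the bare '$' would make
            // appendReplacement throw "Illegal group reference"
            m.appendReplacement(result, "href=\"" + newurl + "\"");
        }
        m.appendTail(result);
        System.out.println(result);
    }
}

Matcher.quoteReplacement(newurl), available since Java 5, performs the same escaping for both '$' and '\' in one call.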

@@ -127,6 +127,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// identify protocol
assert (url != null);
url = url.trim();
url = UTF8.decodeURL(url); // normalization here
//url = patternSpace.matcher(url).replaceAll(" ");
if (url.startsWith("\\\\")) {
url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/");

@@ -154,4 +154,51 @@ public class UTF8 {
return s.getBytes(charset);
}
/**
* Decodes an <code>application/x-www-form-urlencoded</code> string using the
* UTF-8 encoding scheme.
*/
public static String decodeURL(final String s) {
boolean needToChange = false;
final int numChars = s.length();
final StringBuffer sb = new StringBuffer(numChars > 500 ? numChars / 2 : numChars);
int i = 0;
char c;
byte[] bytes = null;
while (i < numChars) {
c = s.charAt(i);
switch (c) {
case '+':
sb.append(' ');
i++;
needToChange = true;
break;
case '%':
try {
if (bytes == null) bytes = new byte[(numChars-i)/3];
int pos = 0;
while (((i+2) < numChars) && (c=='%')) {
final int v = Integer.parseInt(s.substring(i+1,i+3),16);
if (v < 0) throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value");
bytes[pos++] = (byte) v;
i+= 3;
if (i < numChars) c = s.charAt(i);
}
if ((i < numChars) && (c=='%')) throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern");
sb.append(new String(bytes, 0, pos, charset));
} catch (final NumberFormatException e) {
throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage());
}
needToChange = true;
break;
default:
sb.append(c);
i++;
break;
}
}
return (needToChange? sb.toString() : s);
}
}
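The helper mirrors java.net.URLDecoder semantics for UTF-8 but throws no checked exception, and it returns the input object unchanged when nothing needed decoding. A few sample calls with expected results in comments (input values are illustrative):

import net.yacy.cora.document.UTF8;

public class DecodeURLSamples {
    public static void main(final String[] args) {
        System.out.println(UTF8.decodeURL("a%20b"));     // "a b"
        System.out.println(UTF8.decodeURL("a+b"));       // "a b"   ('+' decodes to a space)
        System.out.println(UTF8.decodeURL("%E2%82%AC")); // "€"     (multi-byte UTF-8 sequence)
        System.out.println(UTF8.decodeURL("plain"));     // "plain" (same String instance, no copy)
        UTF8.decodeURL("bad%2"); // throws IllegalArgumentException: incomplete trailing escape
    }
}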
