normalization of url using urlencoding/decoding

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8017 6c8d7289-2bf4-0310-a012-ef5d649a1542
author orbiter
parent e58438c01c
commit 37e35f2741

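The pattern repeated across the hunks below: the checked-exception dance around java.net.URLDecoder is replaced by a call to a new helper, UTF8.decodeURL, which throws no checked exception. A sketch of the shape of the change (not a verbatim hunk):

    // before: UTF-8 is hardcoded, yet the checked exception must still be handled
    try {
        s = URLDecoder.decode(s, "UTF-8");
    } catch (final UnsupportedEncodingException e) {
        Log.logException(e);
    }
    // after: one unchecked call
    s = UTF8.decodeURL(s);
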
@@ -29,16 +29,14 @@
 //if the shell's current path is HTROOT
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
-import java.net.URLDecoder;
 import java.util.Date;
+import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
 import net.yacy.search.index.Segments;
@@ -98,11 +96,7 @@ public class QuickCrawlLink_p {
 // get the URL
 String crawlingStart = post.get("url",null);
-try {
-    crawlingStart = URLDecoder.decode(crawlingStart, "UTF-8");
-} catch (final UnsupportedEncodingException e) {
-    Log.logException(e);
-}
+crawlingStart = UTF8.decodeURL(crawlingStart);
 // get the browser title
 final String title = post.get("title",null);

@@ -29,9 +29,7 @@
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
-import java.net.URLDecoder;
 import java.util.Collection;
 import java.util.Enumeration;
 import java.util.Iterator;
@@ -358,14 +356,12 @@ public class ViewFile {
 if (words.length() > 1 && words.charAt(0) == '[' && words.charAt(words.length() - 1) == ']') {
     words = words.substring(1, words.length() - 1);
 }
-try {
-    words = URLDecoder.decode(words, "UTF-8");
+words = UTF8.decodeURL(words);
 if (words.indexOf(' ') >= 0) return words.split(" ");
 if (words.indexOf(',') >= 0) return words.split(",");
 if (words.indexOf('+') >= 0) return words.split("\\+");
 w = new String[1];
 w[0] = words;
-} catch (final UnsupportedEncodingException e) {}
 return w;
 }
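
To see what the rewritten branch does end to end, a small trace with assumed inputs (not from the commit):

    String words = "[swiss%20cheese]";
    words = words.substring(1, words.length() - 1); // strip the [ ] wrapper -> "swiss%20cheese"
    words = UTF8.decodeURL(words);                  // -> "swiss cheese"
    // contains a space, so the method returns words.split(" ") == {"swiss", "cheese"}

Note that decodeURL already turns '+' into a space, so after this change the split on '+' can only fire for inputs carrying a literal '+' (i.e. an encoded %2B).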

@@ -35,11 +35,12 @@ import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Set;
 import java.util.regex.Pattern;
+import net.yacy.cora.document.UTF8;

 /*
  * A class for parsing robots.txt files.
  * So far it only parses the Deny part.
@@ -129,10 +130,10 @@ public final class RobotsTxtParser {
 // parse sitemap; if there are several sitemaps then take the first url
 // TODO: support for multiple sitemaps
-if (lineUpper.startsWith(ROBOTS_SITEMAP) && (sitemap == null || sitemap.length() == 0)) {
+if (lineUpper.startsWith(ROBOTS_SITEMAP) && (this.sitemap == null || this.sitemap.length() == 0)) {
     pos = line.indexOf(' ');
     if (pos != -1) {
-        sitemap = line.substring(pos).trim();
+        this.sitemap = line.substring(pos).trim();
     }
     continue lineparser;
 }
@@ -151,7 +152,7 @@ public final class RobotsTxtParser {
 inBlock = false;
 isRule4AllAgents = false;
 isRule4ThisAgents = false;
-crawlDelayMillis = 0; // each block has a separate delay
+this.crawlDelayMillis = 0; // each block has a separate delay
 }
 // cutting off comments at the line end
@@ -163,7 +164,7 @@ public final class RobotsTxtParser {
 if (pos != -1) {
     final String userAgent = line.substring(pos).trim();
     isRule4AllAgents |= userAgent.equals("*");
-    for (String agent: this.myNames) {
+    for (final String agent: this.myNames) {
         if (userAgent.toLowerCase().equals(agent)) {
             this.agentName = agent;
             isRule4ThisAgents = true;
@@ -183,7 +184,7 @@ public final class RobotsTxtParser {
 if (pos != -1) {
     try {
         // the crawl delay can be a float and is given in seconds
-        crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
+        this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
     } catch (final NumberFormatException e) {
         // invalid crawling delay
     }
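
(Worked example: a robots.txt line "Crawl-delay: 1.5" parses to (long) (1000.0 * 1.5) = 1500 milliseconds.)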
@@ -213,7 +214,7 @@ public final class RobotsTxtParser {
 // decoding all special chars
 try {
-    path = URLDecoder.decode(path, "UTF-8");
+    path = UTF8.decodeURL(path);
 } catch (final Exception e) {
     /*
      * url decoding failed. E.g. because of
@@ -239,8 +240,8 @@ public final class RobotsTxtParser {
     }
 } catch (final IOException e) {}
-allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
-denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
+this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
+this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
 }

 /**

@@ -69,13 +69,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.StringWriter;
-import java.io.UnsupportedEncodingException;
 import java.lang.ref.SoftReference;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.net.URLDecoder;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -266,13 +264,7 @@ public final class HTTPDFileHandler {
     return;
 }
 // url decoding of path
-try {
-    path = URLDecoder.decode(path, "UTF-8");
-} catch (final UnsupportedEncodingException e) {
-    // This should never occur
-    assert(false) : "UnsupportedEncodingException: " + e.getMessage();
-}
+path = UTF8.decodeURL(path);
 // check against hack attacks in path
 if (path.indexOf("..") >= 0) {
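
Note the ordering: the path is percent-decoded before the ".." check, so an encoded traversal attempt such as %2e%2e is normalized first and caught by the same guard.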
@@ -538,8 +530,8 @@ public final class HTTPDFileHandler {
 // implement proxy via url (not in a servlet, because we need binary access on the outputStream)
 if (path.equals("/proxy.html")) {
     final List<Pattern> urlProxyAccess = Domains.makePatterns(sb.getConfig("proxyURL.access", "127.0.0.1"));
-    UserDB.Entry user = sb.userDB.getUser(requestHeader);
-    boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
+    final UserDB.Entry user = sb.userDB.getUser(requestHeader);
+    final boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
     if (sb.getConfigBool("proxyURL", false) && user_may_see_proxyurl) {
         doURLProxy(args, conProp, requestHeader, out);
         return;
@@ -1308,7 +1300,7 @@ public final class HTTPDFileHandler {
  * not in a separate servlet, because we need access to the binary output stream
  * @throws IOException
  */
-private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, OutputStream out) throws IOException {
+private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream out) throws IOException {
     final String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
     URL proxyurl = null;
@@ -1430,7 +1422,7 @@ public final class HTTPDFileHandler {
 } else if (url.startsWith("//")) {
     // absolute URL with the same protocol, of the form href="//domain.com/path"
-    String complete_url = proxyurl.getProtocol() + ":" + url;
+    final String complete_url = proxyurl.getProtocol() + ":" + url;
     if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
         if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(complete_url)) != null) {
             continue;
@@ -1455,7 +1447,7 @@ public final class HTTPDFileHandler {
     newurl = newurl.replaceAll("\\$","\\\\\\$");
     m.appendReplacement(result, newurl);
 }
-catch (MalformedURLException e) {}
+catch (final MalformedURLException e) {}
 }
 }
@@ -1466,7 +1458,7 @@ public final class HTTPDFileHandler {
 if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
     HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);
-    ChunkedOutputStream cos = new ChunkedOutputStream(out);
+    final ChunkedOutputStream cos = new ChunkedOutputStream(out);
     cos.write(sbb);
     cos.finish();

@@ -127,6 +127,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolURI> {
 // identify protocol
 assert (url != null);
 url = url.trim();
+url = UTF8.decodeURL(url); // normalization here
 //url = patternSpace.matcher(url).replaceAll(" ");
 if (url.startsWith("\\\\")) {
     url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/");

@@ -154,4 +154,51 @@ public class UTF8 {
 return s.getBytes(charset);
 }

+/**
+ * Decodes an <code>application/x-www-form-urlencoded</code> string using UTF-8.
+ */
+public static String decodeURL(final String s) {
+    boolean needToChange = false;
+    final int numChars = s.length();
+    final StringBuffer sb = new StringBuffer(numChars > 500 ? numChars / 2 : numChars);
+    int i = 0;
+    char c;
+    byte[] bytes = null;
+    while (i < numChars) {
+        c = s.charAt(i);
+        switch (c) {
+        case '+':
+            sb.append(' ');
+            i++;
+            needToChange = true;
+            break;
+        case '%':
+            try {
+                if (bytes == null) bytes = new byte[(numChars-i)/3];
+                int pos = 0;
+                while (((i+2) < numChars) && (c=='%')) {
+                    final int v = Integer.parseInt(s.substring(i+1,i+3),16);
+                    if (v < 0) throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value");
+                    bytes[pos++] = (byte) v;
+                    i+= 3;
+                    if (i < numChars) c = s.charAt(i);
+                }
+                if ((i < numChars) && (c=='%')) throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern");
+                sb.append(new String(bytes, 0, pos, charset));
+            } catch (final NumberFormatException e) {
+                throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage());
+            }
+            needToChange = true;
+            break;
+        default:
+            sb.append(c);
+            i++;
+            break;
+        }
+    }
+    return (needToChange ? sb.toString() : s);
+}
 }
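
A hedged usage sketch of the new helper (example strings are mine, not from the commit). Unlike java.net.URLDecoder.decode(s, "UTF-8"), decodeURL declares no checked exception, returns the original String when nothing needed decoding, and reports malformed escapes with an unchecked IllegalArgumentException:

    UTF8.decodeURL("hello%20world"); // -> "hello world"
    UTF8.decodeURL("a+b");           // -> "a b" ('+' decodes to a space)
    UTF8.decodeURL("plain");         // -> same instance, needToChange stays false
    try {
        UTF8.decodeURL("bad%2");     // incomplete trailing escape
    } catch (final IllegalArgumentException e) {
        // "URLDecoder: Incomplete trailing escape (%) pattern"
    }

Decode failures thus become unchecked programming-level errors, which is what lets the call sites above drop their try/catch blocks.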
