normalization of URLs using URL encoding/decoding

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8017 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent e58438c01c
commit 37e35f2741
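In short: call sites that previously wrapped java.net.URLDecoder.decode in a try/catch now call the new UTF8.decodeURL helper introduced at the bottom of this commit, and MultiProtocolURI applies it once to every incoming URL. A minimal before/after sketch (the DecodeSketch class and its methods are illustrative, not part of the commit):

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import net.yacy.cora.document.UTF8;

final class DecodeSketch {
    // Before: the checked exception forced a try/catch at every call site.
    static String decodeOld(final String s) {
        try {
            return URLDecoder.decode(s, "UTF-8");
        } catch (final UnsupportedEncodingException e) {
            return s; // unreachable for "UTF-8", but must be handled
        }
    }
    // After: the new helper throws no checked exception.
    static String decodeNew(final String s) {
        return UTF8.decodeURL(s);
    }
}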

@@ -29,16 +29,14 @@
//if the shell's current path is HTROOT
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
@@ -98,11 +96,7 @@ public class QuickCrawlLink_p {
// get the URL
String crawlingStart = post.get("url",null);
try {
crawlingStart = URLDecoder.decode(crawlingStart, "UTF-8");
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
}
crawlingStart = UTF8.decodeURL(crawlingStart);
// get the browser title
final String title = post.get("title",null);

@@ -29,9 +29,7 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Collection;
import java.util.Enumeration;
import java.util.Iterator;
@@ -358,14 +356,12 @@ public class ViewFile {
if (words.length() > 1 && words.charAt(0) == '[' && words.charAt(words.length() - 1) == ']') {
words = words.substring(1, words.length() - 1);
}
try {
words = URLDecoder.decode(words, "UTF-8");
if (words.indexOf(' ') >= 0) return words.split(" ");
if (words.indexOf(',') >= 0) return words.split(",");
if (words.indexOf('+') >= 0) return words.split("\\+");
w = new String[1];
w[0] = words;
} catch (final UnsupportedEncodingException e) {}
words = UTF8.decodeURL(words);
if (words.indexOf(' ') >= 0) return words.split(" ");
if (words.indexOf(',') >= 0) return words.split(",");
if (words.indexOf('+') >= 0) return words.split("\\+");
w = new String[1];
w[0] = words;
return w;
}
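The method above decodes the keyword list once and then splits on the first separator it finds. Note that UTF8.decodeURL already turns an encoded '+' into a space, so the '+' branch only fires for a literal plus that survived decoding. A standalone sketch of the splitting order (sample input is hypothetical):

import net.yacy.cora.document.UTF8;

public class SplitSketch {
    public static void main(final String[] args) {
        final String words = UTF8.decodeURL("foo+bar,baz"); // '+' -> ' ': "foo bar,baz"
        String[] w;
        if (words.indexOf(' ') >= 0) w = words.split(" ");        // ["foo", "bar,baz"]
        else if (words.indexOf(',') >= 0) w = words.split(",");
        else if (words.indexOf('+') >= 0) w = words.split("\\+");
        else w = new String[] { words };
        System.out.println(java.util.Arrays.toString(w));
    }
}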

@@ -1,24 +1,24 @@
/*
robotsParser.java
-------------------------------------
part of YaCy
(C) 2005, 2006 by Alexander Schier
Martin Thelian
last change: $LastChangedDate$
by $LastChangedBy: orbiter $
Revision: $LastChangedRevision$
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@@ -35,48 +35,49 @@ import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
import net.yacy.cora.document.UTF8;
/*
* A class for parsing robots.txt files.
* So far, it only parses the Deny part.
*
* Robots RFC
* http://www.robotstxt.org/wc/norobots-rfc.html
*
* TODO:
* - If a request attempt results in a temporary failure, a robot
* should defer visits to the site until such time as the resource
* can be retrieved.
*
* - Extended Standard for Robot Exclusion
* See: http://www.conman.org/people/spc/robots2.html
*
* - Robot Exclusion Standard Revisited
* See: http://www.kollar.com/robots.html
*/
public final class RobotsTxtParser {
private static final Pattern patternTab = Pattern.compile("\t");
private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
private static final String ROBOTS_COMMENT = "#";
private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
private final ArrayList<String> allowList;
private final ArrayList<String> denyList;
private String sitemap;
private long crawlDelayMillis;
private final Set<String> myNames; // the set of this peer's own agent names
private String agentName; // the name of the agent that was used to return the result
protected RobotsTxtParser(final byte[] robotsTxt, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
@@ -90,26 +91,26 @@ public final class RobotsTxtParser {
parse(reader);
}
}
private void parse(final BufferedReader reader) {
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();
final ArrayList<String> allow4AllAgents = new ArrayList<String>();
final ArrayList<String> allow4ThisAgents = new ArrayList<String>();
int pos;
String line = null, lineUpper = null;
boolean isRule4AllAgents = false,
isRule4ThisAgents = false,
rule4ThisAgentsFound = false,
inBlock = false;
try {
lineparser: while ((line = reader.readLine()) != null) {
// replacing all tabs with spaces
line = patternTab.matcher(line).replaceAll(" ").trim();
lineUpper = line.toUpperCase();
// parse empty line
if (line.length() == 0) {
// we have reached the end of the rule block
@@ -120,26 +121,26 @@ public final class RobotsTxtParser {
}
continue lineparser;
}
// parse comment
if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
continue lineparser;
}
// parse sitemap; if there are several sitemaps then take the first url
// TODO: support for multiple sitemaps
if (lineUpper.startsWith(ROBOTS_SITEMAP) && (sitemap == null || sitemap.length() == 0)) {
if (lineUpper.startsWith(ROBOTS_SITEMAP) && (this.sitemap == null || this.sitemap.length() == 0)) {
pos = line.indexOf(' ');
if (pos != -1) {
sitemap = line.substring(pos).trim();
this.sitemap = line.substring(pos).trim();
}
continue lineparser;
}
// parse user agent
if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
// we have detected the start of a new block
if (rule4ThisAgentsFound) {
@@ -147,23 +148,23 @@ public final class RobotsTxtParser {
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
}
inBlock = false;
isRule4AllAgents = false;
isRule4ThisAgents = false;
crawlDelayMillis = 0; // each block has a separate delay
this.crawlDelayMillis = 0; // each block has a separate delay
}
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// getting out the robots name
pos = line.indexOf(' ');
if (pos != -1) {
final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals("*");
for (String agent: this.myNames) {
for (final String agent: this.myNames) {
if (userAgent.toLowerCase().equals(agent)) {
this.agentName = agent;
isRule4ThisAgents = true;
@@ -174,7 +175,7 @@ public final class RobotsTxtParser {
}
continue lineparser;
}
// parse crawl delay
if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
inBlock = true;
@@ -183,7 +184,7 @@ public final class RobotsTxtParser {
if (pos != -1) {
try {
// the crawl delay can be a float number and means number of seconds
crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
} catch (final NumberFormatException e) {
// invalid crawling delay
}
@@ -191,39 +192,39 @@ public final class RobotsTxtParser {
}
continue lineparser;
}
// parse disallow
if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRule4ThisAgents || isRule4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// cut off trailing *
if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// parse the path
pos = line.indexOf(' ');
if (pos >= 0) {
// getting the path
String path = line.substring(pos).trim();
// decoding all special chars
try {
path = URLDecoder.decode(path, "UTF-8");
path = UTF8.decodeURL(path);
} catch (final Exception e) {
/*
* url decoding failed. E.g. because of
* "Incomplete trailing escape (%) pattern"
*/
}
// escaping all occurrences of ';' because this char is used as a special char in the Robots DB
path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
// adding it to the pathlist
if (isDisallowRule) {
if (isRule4AllAgents) deny4AllAgents.add(path);
@@ -238,11 +239,11 @@ public final class RobotsTxtParser {
}
}
} catch (final IOException e) {}
allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
}
/**
* a crawl delay can be assigned to every agent or for all agents
* a special case is where the user agent of this YaCy peer is given explicitly
@@ -253,7 +254,7 @@ public final class RobotsTxtParser {
protected long crawlDelayMillis() {
return this.crawlDelayMillis;
}
/**
* the user agent that was applied to get the crawl properties is recorded
* because it is possible that this robots.txt parser applies to several user agents
@@ -264,15 +265,15 @@ public final class RobotsTxtParser {
protected String agentName() {
return this.agentName;
}
protected String sitemap() {
return this.sitemap;
}
protected ArrayList<String> allowList() {
return this.allowList;
}
protected ArrayList<String> denyList() {
return this.denyList;
}
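The Crawl-delay value in the parser above is read as a float number of seconds and stored as milliseconds. A small standalone check of that conversion, mirroring the parsing line in the diff (the sample robots.txt line is hypothetical):

public class CrawlDelaySketch {
    public static void main(final String[] args) {
        final String line = "Crawl-delay: 2.5";
        long crawlDelayMillis = 0;
        final int pos = line.indexOf(' ');
        if (pos != -1) {
            try {
                // seconds (possibly fractional) -> milliseconds
                crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
            } catch (final NumberFormatException e) {
                // invalid crawl delay: keep 0, i.e. no delay
            }
        }
        System.out.println(crawlDelayMillis); // prints 2500
    }
}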

@@ -69,13 +69,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.lang.ref.SoftReference;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@@ -266,13 +264,7 @@ public final class HTTPDFileHandler {
return;
}
// url decoding of path
try {
path = URLDecoder.decode(path, "UTF-8");
} catch (final UnsupportedEncodingException e) {
// This should never occur
assert(false) : "UnsupportedEncodingException: " + e.getMessage();
}
path = UTF8.decodeURL(path);
// check against hack attacks in path
if (path.indexOf("..") >= 0) {
@@ -538,8 +530,8 @@ public final class HTTPDFileHandler {
// implement proxy via url (not in a servlet, because we need binary access to the output stream)
if (path.equals("/proxy.html")) {
final List<Pattern> urlProxyAccess = Domains.makePatterns(sb.getConfig("proxyURL.access", "127.0.0.1"));
UserDB.Entry user = sb.userDB.getUser(requestHeader);
boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
final UserDB.Entry user = sb.userDB.getUser(requestHeader);
final boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
if (sb.getConfigBool("proxyURL", false) && user_may_see_proxyurl) {
doURLProxy(args, conProp, requestHeader, out);
return;
@@ -1308,7 +1300,7 @@ public final class HTTPDFileHandler {
* not in a separate servlet, because we need access to the binary output stream
* @throws IOException
*/
private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, OutputStream out) throws IOException {
private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream out) throws IOException {
final String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
URL proxyurl = null;
@@ -1325,7 +1317,7 @@ public final class HTTPDFileHandler {
}
String host = proxyurl.getHost();
if (proxyurl.getPort() != -1) {
host += ":" + proxyurl.getPort();
}
// set properties for proxy connection
@@ -1430,7 +1422,7 @@ public final class HTTPDFileHandler {
} else if (url.startsWith("//")) {
// absolute url with the same protocol, of the form href="//domain.com/path"
String complete_url = proxyurl.getProtocol() + ":" + url;
final String complete_url = proxyurl.getProtocol() + ":" + url;
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(complete_url)) != null) {
continue;
@@ -1455,7 +1447,7 @@ public final class HTTPDFileHandler {
newurl = newurl.replaceAll("\\$","\\\\\\$");
m.appendReplacement(result, newurl);
}
catch (MalformedURLException e) {}
catch (final MalformedURLException e) {}
}
}
@@ -1466,7 +1458,7 @@ public final class HTTPDFileHandler {
if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);
ChunkedOutputStream cos = new ChunkedOutputStream(out);
final ChunkedOutputStream cos = new ChunkedOutputStream(out);
cos.write(sbb);
cos.finish();
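One subtlety in the URL-rewriting loop above: Matcher.appendReplacement treats '$' in the replacement string as a group reference, so literal dollar signs in the proxied URL must be escaped first; that is what the replaceAll("\\$", "\\\\\\$") line does. A standalone illustration (the pattern, input, and URL are hypothetical sample values):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class EscapeSketch {
    public static void main(final String[] args) {
        String newurl = "/proxy.html?url=http://host/page?p=$v"; // contains a literal '$'
        newurl = newurl.replaceAll("\\$", "\\\\\\$");            // '$' -> '\$'
        final StringBuffer result = new StringBuffer();
        final Matcher m = Pattern.compile("href=\"([^\"]*)\"").matcher("<a href=\"old\">");
        while (m.find()) {
            // without the escape above, the bare '$' would make
            // appendReplacement throw "Illegal group reference"
            m.appendReplacement(result, "href=\"" + newurl + "\"");
        }
        m.appendTail(result);
        System.out.println(result);
    }
}

Matcher.quoteReplacement(newurl), available since Java 5, performs the same escaping for both '$' and '\' in one call.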

@@ -127,6 +127,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
// identify protocol
assert (url != null);
url = url.trim();
url = UTF8.decodeURL(url); // normalization here
//url = patternSpace.matcher(url).replaceAll(" ");
if (url.startsWith("\\\\")) {
url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/");

@@ -154,4 +154,51 @@ public class UTF8 {
return s.getBytes(charset);
}
/**
* Decodes an <code>application/x-www-form-urlencoded</code> string using the
* UTF-8 encoding scheme.
*/
public static String decodeURL(final String s) {
boolean needToChange = false;
final int numChars = s.length();
final StringBuffer sb = new StringBuffer(numChars > 500 ? numChars / 2 : numChars);
int i = 0;
char c;
byte[] bytes = null;
while (i < numChars) {
c = s.charAt(i);
switch (c) {
case '+':
sb.append(' ');
i++;
needToChange = true;
break;
case '%':
try {
if (bytes == null) bytes = new byte[(numChars-i)/3];
int pos = 0;
while (((i+2) < numChars) && (c=='%')) {
final int v = Integer.parseInt(s.substring(i+1,i+3),16);
if (v < 0) throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value");
bytes[pos++] = (byte) v;
i+= 3;
if (i < numChars) c = s.charAt(i);
}
if ((i < numChars) && (c=='%')) throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern");
sb.append(new String(bytes, 0, pos, charset));
} catch (final NumberFormatException e) {
throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage());
}
needToChange = true;
break;
default:
sb.append(c);
i++;
break;
}
}
return (needToChange? sb.toString() : s);
}
}
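The helper mirrors java.net.URLDecoder semantics for UTF-8 but throws no checked exception, and it returns the input object unchanged when nothing needed decoding. A few sample calls with expected results in comments (input values are illustrative):

import net.yacy.cora.document.UTF8;

public class DecodeURLSamples {
    public static void main(final String[] args) {
        System.out.println(UTF8.decodeURL("a%20b"));     // "a b"
        System.out.println(UTF8.decodeURL("a+b"));       // "a b"   ('+' decodes to a space)
        System.out.println(UTF8.decodeURL("%E2%82%AC")); // "€"     (multi-byte UTF-8 sequence)
        System.out.println(UTF8.decodeURL("plain"));     // "plain" (same String instance, no copy)
        UTF8.decodeURL("bad%2"); // throws IllegalArgumentException: incomplete trailing escape
    }
}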
