diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 4db98c48d..f5bfd0a4f 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -29,16 +29,14 @@
 //if the shell's current path is HTROOT
 
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
-import java.net.URLDecoder;
 import java.util.Date;
 
+import net.yacy.cora.document.UTF8;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
 import net.yacy.search.index.Segments;
 
@@ -98,11 +96,7 @@ public class QuickCrawlLink_p {
 
         // get the URL
         String crawlingStart = post.get("url",null);
-        try {
-            crawlingStart = URLDecoder.decode(crawlingStart, "UTF-8");
-        } catch (final UnsupportedEncodingException e) {
-            Log.logException(e);
-        }
+        crawlingStart = UTF8.decodeURL(crawlingStart);
 
         // get the browser title
         final String title = post.get("title",null);
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index a25fc4fda..aa66fb061 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -29,9 +29,7 @@
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
-import java.net.URLDecoder;
 import java.util.Collection;
 import java.util.Enumeration;
 import java.util.Iterator;
@@ -358,14 +356,12 @@ public class ViewFile {
 
         if (words.length() > 1 && words.charAt(0) == '[' && words.charAt(words.length() - 1) == ']') {
             words = words.substring(1, words.length() - 1);
         }
-        try {
-            words = URLDecoder.decode(words, "UTF-8");
-            if (words.indexOf(' ') >= 0) return words.split(" ");
-            if (words.indexOf(',') >= 0) return words.split(",");
-            if (words.indexOf('+') >= 0) return words.split("\\+");
-            w = new String[1];
-            w[0] = words;
-        } catch (final UnsupportedEncodingException e) {}
+        words = UTF8.decodeURL(words);
+        if (words.indexOf(' ') >= 0) return words.split(" ");
+        if (words.indexOf(',') >= 0) return words.split(",");
+        if (words.indexOf('+') >= 0) return words.split("\\+");
+        w = new String[1];
+        w[0] = words;
         return w;
     }
diff --git a/source/de/anomic/crawler/RobotsTxtParser.java b/source/de/anomic/crawler/RobotsTxtParser.java
index 390010da3..fa0a16964 100644
--- a/source/de/anomic/crawler/RobotsTxtParser.java
+++ b/source/de/anomic/crawler/RobotsTxtParser.java
@@ -1,24 +1,24 @@
 /*
-    robotsParser.java
+    robotsParser.java
     -------------------------------------
    part of YACY
-
+
    (C) 2005, 2006 by Alexander Schier
                      Martin Thelian
-
+
    last change: $LastChangedDate$LastChangedBy: orbiter $
    Revision: $LastChangedRevision$
-
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
-
+
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General public License for more details.
-
+
    You should have received a copy of the GNU General private License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
@@ -35,48 +35,49 @@
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Set;
 import java.util.regex.Pattern;
 
+import net.yacy.cora.document.UTF8;
+
 /*
  * A class for Parsing robots.txt files.
  * It only parses the Deny Part, yet.
- *
+ *
  * Robots RFC
  * http://www.robotstxt.org/wc/norobots-rfc.html
- *
+ *
  * TODO:
  *      - On the request attempt resulted in temporary failure a robot
  *      should defer visits to the site until such time as the resource
  *      can be retrieved.
- *
- *      - Extended Standard for Robot Exclusion
+ *
+ *      - Extended Standard for Robot Exclusion
  *      See: http://www.conman.org/people/spc/robots2.html
- *
- *      - Robot Exclusion Standard Revisited
+ *
+ *      - Robot Exclusion Standard Revisited
  *      See: http://www.kollar.com/robots.html
  */
 public final class RobotsTxtParser {
-
+
     private static final Pattern patternTab = Pattern.compile("\t");
-
+
     private static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
     private static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
     private static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
     private static final String ROBOTS_COMMENT = "#";
     private static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
     private static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
-
+
     private final ArrayList<String> allowList;
     private final ArrayList<String> denyList;
     private String sitemap;
     private long crawlDelayMillis;
     private final Set<String> myNames; // a list of own name lists
     private String agentName; // the name of the agent that was used to return the result
-
+
     protected RobotsTxtParser(final byte[] robotsTxt, final Set<String> myNames) {
         this.allowList = new ArrayList<String>(0);
         this.denyList = new ArrayList<String>(0);
@@ -90,26 +91,26 @@ public final class RobotsTxtParser {
             parse(reader);
         }
     }
-
+
     private void parse(final BufferedReader reader) {
         final ArrayList<String> deny4AllAgents = new ArrayList<String>();
         final ArrayList<String> deny4ThisAgents = new ArrayList<String>();
         final ArrayList<String> allow4AllAgents = new ArrayList<String>();
         final ArrayList<String> allow4ThisAgents = new ArrayList<String>();
-
+
         int pos;
         String line = null, lineUpper = null;
         boolean isRule4AllAgents = false,
                 isRule4ThisAgents = false,
                 rule4ThisAgentsFound = false,
-                inBlock = false;
-
+                inBlock = false;
+
         try {
             lineparser: while ((line = reader.readLine()) != null) {
                 // replacing all tabs with spaces
                 line = patternTab.matcher(line).replaceAll(" ").trim();
                 lineUpper = line.toUpperCase();
-
+
                 // parse empty line
                 if (line.length() == 0) {
                     // we have reached the end of the rule block
@@ -120,26 +121,26 @@ public final class RobotsTxtParser {
                     }
                     continue lineparser;
                 }
-
+
                 // parse comment
                 if (line.startsWith(ROBOTS_COMMENT)) {
                     // we can ignore this. Just a comment line
                     continue lineparser;
                 }
-
+
                 // parse sitemap; if there are several sitemaps then take the first url
                 // TODO: support for multiple sitemaps
-                if (lineUpper.startsWith(ROBOTS_SITEMAP) && (sitemap == null || sitemap.length() == 0)) {
+                if (lineUpper.startsWith(ROBOTS_SITEMAP) && (this.sitemap == null || this.sitemap.length() == 0)) {
                     pos = line.indexOf(' ');
                     if (pos != -1) {
-                        sitemap = line.substring(pos).trim();
+                        this.sitemap = line.substring(pos).trim();
                     }
                     continue lineparser;
                 }
-
+
                 // parse user agent
                 if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
-
+
                     if (inBlock) {
                         // we have detected the start of a new block
                         if (rule4ThisAgentsFound) {
@@ -147,23 +148,23 @@ public final class RobotsTxtParser {
                             // or global settings which shall not overwrite YaCys settings.
                             break lineparser;
                         }
-
+
                         inBlock = false;
                         isRule4AllAgents = false;
                         isRule4ThisAgents = false;
-                        crawlDelayMillis = 0; // each block has a separate delay
+                        this.crawlDelayMillis = 0; // each block has a separate delay
                     }
-
+
                     // cutting off comments at the line end
                     pos = line.indexOf(ROBOTS_COMMENT);
                     if (pos != -1) line = line.substring(0,pos).trim();
-
+
                     // getting out the robots name
                     pos = line.indexOf(' ');
                     if (pos != -1) {
                         final String userAgent = line.substring(pos).trim();
                         isRule4AllAgents |= userAgent.equals("*");
-                        for (String agent: this.myNames) {
+                        for (final String agent: this.myNames) {
                             if (userAgent.toLowerCase().equals(agent)) {
                                 this.agentName = agent;
                                 isRule4ThisAgents = true;
@@ -174,7 +175,7 @@ public final class RobotsTxtParser {
                     }
                     continue lineparser;
                 }
-
+
                 // parse crawl delay
                 if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
                     inBlock = true;
@@ -183,7 +184,7 @@ public final class RobotsTxtParser {
                     if (pos != -1) {
                         try {
                             // the crawl delay can be a float number and means number of seconds
-                            crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
+                            this.crawlDelayMillis = (long) (1000.0 * Float.parseFloat(line.substring(pos).trim()));
                         } catch (final NumberFormatException e) {
                             // invalid crawling delay
                         }
@@ -191,39 +192,39 @@ public final class RobotsTxtParser {
                    }
                    continue lineparser;
                }
-
+
                 // parse disallow
                 if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
                     inBlock = true;
                     final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
-
+
                     if (isRule4ThisAgents || isRule4AllAgents) {
                         // cutting off comments at the line end
                         pos = line.indexOf(ROBOTS_COMMENT);
                         if (pos != -1) line = line.substring(0,pos).trim();
-
+
                         // cut off tailing *
                         if (line.endsWith("*")) line = line.substring(0,line.length()-1);
-
+
                         // parse the path
                         pos = line.indexOf(' ');
                         if (pos >= 0) {
                             // getting the path
                             String path = line.substring(pos).trim();
-
+
                             // unencoding all special charsx
                             try {
-                                path = URLDecoder.decode(path, "UTF-8");
+                                path = UTF8.decodeURL(path);
                             } catch (final Exception e) {
                                 /*
                                  * url decoding failed. E.g. because of
                                  * "Incomplete trailing escape (%) pattern"
                                  */
                             }
-
+
                             // escaping all occurences of ; because this char is used as special char in the Robots DB
                             path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
-
+
                             // adding it to the pathlist
                             if (isDisallowRule) {
                                 if (isRule4AllAgents) deny4AllAgents.add(path);
@@ -238,11 +239,11 @@ public final class RobotsTxtParser {
                 }
             }
         } catch (final IOException e) {}
-
-        allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
-        denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
+
+        this.allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
+        this.denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
     }
-
+
     /**
     * a crawl delay can be assigned to every agent or for all agents
     * a special case is where the user agent of this yacy peer is given explicitely
@@ -253,7 +254,7 @@ public final class RobotsTxtParser {
     protected long crawlDelayMillis() {
         return this.crawlDelayMillis;
     }
-
+
     /**
     * the user agent that was applied to get the crawl properties is recorded
     * because it is possible that this robots.txt parser applies to several user agents
@@ -264,15 +265,15 @@ public final class RobotsTxtParser {
     protected String agentName() {
         return this.agentName;
     }
-
+
     protected String sitemap() {
         return this.sitemap;
     }
-
+
     protected ArrayList<String> allowList() {
         return this.allowList;
     }
-
+
     protected ArrayList<String> denyList() {
         return this.denyList;
     }
diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java
index 3f082de7a..092825c22 100644
--- a/source/de/anomic/http/server/HTTPDFileHandler.java
+++ b/source/de/anomic/http/server/HTTPDFileHandler.java
@@ -69,13 +69,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.StringWriter;
-import java.io.UnsupportedEncodingException;
 import java.lang.ref.SoftReference;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.net.URLDecoder;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -266,13 +264,7 @@ public final class HTTPDFileHandler {
             return;
         }
 
-        // url decoding of path
-        try {
-            path = URLDecoder.decode(path, "UTF-8");
-        } catch (final UnsupportedEncodingException e) {
-            // This should never occur
-            assert(false) : "UnsupportedEncodingException: " + e.getMessage();
-        }
+        path = UTF8.decodeURL(path);
 
         // check against hack attacks in path
         if (path.indexOf("..") >= 0) {
@@ -538,8 +530,8 @@ public final class HTTPDFileHandler {
         // implement proxy via url (not in servlet, because we need binary access on ouputStream)
         if (path.equals("/proxy.html")) {
             final List<Pattern> urlProxyAccess = Domains.makePatterns(sb.getConfig("proxyURL.access", "127.0.0.1"));
-            UserDB.Entry user = sb.userDB.getUser(requestHeader);
-            boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
+            final UserDB.Entry user = sb.userDB.getUser(requestHeader);
+            final boolean user_may_see_proxyurl = Domains.matchesList(clientIP, urlProxyAccess) || (user!=null && user.hasRight(UserDB.AccessRight.PROXY_RIGHT));
             if (sb.getConfigBool("proxyURL", false) && user_may_see_proxyurl) {
                 doURLProxy(args, conProp, requestHeader, out);
                 return;
@@ -1308,7 +1300,7 @@ public final class HTTPDFileHandler {
     * not in separete servlet, because we need access to binary outstream
     * @throws IOException
     */
-    private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, OutputStream out) throws IOException {
+    private static void doURLProxy(final serverObjects args, final HashMap<String, Object> conProp, final RequestHeader requestHeader, final OutputStream out) throws IOException {
         final String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
 
         URL proxyurl = null;
@@ -1325,7 +1317,7 @@ public final class HTTPDFileHandler {
         }
         String host = proxyurl.getHost();
         if (proxyurl.getPort() != -1) {
-            host += ":" + proxyurl.getPort();
+            host += ":" + proxyurl.getPort();
         }
 
         // set properties for proxy connection
@@ -1430,7 +1422,7 @@ public final class HTTPDFileHandler {
 
                 } else if (url.startsWith("//")) {
                     // absoulte url but same protocol of form href="//domain.com/path"
-                    String complete_url = proxyurl.getProtocol() + ":" + url;
+                    final String complete_url = proxyurl.getProtocol() + ":" + url;
                     if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
                         if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(complete_url)) != null) {
                             continue;
@@ -1455,7 +1447,7 @@ public final class HTTPDFileHandler {
                         newurl = newurl.replaceAll("\\$","\\\\\\$");
                         m.appendReplacement(result, newurl);
                     }
-                    catch (MalformedURLException e) {}
+                    catch (final MalformedURLException e) {}
                 }
             }
 
@@ -1466,7 +1458,7 @@ public final class HTTPDFileHandler {
 
             if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
                 HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);
-                ChunkedOutputStream cos = new ChunkedOutputStream(out);
+                final ChunkedOutputStream cos = new ChunkedOutputStream(out);
                 cos.write(sbb);
                 cos.finish();
 
diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java
index 300ed8e46..57cc12cbc 100644
--- a/source/net/yacy/cora/document/MultiProtocolURI.java
+++ b/source/net/yacy/cora/document/MultiProtocolURI.java
@@ -127,6 +127,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolURI> {
+    /**
+     * Decodes a <code>application/x-www-form-urlencoded</code> string using a specific
+     * encoding scheme.
+     */
+    public static String decodeURL(final String s) {
+        boolean needToChange = false;
+        final int numChars = s.length();
+        final StringBuffer sb = new StringBuffer(numChars > 500 ? numChars / 2 : numChars);
+        int i = 0;
+        char c;
+        byte[] bytes = null;
+        while (i < numChars) {
+            c = s.charAt(i);
+            switch (c) {
+            case '+':
+                sb.append(' ');
+                i++;
+                needToChange = true;
+                break;
+            case '%':
+                try {
+                    if (bytes == null) bytes = new byte[(numChars-i)/3];
+                    int pos = 0;
+                    while (((i+2) < numChars) && (c=='%')) {
+                        final int v = Integer.parseInt(s.substring(i+1,i+3),16);
+                        if (v < 0) throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - negative value");
+                        bytes[pos++] = (byte) v;
+                        i+= 3;
+                        if (i < numChars) c = s.charAt(i);
+                    }
+                    if ((i < numChars) && (c=='%')) throw new IllegalArgumentException("URLDecoder: Incomplete trailing escape (%) pattern");
+                    sb.append(new String(bytes, 0, pos, charset));
+                } catch (final NumberFormatException e) {
+                    throw new IllegalArgumentException("URLDecoder: Illegal hex characters in escape (%) pattern - " + e.getMessage());
+                }
+                needToChange = true;
+                break;
+            default:
+                sb.append(c);
+                i++;
+                break;
+            }
+        }
+
+        return (needToChange? sb.toString() : s);
+    }
+
 }
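
The change is mechanical at every call site: java.net.URLDecoder.decode(s, "UTF-8") declares the checked UnsupportedEncodingException, which can never actually fire for UTF-8 (every JVM must support that charset), so each caller carried a try/catch that logged, asserted, or silently swallowed an impossible error. The new decodeURL is hard-wired to UTF-8 and throws only the unchecked IllegalArgumentException on malformed % escapes, so those wrappers can be dropped. A minimal before/after sketch of the call-site pattern (the demo class is hypothetical and not part of this commit; UTF8 refers to net.yacy.cora.document.UTF8, which the changed files import):

    import java.io.UnsupportedEncodingException;
    import java.net.URLDecoder;

    import net.yacy.cora.document.UTF8;

    public class DecodeURLDemo {
        public static void main(final String[] args) {
            final String encoded = "crawl%20start+%C3%BCber%3Fq%3Da%2Bb";

            // before: the checked exception forces boilerplate for a case that cannot happen
            String before = encoded;
            try {
                before = URLDecoder.decode(encoded, "UTF-8");
            } catch (final UnsupportedEncodingException e) {
                // unreachable: UTF-8 support is mandatory on every JVM
            }

            // after: one unchecked call with the same decoding rules
            // ('+' -> space, %XX byte runs -> UTF-8 characters)
            final String after = UTF8.decodeURL(encoded);

            System.out.println(before); // crawl start über?q=a+b
            System.out.println(after);  // crawl start über?q=a+b
        }
    }

Note that error behavior at the call sites is unchanged: both URLDecoder.decode and decodeURL throw IllegalArgumentException on an incomplete trailing % escape, and the removed catch blocks only ever handled UnsupportedEncodingException. RobotsTxtParser is the one caller that keeps its catch-all (catch (final Exception e)) around the call, so a broken escape in a robots.txt path still cannot abort parsing.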