diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java
index 0f9cbeab1..f83a85979 100644
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -172,7 +172,7 @@ public class Latency {
         if (!local && host != null) waiting += host.flux(waiting);
         
         // find the delay as given by robots.txt on target site
-        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.crawlDelayMillis(url);
+        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
         waiting = Math.max(waiting, robotsDelay);
         
         // use the access latency as rule how fast we can access the server
@@ -216,7 +216,7 @@ public class Latency {
         if (!local && host != null) s.append(", flux = ").append(host.flux(waiting));
         
         // find the delay as given by robots.txt on target site
-        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.crawlDelayMillis(url);
+        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
         s.append(", robots.delay = ").append(robotsDelay);
         
         // use the access latency as rule how fast we can access the server
diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java
index ad555f178..db17d1ea0 100644
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -228,12 +228,6 @@ public class RobotsTxt {
         return robotsTxt4Host;
     }
     
-    public long crawlDelayMillis(final DigestURI theURL) {
-        final String urlHostPort = getHostPort(theURL);
-        final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
-        return robotsEntry.getCrawlDelayMillis();
-    }
-    
     private RobotsEntry addEntry(
             final String hostName,
             final ArrayList allowPathList,
@@ -309,17 +303,9 @@ public class RobotsTxt {
     
     public Long getCrawlDelayMillis(final DigestURI theURL) {
         if (theURL == null) throw new IllegalArgumentException();
-        Long crawlDelay = null;
-        
-        // generating the hostname:poart string needed to do a DB lookup
         final String urlHostPort = getHostPort(theURL);
-        final RobotsEntry robotsTxt4Host = getEntry(urlHostPort, true);
-        
-        try {
-            crawlDelay = robotsTxt4Host.getCrawlDelayMillis();
-        } catch (final NumberFormatException e) {/* ignore this */}
-        
-        return crawlDelay;
+        final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
+        return robotsEntry.getCrawlDelayMillis();
     }
     
     public boolean isDisallowed(final DigestURI nexturl) {
diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java
index 74f49c91a..ce95207ee 100644
--- a/source/de/anomic/crawler/robotsParser.java
+++ b/source/de/anomic/crawler/robotsParser.java
@@ -60,7 +60,7 @@ public final class robotsParser {
    public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
    public static final String ROBOTS_COMMENT = "#";
    public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
-   public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();
+   public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
    
    private ArrayList allowList;
    private ArrayList denyList;
@@ -107,27 +107,48 @@
        inBlock = false;
        
        try {
-           while ((line = reader.readLine()) != null) {
+           lineparser: while ((line = reader.readLine()) != null) {
                // replacing all tabs with spaces
                line = line.replaceAll("\t"," ").trim();
                lineUpper = line.toUpperCase();
                
+               // parse empty line
                if (line.length() == 0) {
-                   // OLD: we have reached the end of the rule block
-                   // rule4Yacy = false; inBlock = false;
-                   
-                   // NEW: just ignore it
-               } else if (line.startsWith(ROBOTS_COMMENT)) {
+                   // we have reached the end of the rule block
+                   if (rule4YaCyFound) {
+                       // stop here because other robot blocks are either not for YaCy
+                       // or global settings which shall not overwrite YaCys settings.
+                       break lineparser;
+                   }
+                   continue lineparser;
+               }
+               
+               // parse comment
+               if (line.startsWith(ROBOTS_COMMENT)) {
                    // we can ignore this. Just a comment line
-               } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
+                   continue lineparser;
+               }
+               
+               // parse sitemap
+               if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
                    pos = line.indexOf(" ");
                    if (pos != -1) {
                        sitemap = line.substring(pos).trim();
                    }
-               } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
+                   continue lineparser;
+               }
+               
+               // parse user agent
+               if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
                    if (inBlock) {
                        // we have detected the start of a new block
+                       if (rule4YaCyFound) {
+                           // stop here because other robot blocks are either not for YaCy
+                           // or global settings which shall not overwrite YaCys settings.
+                           break lineparser;
+                       }
+                       
                        inBlock = false;
                        isRule4AllAgents = false;
                        isRule4YaCyAgent = false;
@@ -144,9 +165,14 @@
                        final String userAgent = line.substring(pos).trim();
                        isRule4AllAgents |= userAgent.equals("*");
                        isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
+                       isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0;
                        if (isRule4YaCyAgent) rule4YaCyFound = true;
                    }
-               } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
+                   continue lineparser;
+               }
+               
+               // parse crawl delay
+               if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
                    inBlock = true;
                    if (isRule4YaCyAgent || isRule4AllAgents) {
                        pos = line.indexOf(" ");
@@ -159,8 +185,11 @@
                            }
                        }
                    }
-               } else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
-                          lineUpper.startsWith(ROBOTS_ALLOW)) {
+                   continue lineparser;
+               }
+               
+               // parse disallow
+               if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
                    inBlock = true;
                    
                    final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
@@ -169,18 +198,18 @@
                    pos = line.indexOf(ROBOTS_COMMENT);
                    if (pos != -1) line = line.substring(0,pos).trim();
                    
-                   // cutting of tailing *
+                   // cut off tailing *
                    if (line.endsWith("*")) line = line.substring(0,line.length()-1);
                    
-                   // getting the path
+                   // parse the path
                    pos = line.indexOf(" ");
-                   if (pos != -1) {
+                   if (pos >= 0) {
                        // getting the path
                        String path = line.substring(pos).trim();
                        
                        // unencoding all special charsx
                        try {
-                           path = URLDecoder.decode(path,"UTF-8");
+                           path = URLDecoder.decode(path, "UTF-8");
                        } catch (final Exception e) {
                            /*
                             * url decoding failed. E.g. because of
@@ -201,6 +230,7 @@
                            }
                        }
                    }
+                   continue lineparser;
                }
            }
        } catch (final IOException e) {}
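
For reference, the behavioral core of this patch is the labeled `lineparser:` loop: once a rule block that applies to YaCy has been parsed, the parser breaks out, so a later global `User-agent: *` block can no longer overwrite agent-specific settings such as `Crawl-delay`. Below is a minimal, hypothetical sketch of that precedence rule under simplified assumptions; it is not YaCy's actual robotsParser, and the class, method, and variable names are illustrative only.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

public class CrawlDelayPrecedenceDemo {

    // simplified stand-in for the patched parser loop (hypothetical code)
    static long parseCrawlDelayMillis(final String robotsTxt) throws IOException {
        final BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
        boolean inBlock = false;        // inside a rule block
        boolean rule4YaCyFound = false; // a YaCy-specific block has been seen
        boolean applies = false;        // current block applies to us ("yacy" or "*")
        long crawlDelayMillis = 0;
        String line;

        lineparser: while ((line = reader.readLine()) != null) {
            line = line.replaceAll("\t", " ").trim();
            final String upper = line.toUpperCase();

            if (line.length() == 0) {
                // end of a rule block: stop once the YaCy block is parsed, so the
                // global settings further down cannot overwrite it
                if (rule4YaCyFound) break lineparser;
                continue lineparser;
            }
            if (upper.startsWith("USER-AGENT:")) {
                if (inBlock) {
                    // start of a new block; same early-exit rule as above
                    if (rule4YaCyFound) break lineparser;
                    inBlock = false;
                    applies = false;
                }
                final String agent = line.substring("User-agent:".length()).trim().toLowerCase();
                applies |= agent.equals("*") || agent.indexOf("yacy") >= 0;
                if (agent.indexOf("yacy") >= 0) rule4YaCyFound = true;
                continue lineparser;
            }
            if (upper.startsWith("CRAWL-DELAY:")) {
                inBlock = true;
                if (applies) {
                    try {
                        final String value = line.substring("Crawl-delay:".length()).trim();
                        crawlDelayMillis = 1000L * Integer.parseInt(value);
                    } catch (final NumberFormatException e) { /* ignore malformed value */ }
                }
                continue lineparser;
            }
            inBlock = true; // any other rule line (Allow/Disallow/...) keeps the block open
        }
        return crawlDelayMillis;
    }

    public static void main(final String[] args) throws IOException {
        final String robotsTxt =
                "User-agent: yacybot\n" +
                "Crawl-delay: 10\n" +
                "\n" +
                "User-agent: *\n" +
                "Crawl-delay: 120\n";
        // prints 10000: the yacybot block wins; the "*" block is never reached
        System.out.println(parseCrawlDelayMillis(robotsTxt));
    }
}

In this sketch, as in the patch, the `break lineparser;` is what implements the precedence: agent-specific rules win not by being compared against the wildcard block, but by terminating parsing before the wildcard block can be applied.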