- clean-up of robots.txt parser

- added 'yacybot' as key to recognize robots.txt entries for YaCy
- removed unused method to get robots.txt from database

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6565 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 2113fcd7e5
commit bc96d74813

@ -172,7 +172,7 @@ public class Latency {
if (!local && host != null) waiting += host.flux(waiting); if (!local && host != null) waiting += host.flux(waiting);
// find the delay as given by robots.txt on target site // find the delay as given by robots.txt on target site
long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.crawlDelayMillis(url); long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
waiting = Math.max(waiting, robotsDelay); waiting = Math.max(waiting, robotsDelay);
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server
@ -216,7 +216,7 @@ public class Latency {
if (!local && host != null) s.append(", flux = ").append(host.flux(waiting)); if (!local && host != null) s.append(", flux = ").append(host.flux(waiting));
// find the delay as given by robots.txt on target site // find the delay as given by robots.txt on target site
long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.crawlDelayMillis(url); long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
s.append(", robots.delay = ").append(robotsDelay); s.append(", robots.delay = ").append(robotsDelay);
// use the access latency as rule how fast we can access the server // use the access latency as rule how fast we can access the server

@ -228,12 +228,6 @@ public class RobotsTxt {
return robotsTxt4Host; return robotsTxt4Host;
} }
public long crawlDelayMillis(final DigestURI theURL) {
final String urlHostPort = getHostPort(theURL);
final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
return robotsEntry.getCrawlDelayMillis();
}
private RobotsEntry addEntry( private RobotsEntry addEntry(
final String hostName, final String hostName,
final ArrayList<String> allowPathList, final ArrayList<String> allowPathList,
@ -309,17 +303,9 @@ public class RobotsTxt {
public Long getCrawlDelayMillis(final DigestURI theURL) { public Long getCrawlDelayMillis(final DigestURI theURL) {
if (theURL == null) throw new IllegalArgumentException(); if (theURL == null) throw new IllegalArgumentException();
Long crawlDelay = null;
// generating the hostname:port string needed to do a DB lookup
final String urlHostPort = getHostPort(theURL); final String urlHostPort = getHostPort(theURL);
final RobotsEntry robotsTxt4Host = getEntry(urlHostPort, true); final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
return robotsEntry.getCrawlDelayMillis();
try {
crawlDelay = robotsTxt4Host.getCrawlDelayMillis();
} catch (final NumberFormatException e) {/* ignore this */}
return crawlDelay;
} }
public boolean isDisallowed(final DigestURI nexturl) { public boolean isDisallowed(final DigestURI nexturl) {

@ -60,7 +60,7 @@ public final class robotsParser {
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase(); public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#"; public static final String ROBOTS_COMMENT = "#";
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase(); public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase(); public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
private ArrayList<String> allowList; private ArrayList<String> allowList;
private ArrayList<String> denyList; private ArrayList<String> denyList;
@ -107,27 +107,48 @@ public final class robotsParser {
inBlock = false; inBlock = false;
try { try {
while ((line = reader.readLine()) != null) { lineparser: while ((line = reader.readLine()) != null) {
// replacing all tabs with spaces // replacing all tabs with spaces
line = line.replaceAll("\t"," ").trim(); line = line.replaceAll("\t"," ").trim();
lineUpper = line.toUpperCase(); lineUpper = line.toUpperCase();
// parse empty line
if (line.length() == 0) { if (line.length() == 0) {
// OLD: we have reached the end of the rule block // we have reached the end of the rule block
// rule4Yacy = false; inBlock = false; if (rule4YaCyFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
}
continue lineparser;
}
// NEW: just ignore it // parse comment
} else if (line.startsWith(ROBOTS_COMMENT)) { if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line // we can ignore this. Just a comment line
} else if (lineUpper.startsWith(ROBOTS_SITEMAP)) { continue lineparser;
}
// parse sitemap
if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
pos = line.indexOf(" "); pos = line.indexOf(" ");
if (pos != -1) { if (pos != -1) {
sitemap = line.substring(pos).trim(); sitemap = line.substring(pos).trim();
} }
} else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) { continue lineparser;
}
// parse user agent
if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) { if (inBlock) {
// we have detected the start of a new block // we have detected the start of a new block
if (rule4YaCyFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
}
inBlock = false; inBlock = false;
isRule4AllAgents = false; isRule4AllAgents = false;
isRule4YaCyAgent = false; isRule4YaCyAgent = false;
@ -144,9 +165,14 @@ public final class robotsParser {
final String userAgent = line.substring(pos).trim(); final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals("*"); isRule4AllAgents |= userAgent.equals("*");
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0;
if (isRule4YaCyAgent) rule4YaCyFound = true; if (isRule4YaCyAgent) rule4YaCyFound = true;
} }
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) { continue lineparser;
}
// parse crawl delay
if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
inBlock = true; inBlock = true;
if (isRule4YaCyAgent || isRule4AllAgents) { if (isRule4YaCyAgent || isRule4AllAgents) {
pos = line.indexOf(" "); pos = line.indexOf(" ");
@ -159,8 +185,11 @@ public final class robotsParser {
} }
} }
} }
} else if (lineUpper.startsWith(ROBOTS_DISALLOW) || continue lineparser;
lineUpper.startsWith(ROBOTS_ALLOW)) { }
// parse disallow
if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true; inBlock = true;
final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW); final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
@ -169,12 +198,12 @@ public final class robotsParser {
pos = line.indexOf(ROBOTS_COMMENT); pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim(); if (pos != -1) line = line.substring(0,pos).trim();
// cutting off trailing * // cut off trailing *
if (line.endsWith("*")) line = line.substring(0,line.length()-1); if (line.endsWith("*")) line = line.substring(0,line.length()-1);
// getting the path // parse the path
pos = line.indexOf(" "); pos = line.indexOf(" ");
if (pos != -1) { if (pos >= 0) {
// getting the path // getting the path
String path = line.substring(pos).trim(); String path = line.substring(pos).trim();
@ -201,6 +230,7 @@ public final class robotsParser {
} }
} }
} }
continue lineparser;
} }
} }
} catch (final IOException e) {} } catch (final IOException e) {}

Loading…
Cancel
Save