*) robots.txt: adding support for crawl-delay

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3737 6c8d7289-2bf4-0310-a012-ef5d649a1542
commit 9a4375b115
parent 11ac7688d5
Author: theli
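
For context: the Crawl-delay directive handled by this commit appears in a robots.txt file as a line such as "Crawl-delay: 10" inside a User-agent block. The following standalone sketch is not part of this commit; class and method names are hypothetical. It only illustrates, under those assumptions, the same extraction approach that the patch below adds to robotsParser.parse():

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;

public class CrawlDelayExample {

    // Returns the Crawl-delay value in seconds, or null if no directive is
    // present or its value is not a valid integer.
    static Integer parseCrawlDelay(String robotsTxt) throws IOException {
        final String CRAWL_DELAY = "CRAWL-DELAY:";
        Integer crawlDelay = null;
        BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
        String line;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.toUpperCase().startsWith(CRAWL_DELAY)) {
                int pos = line.indexOf(" ");            // the value follows the first blank
                if (pos != -1) {
                    try {
                        crawlDelay = Integer.valueOf(line.substring(pos).trim());
                    } catch (NumberFormatException e) {
                        // invalid delay value: ignored, as in the parser below
                    }
                }
            }
        }
        return crawlDelay;
    }

    public static void main(String[] args) throws IOException {
        String robotsTxt =
            "User-agent: *\n" +
            "Disallow: /private/\n" +
            "Crawl-delay: 10\n";
        System.out.println("crawl delay: " + parseCrawlDelay(robotsTxt)); // prints 10
    }
}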

@@ -90,6 +90,7 @@ public final class robotsParser{
     public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
     public static final String ROBOTS_COMMENT = "#";
     public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
+    public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();
 
     /*public robotsParser(URL robotsUrl){
     }*/
@@ -121,6 +122,7 @@ public final class robotsParser{
         int pos;
         String line = null, lineUpper = null, sitemap = null;
+        Integer crawlDelay = null;
         boolean isRuleBlock4AllAgents = false,
                 isRuleBlock4YaCyAgent = false,
                 rule4YaCyFound = false,
@@ -149,6 +151,7 @@ public final class robotsParser{
                     inBlock = false;
                     isRuleBlock4AllAgents = false;
                     isRuleBlock4YaCyAgent = false;
+                    crawlDelay = null; // each block has a separate delay
                 }
 
                 // cutting off comments at the line end
@@ -166,6 +169,15 @@ public final class robotsParser{
                         isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
                         if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
                     }
+                } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
+                    pos = line.indexOf(" ");
+                    if (pos != -1) {
+                        try {
+                            crawlDelay = Integer.valueOf(line.substring(pos).trim());
+                        } catch (NumberFormatException e) {
+                            // invalid crawling delay
+                        }
+                    }
                 } else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
                            lineUpper.startsWith(ROBOTS_ALLOW)) {
                     inBlock = true;
@@ -211,7 +223,7 @@ public final class robotsParser{
         }
 
         ArrayList denyList = (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
-        return new Object[]{denyList,sitemap};
+        return new Object[]{denyList,sitemap,crawlDelay};
     }
 
     private static final int getPort(URL theURL) {
@@ -258,6 +270,27 @@ public final class robotsParser{
         return sitemapURL;
     }
 
+    public static Integer getCrawlDelay(URL theURL) {
+        if (theURL == null) throw new IllegalArgumentException();
+        Integer crawlDelay = null;
+
+        // generating the hostname:port string needed to do a DB lookup
+        String urlHostPort = getHostPort(theURL);
+
+        plasmaCrawlRobotsTxt.Entry robotsTxt4Host = null;
+        synchronized(urlHostPort) {
+            // doing a DB lookup to determine if the robots data is already available
+            robotsTxt4Host = plasmaSwitchboard.robots.getEntry(urlHostPort);
+        }
+        if (robotsTxt4Host == null) return null;
+
+        try {
+            crawlDelay = robotsTxt4Host.getCrawlDelay();
+        } catch (NumberFormatException e) {/* ignore this */}
+        return crawlDelay;
+    }
+
     public static boolean isDisallowed(URL nexturl) {
         if (nexturl == null) throw new IllegalArgumentException();
@@ -309,6 +342,7 @@ public final class robotsParser{
                 if ((robotsTxt4Host==null)||((robotsTxt4Host!=null)&&(result!=null))) {
                     ArrayList denyPath = null;
                     String sitemap = null;
+                    Integer crawlDelay = null;
                     if (accessCompletelyRestricted) {
                         denyPath = new ArrayList();
                         denyPath.add("/");
@@ -318,13 +352,14 @@ public final class robotsParser{
                             Object[] parserResult = robotsParser.parse(robotsTxt);
                             denyPath = (ArrayList) parserResult[0];
                             sitemap = (String) parserResult[1];
+                            crawlDelay = (Integer) parserResult[2];
                         } catch (IOException e) {
                             serverLog.logSevere("ROBOTS","Unable to parse the robots.txt file from URL '" + robotsURL + "'.");
                         }
                     }
 
                     // storing the data into the robots DB
-                    robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap);
+                    robotsTxt4Host = plasmaSwitchboard.robots.addEntry(urlHostPort,denyPath,new Date(),modDate,eTag,sitemap,crawlDelay);
                 }
             }
         }

@@ -84,16 +84,16 @@ public class plasmaCrawlRobotsTxt {
     }
 
     public void close() {
-        robotsTable.close();
+        this.robotsTable.close();
     }
 
     public int size() {
-        return robotsTable.size();
+        return this.robotsTable.size();
     }
 
     public void removeEntry(String hostName) {
         try {
-            robotsTable.remove(hostName.toLowerCase());
+            this.robotsTable.remove(hostName.toLowerCase());
         } catch (IOException e) {
         } catch (kelondroException e) {
@@ -103,7 +103,7 @@ public class plasmaCrawlRobotsTxt {
     public Entry getEntry(String hostName) {
         try {
-            Map record = robotsTable.getMap(hostName);
+            Map record = this.robotsTable.getMap(hostName);
             if (record == null) return null;
             return new Entry(hostName, record);
         } catch (kelondroException e) {
@@ -112,8 +112,16 @@ public class plasmaCrawlRobotsTxt {
         }
     }
 
-    public Entry addEntry(String hostName, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap) {
-        Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap);
+    public Entry addEntry(
+            String hostName,
+            ArrayList disallowPathList,
+            Date loadedDate,
+            Date modDate,
+            String eTag,
+            String sitemap,
+            Integer crawlDelay
+    ) {
+        Entry entry = new Entry(hostName,disallowPathList,loadedDate,modDate,eTag,sitemap,crawlDelay);
         addEntry(entry);
         return entry;
     }
@@ -121,7 +129,7 @@ public class plasmaCrawlRobotsTxt {
     public String addEntry(Entry entry) {
         // writes a new page and returns key
         try {
-            robotsTable.set(entry.hostName,entry.mem);
+            this.robotsTable.set(entry.hostName,entry.mem);
             return entry.hostName;
         } catch (IOException e) {
             return null;
@@ -134,11 +142,12 @@ public class plasmaCrawlRobotsTxt {
         public static final String MOD_DATE = "modDate";
         public static final String ETAG = "etag";
         public static final String SITEMAP = "sitemap";
+        public static final String CRAWL_DELAY = "crawlDelay";
 
         // this is a simple record structure that hold all properties of a single crawl start
-        private Map mem;
+        Map mem;
         private LinkedList disallowPathList;
-        private String hostName;
+        String hostName;
 
         public Entry(String hostName, Map mem) {
             this.hostName = hostName.toLowerCase();
@@ -164,8 +173,10 @@ public class plasmaCrawlRobotsTxt {
                 Date loadedDate,
                 Date modDate,
                 String eTag,
-                String sitemap) {
-            if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException();
+                String sitemap,
+                Integer crawlDelay
+        ) {
+            if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
 
             this.hostName = hostName.trim().toLowerCase();
             this.disallowPathList = new LinkedList();
@@ -175,6 +186,7 @@ public class plasmaCrawlRobotsTxt {
             if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
             if (eTag != null) this.mem.put(ETAG,eTag);
             if (sitemap != null) this.mem.put(SITEMAP,sitemap);
+            if (crawlDelay != null) this.mem.put(CRAWL_DELAY,crawlDelay.toString());
 
             if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
                 this.disallowPathList.addAll(disallowPathList);
@@ -231,6 +243,13 @@ public class plasmaCrawlRobotsTxt {
             return null;
         }
 
+        public Integer getCrawlDelay() {
+            if (this.mem.containsKey(CRAWL_DELAY)) {
+                return Integer.valueOf((String) this.mem.get(CRAWL_DELAY));
+            }
+            return null;
+        }
+
         public boolean isDisallowed(String path) {
             if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
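
How a crawler might honor the stored value: the commit only parses and persists the delay; throttling itself would happen in the fetch loop. The sketch below is a hypothetical usage example (not YaCy code, names invented) that waits out the per-host delay returned by a lookup like getCrawlDelay() before issuing the next request to the same host:

import java.util.HashMap;
import java.util.Map;

public class PoliteFetchExample {

    // last access time per host:port, in milliseconds since the epoch
    private final Map<String, Long> lastAccess = new HashMap<String, Long>();

    // Blocks until at least crawlDelaySeconds have passed since the previous
    // request to this host, then records the new access time.
    public synchronized void awaitTurn(String hostPort, Integer crawlDelaySeconds)
            throws InterruptedException {
        if (crawlDelaySeconds == null) crawlDelaySeconds = Integer.valueOf(0); // no delay given
        Long last = this.lastAccess.get(hostPort);
        if (last != null) {
            long earliest = last.longValue() + crawlDelaySeconds.longValue() * 1000L;
            long now = System.currentTimeMillis();
            if (now < earliest) Thread.sleep(earliest - now);
        }
        this.lastAccess.put(hostPort, Long.valueOf(System.currentTimeMillis()));
    }
}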
