- clean-up of robots.txt parser

- added 'yacybot' as key to recognize robots.txt entries for YaCy
- removed unused method to get robots.txt from database

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6565 6c8d7289-2bf4-0310-a012-ef5d649a1542
15 years ago · bc96d74813
parent 2113fcd7e5
commit bc96d74813
3 changed files with 50 additions and 34 deletions
--- a/source/de/anomic/crawler/Latency.java
+++ b/source/de/anomic/crawler/Latency.java
@@ -172,7 +172,7 @@ public class Latency {
        if (!local && host != null) waiting += host.flux(waiting);
        
        // find the delay as given by robots.txt on target site
-        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.crawlDelayMillis(url);
+        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
        waiting = Math.max(waiting, robotsDelay);
        
        // use the access latency as rule how fast we can access the server
@@ -216,7 +216,7 @@ public class Latency {
        if (!local && host != null) s.append(", flux = ").append(host.flux(waiting));
        
        // find the delay as given by robots.txt on target site
-        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.crawlDelayMillis(url);
+        long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
        s.append(", robots.delay = ").append(robotsDelay);
        
        // use the access latency as rule how fast we can access the server
--- a/source/de/anomic/crawler/RobotsTxt.java
+++ b/source/de/anomic/crawler/RobotsTxt.java
@@ -228,12 +228,6 @@ public class RobotsTxt {
        return robotsTxt4Host;
    }
    
-    public long crawlDelayMillis(final DigestURI theURL) {
-        final String urlHostPort = getHostPort(theURL);
-        final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
-        return robotsEntry.getCrawlDelayMillis();
-    }
-    
    private RobotsEntry addEntry(
    		final String hostName, 
    		final ArrayList<String> allowPathList, 
@@ -309,17 +303,9 @@ public class RobotsTxt {
    
    public Long getCrawlDelayMillis(final DigestURI theURL) {
        if (theURL == null) throw new IllegalArgumentException(); 
-        Long crawlDelay = null;
-        
-        // generating the hostname:poart string needed to do a DB lookup
        final String urlHostPort = getHostPort(theURL);
-        final RobotsEntry robotsTxt4Host = getEntry(urlHostPort, true);
-                       
-        try {
-            crawlDelay = robotsTxt4Host.getCrawlDelayMillis();
-        } catch (final NumberFormatException e) {/* ignore this */}
-        
-        return crawlDelay;
+        final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
+        return robotsEntry.getCrawlDelayMillis();
    }
    
    public boolean isDisallowed(final DigestURI nexturl) {
--- a/source/de/anomic/crawler/robotsParser.java
+++ b/source/de/anomic/crawler/robotsParser.java
@@ -60,7 +60,7 @@ public final class robotsParser {
    public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
    public static final String ROBOTS_COMMENT = "#";
    public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
-    public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();
+    public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
    
    private ArrayList<String> allowList;
    private ArrayList<String> denyList;
@@ -107,27 +107,48 @@ public final class robotsParser {
                inBlock = false;        
        
        try {
-            while ((line = reader.readLine()) != null) {
+            lineparser: while ((line = reader.readLine()) != null) {
                // replacing all tabs with spaces
                line = line.replaceAll("\t"," ").trim();
                lineUpper = line.toUpperCase();
                
+                // parse empty line
                if (line.length() == 0) {
-                    // OLD: we have reached the end of the rule block
-                    // rule4Yacy = false; inBlock = false;
-                    
-                    // NEW: just ignore it
-                } else if (line.startsWith(ROBOTS_COMMENT)) {
+                    // we have reached the end of the rule block
+                    if (rule4YaCyFound) {
+                        // stop here because other robot blocks are either not for YaCy
+                        // or global settings which shall not overwrite YaCys settings.
+                        break lineparser;
+                    }
+                    continue lineparser;
+                }
+                
+                // parse comment
+                if (line.startsWith(ROBOTS_COMMENT)) {
                    // we can ignore this. Just a comment line
-                } else if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
+                    continue lineparser;
+                }
+                
+                // parse sitemap
+                if (lineUpper.startsWith(ROBOTS_SITEMAP)) {
                    pos = line.indexOf(" ");
                    if (pos != -1) {
                        sitemap = line.substring(pos).trim();
                    }
-                } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
+                    continue lineparser;
+                }
+                
+                // parse user agent
+                if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
                    
                    if (inBlock) {
                        // we have detected the start of a new block
+                        if (rule4YaCyFound) {
+                            // stop here because other robot blocks are either not for YaCy
+                            // or global settings which shall not overwrite YaCys settings.
+                            break lineparser;
+                        }
+                        
                        inBlock = false;
                        isRule4AllAgents = false;
                        isRule4YaCyAgent = false;
@@ -144,9 +165,14 @@ public final class robotsParser {
                        final String userAgent = line.substring(pos).trim();
                        isRule4AllAgents |= userAgent.equals("*");
                        isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
+                        isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0;
                        if (isRule4YaCyAgent) rule4YaCyFound = true;
                    }
-                } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
+                    continue lineparser;
+                }
+                
+                // parse crawl delay
+                if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
                    inBlock = true;
                	if (isRule4YaCyAgent || isRule4AllAgents) {
                		pos = line.indexOf(" ");
@@ -159,8 +185,11 @@ public final class robotsParser {
                			}
                		}
                	}
-                } else if (lineUpper.startsWith(ROBOTS_DISALLOW) || 
-                           lineUpper.startsWith(ROBOTS_ALLOW)) {
+                	continue lineparser;
+                }
+                
+                // parse disallow
+                if (lineUpper.startsWith(ROBOTS_DISALLOW) || lineUpper.startsWith(ROBOTS_ALLOW)) {
                    inBlock = true;
                    final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
                    
@@ -169,18 +198,18 @@ public final class robotsParser {
                        pos = line.indexOf(ROBOTS_COMMENT);
                        if (pos != -1) line = line.substring(0,pos).trim();
                                           
-                        // cutting of tailing *
+                        // cut off tailing *
                        if (line.endsWith("*")) line = line.substring(0,line.length()-1);
                        
-                        // getting the path
+                        // parse the path
                        pos = line.indexOf(" ");
-                        if (pos != -1) {
+                        if (pos >= 0) {
                            // getting the path
                            String path = line.substring(pos).trim();
                            
                            // unencoding all special charsx
                            try {
-                                path = URLDecoder.decode(path,"UTF-8");
+                                path = URLDecoder.decode(path, "UTF-8");
                            } catch (final Exception e) {
                                /* 
                                 * url decoding failed. E.g. because of
@@ -201,6 +230,7 @@ public final class robotsParser {
                            }
                        }
                    }
+                    continue lineparser;
                }
            }
        } catch (final IOException e) {}