fix for sitemap detection: the sitemap URL was not visible if it
appeared after the allow/deny declarations for the crawler, because
the sitemap parser terminated once the allow/deny rules had been
found. The parser now reads the robots.txt to the end, so sitemap
rules at the end of the file are also discovered.
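A minimal sketch of the changed behavior, not YaCy's actual RobotsTxtParser: the class and method names below are made up for illustration. The point is that the line loop runs to the end of the file instead of breaking out once the rule block for this crawler's user-agent has been parsed, so a Sitemap: directive placed after that block is still collected.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

// Illustrative sketch only; the real logic lives in YaCy's RobotsTxtParser.
public class RobotsSitemapSketch {

    /** Collects all Sitemap: URLs, scanning the whole robots.txt instead of
     *  stopping once the rule block for the given agent has been parsed. */
    public static List<String> parse(final String robotsTxt, final String myAgent) throws IOException {
        final List<String> sitemaps = new ArrayList<>();
        boolean isRule4ThisAgent = false;

        final BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
        String line;
        while ((line = reader.readLine()) != null) { // read to EOF: no early break any more
            line = line.trim();
            if (line.isEmpty() || line.startsWith("#")) continue;

            final String lower = line.toLowerCase();
            if (lower.startsWith("user-agent:")) {
                // start of a new block; the old parser broke out of the loop here
                // once the block for this agent had already been handled
                final String agent = line.substring("user-agent:".length()).trim();
                isRule4ThisAgent = "*".equals(agent) || agent.equalsIgnoreCase(myAgent);
            } else if (lower.startsWith("sitemap:")) {
                // sitemap directives are global, so collect them wherever they appear,
                // including after the block that carries our allow/deny rules
                sitemaps.add(line.substring("sitemap:".length()).trim());
            } else if (isRule4ThisAgent
                    && (lower.startsWith("disallow:") || lower.startsWith("allow:"))) {
                // allow/deny handling omitted in this sketch
            }
        }
        return sitemaps;
    }
}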
Michael Peter Christen 12 years ago
parent 442ed50be0
commit 038f956821

@@ -148,6 +148,7 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
sb.robots.ensureExist(theURL, sb.peers.myBotIDs(), true);
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@@ -118,11 +118,6 @@ public final class RobotsTxtParser {
// parse empty line
if (line.isEmpty()) {
// we have reached the end of the rule block
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCys settings.
break lineparser;
}
continue lineparser;
}
@@ -147,12 +142,6 @@ public final class RobotsTxtParser {
if (inBlock) {
// we have detected the start of a new block
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCys settings.
break lineparser;
}
inBlock = false;
isRule4AllAgents = false;
isRule4ThisAgents = false;

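For illustration, a usage example for the sketch above with a hypothetical robots.txt in which the Sitemap directive follows the rule blocks; example.com is a placeholder. Before this commit, a parser that broke out of the loop after the block for its own user-agent never reached the trailing Sitemap line; scanning to the end of the file returns it.

import java.io.IOException;
import java.util.List;

// Demo for the RobotsSitemapSketch class above (both are illustrative, not YaCy code).
public class RobotsSitemapSketchDemo {
    public static void main(final String[] args) throws IOException {
        final String robots =
                "User-agent: yacybot\n" +
                "Disallow: /private/\n" +
                "\n" +
                "User-agent: *\n" +
                "Disallow: /\n" +
                "\n" +
                "Sitemap: http://example.com/sitemap.xml\n";
        // The old early break stopped before the Sitemap line; reading to EOF
        // yields [http://example.com/sitemap.xml].
        final List<String> sitemaps = RobotsSitemapSketch.parse(robots, "yacybot");
        System.out.println(sitemaps);
    }
}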