fix for sitemap detection: the sitemap URL was not visible if it
appeared after the allow/deny declarations for the crawler, because
the sitemap parser terminated once the allow/deny rules had been
found. The parser now reads the robots.txt to the end, so sitemap
rules at the end of the file are also discovered.
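A minimal sketch of the changed behavior, not YaCy's actual RobotsTxtParser: the class and method names below are made up for illustration. The point is that the line loop runs to the end of the file instead of breaking out once the rule block for this crawler's user-agent has been parsed, so a Sitemap: directive placed after that block is still collected.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

// Illustrative sketch only; the real logic lives in YaCy's RobotsTxtParser.
public class RobotsSitemapSketch {

    /** Collects all Sitemap: URLs, scanning the whole robots.txt instead of
     *  stopping once the rule block for the given agent has been parsed. */
    public static List<String> parse(final String robotsTxt, final String myAgent) throws IOException {
        final List<String> sitemaps = new ArrayList<>();
        boolean isRule4ThisAgent = false;

        final BufferedReader reader = new BufferedReader(new StringReader(robotsTxt));
        String line;
        while ((line = reader.readLine()) != null) { // read to EOF: no early break any more
            line = line.trim();
            if (line.isEmpty() || line.startsWith("#")) continue;

            final String lower = line.toLowerCase();
            if (lower.startsWith("user-agent:")) {
                // start of a new block; the old parser broke out of the loop here
                // once the block for this agent had already been handled
                final String agent = line.substring("user-agent:".length()).trim();
                isRule4ThisAgent = "*".equals(agent) || agent.equalsIgnoreCase(myAgent);
            } else if (lower.startsWith("sitemap:")) {
                // sitemap directives are global, so collect them wherever they appear,
                // including after the block that carries our allow/deny rules
                sitemaps.add(line.substring("sitemap:".length()).trim());
            } else if (isRule4ThisAgent
                    && (lower.startsWith("disallow:") || lower.startsWith("allow:"))) {
                // allow/deny handling omitted in this sketch
            }
        }
        return sitemaps;
    }
}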
Michael Peter Christen 12 years ago
parent 442ed50be0
commit 038f956821

@@ -148,6 +148,7 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
sb.robots.ensureExist(theURL, sb.peers.myBotIDs(), true);
RobotsTxtEntry robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo());

@@ -118,11 +118,6 @@ public final class RobotsTxtParser {
// parse empty line
if (line.isEmpty()) {
// we have reached the end of the rule block
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCys settings.
break lineparser;
}
continue lineparser;
}
@@ -147,12 +142,6 @@ public final class RobotsTxtParser {
if (inBlock) {
// we have detected the start of a new block
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCys settings.
break lineparser;
}
inBlock = false;
isRule4AllAgents = false;
isRule4ThisAgents = false;

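For illustration, a usage example for the sketch above with a hypothetical robots.txt in which the Sitemap directive follows the rule blocks; example.com is a placeholder. Before this commit, a parser that broke out of the loop after the block for its own user-agent never reached the trailing Sitemap line; scanning to the end of the file returns it.

import java.io.IOException;
import java.util.List;

// Demo for the RobotsSitemapSketch class above (both are illustrative, not YaCy code).
public class RobotsSitemapSketchDemo {
    public static void main(final String[] args) throws IOException {
        final String robots =
                "User-agent: yacybot\n" +
                "Disallow: /private/\n" +
                "\n" +
                "User-agent: *\n" +
                "Disallow: /\n" +
                "\n" +
                "Sitemap: http://example.com/sitemap.xml\n";
        // The old early break stopped before the Sitemap line; reading to EOF
        // yields [http://example.com/sitemap.xml].
        final List<String> sitemaps = RobotsSitemapSketch.parse(robots, "yacybot");
        System.out.println(sitemaps);
    }
}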