*) Changing robots parser exclusion policy

- crawling is now allowed if the server returned a 403 status code
     when trying to download the robots.txt
   See: http://www.yacy-forum.de/viewtopic.php?t=1612

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1421 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 19 years ago
parent c69f7a39a3
commit 754a35877f

@@ -293,7 +293,7 @@ public final class robotsParser{
     if (modDate != null) reqHeaders.put(httpHeader.IF_MODIFIED_SINCE,httpc.dateString(entry.getModDate()));
 }
-httpc.response res = con.GET(robotsURL.getPath(), reqHeaders);
+httpc.response res = con.GET(robotsURL.getFile(), reqHeaders);
 if (res.status.startsWith("2")) {
     if (!res.responseHeader.mime().startsWith("text/plain")) {
         robotsTxt = null;
@@ -336,7 +336,7 @@ public final class robotsParser{
         "\nRedirecting request to: " + redirectionUrl);
     return downloadRobotsTxt(redirectionUrl,redirectionCount,entry);
-} else if (res.status.startsWith("401") || res.status.startsWith("403")) {
+} else if (res.status.startsWith("401")/* || res.status.startsWith("403") */) {
     accessCompletelyRestricted = true;
     serverLog.logFinest("ROBOTS","Access to Robots.txt not allowed on URL '" + robotsURL + "'.");
 } else {
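
For clarity, below is a minimal, self-contained sketch of the resulting policy. It is not the YaCy source itself; the names RobotsPolicySketch, RobotsResult and interpretStatus are invented for illustration. It only shows the effect of commenting out the 403 check: a 401 on robots.txt still marks the host as completely restricted, while a 403 is now handled like any other failed download, so the crawler proceeds as if no robots.txt were available.

// Minimal sketch (hypothetical names, not the original YaCy code) of how the
// robots.txt status handling behaves after this change.
public final class RobotsPolicySketch {

    // Hypothetical holder for the outcome of a robots.txt download attempt.
    static final class RobotsResult {
        final byte[] robotsTxt;                   // body to parse, or null if unavailable
        final boolean accessCompletelyRestricted; // true => host may not be crawled at all
        RobotsResult(byte[] robotsTxt, boolean restricted) {
            this.robotsTxt = robotsTxt;
            this.accessCompletelyRestricted = restricted;
        }
    }

    // Interprets the HTTP status line of the robots.txt request:
    //   2xx -> use the response body as robots.txt
    //   401 -> access completely restricted, crawling stays forbidden
    //   403 and other errors -> robots.txt treated as unavailable,
    //         so crawling is allowed (the behaviour this commit introduces)
    static RobotsResult interpretStatus(String status, byte[] body) {
        if (status.startsWith("2")) {
            return new RobotsResult(body, false);
        } else if (status.startsWith("401")) {
            return new RobotsResult(null, true);
        } else {
            // 403 now falls through to the generic "no usable robots.txt" case
            return new RobotsResult(null, false);
        }
    }

    public static void main(String[] args) {
        System.out.println("403 restricted? "
                + interpretStatus("403 Forbidden", null).accessCompletelyRestricted);    // false
        System.out.println("401 restricted? "
                + interpretStatus("401 Unauthorized", null).accessCompletelyRestricted); // true
    }
}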
