From d3da7c9a08d87a53fced7719040b099dec77d661 Mon Sep 17 00:00:00 2001 From: theli Date: Thu, 9 Mar 2006 14:03:54 +0000 Subject: [PATCH] *) Adding support for robots Allow directive git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1872 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/data/robotsParser.java | 20 +++++++++----- .../anomic/plasma/plasmaCrawlRobotsTxt.java | 26 +++++++++++++++---- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/source/de/anomic/data/robotsParser.java b/source/de/anomic/data/robotsParser.java index d591a7de4..b09992fc3 100644 --- a/source/de/anomic/data/robotsParser.java +++ b/source/de/anomic/data/robotsParser.java @@ -75,6 +75,11 @@ import de.anomic.server.logging.serverLog; */ public final class robotsParser{ + public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase(); + public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase(); + public static final String ROBOTS_ALLOW = "Allow:".toUpperCase(); + public static final String ROBOTS_COMMENT = "#"; + /*public robotsParser(URL robotsUrl){ }*/ /* @@ -119,9 +124,9 @@ public final class robotsParser{ // rule4Yacy = false; inBlock = false; // NEW: just ignore it - } else if (line.startsWith("#")) { + } else if (line.startsWith(ROBOTS_COMMENT)) { // we can ignore this. Just a comment line - } else if (lineUpper.startsWith("User-agent:".toUpperCase())) { + } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) { if (inBlock) { // we have detected the start of a new block @@ -131,7 +136,7 @@ public final class robotsParser{ } // cutting off comments at the line end - pos = line.indexOf("#"); + pos = line.indexOf(ROBOTS_COMMENT); if (pos != -1) line = line.substring(0,pos).trim(); // replacing all tabs with spaces @@ -145,12 +150,14 @@ public final class robotsParser{ isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; if (isRuleBlock4YaCyAgent) rule4YaCyFound = true; } - } else if (lineUpper.startsWith("Disallow:".toUpperCase())) { + } else if (lineUpper.startsWith(ROBOTS_DISALLOW) || + lineUpper.startsWith(ROBOTS_ALLOW)) { inBlock = true; + boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW); if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) { // cutting off comments at the line end - pos = line.indexOf("#"); + pos = line.indexOf(ROBOTS_COMMENT); if (pos != -1) line = line.substring(0,pos).trim(); // cutting of tailing * @@ -176,9 +183,10 @@ public final class robotsParser{ } // escaping all occurences of ; because this char is used as special char in the Robots DB - path = path.replaceAll(";","%3B"); + path = path.replaceAll(plasmaCrawlRobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B"); // adding it to the pathlist + if (!isDisallowRule) path = "!" + path; if (isRuleBlock4AllAgents) deny4AllAgents.add(path); if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path); } diff --git a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java index 6d1043095..373a17d7d 100644 --- a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java +++ b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java @@ -59,6 +59,9 @@ import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroException; public class plasmaCrawlRobotsTxt { + + public static final String ROBOTS_DB_PATH_SEPARATOR = ";"; + kelondroMap robotsTable; private final File robotsTableFile; private int bufferkb; @@ -168,7 +171,7 @@ public class plasmaCrawlRobotsTxt { this.disallowPathList = new LinkedList(); String csPl = (String) this.mem.get(DISALLOW_PATH_LIST); if (csPl.length() > 0){ - String[] pathArray = csPl.split(";"); + String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { this.disallowPathList.addAll(Arrays.asList(pathArray)); } @@ -200,7 +203,7 @@ public class plasmaCrawlRobotsTxt { StringBuffer pathListStr = new StringBuffer(); for (int i=0; i 1 && path.startsWith(nextPath.substring(1))) { + return false; + } + + // disallow rule + if (path.startsWith(nextPath)) { + return true; + } } return false; }