*) Adding support for robots Allow directive

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1872 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 7fe0658f8b
commit d3da7c9a08

@ -75,6 +75,11 @@ import de.anomic.server.logging.serverLog;
*/ */
public final class robotsParser{ public final class robotsParser{
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
/*public robotsParser(URL robotsUrl){ /*public robotsParser(URL robotsUrl){
}*/ }*/
/* /*
@ -119,9 +124,9 @@ public final class robotsParser{
// rule4Yacy = false; inBlock = false; // rule4Yacy = false; inBlock = false;
// NEW: just ignore it // NEW: just ignore it
} else if (line.startsWith("#")) { } else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line // we can ignore this. Just a comment line
} else if (lineUpper.startsWith("User-agent:".toUpperCase())) { } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) { if (inBlock) {
// we have detected the start of a new block // we have detected the start of a new block
@ -131,7 +136,7 @@ public final class robotsParser{
} }
// cutting off comments at the line end // cutting off comments at the line end
pos = line.indexOf("#"); pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim(); if (pos != -1) line = line.substring(0,pos).trim();
// replacing all tabs with spaces // replacing all tabs with spaces
@ -145,12 +150,14 @@ public final class robotsParser{
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true; if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
} }
} else if (lineUpper.startsWith("Disallow:".toUpperCase())) { } else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true; inBlock = true;
boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) { if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
// cutting off comments at the line end // cutting off comments at the line end
pos = line.indexOf("#"); pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim(); if (pos != -1) line = line.substring(0,pos).trim();
// cutting off trailing * // cutting off trailing *
@ -176,9 +183,10 @@ public final class robotsParser{
} }
// escaping all occurrences of ; because this char is used as special char in the Robots DB // escaping all occurrences of ; because this char is used as special char in the Robots DB
path = path.replaceAll(";","%3B"); path = path.replaceAll(plasmaCrawlRobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
// adding it to the pathlist // adding it to the pathlist
if (!isDisallowRule) path = "!" + path;
if (isRuleBlock4AllAgents) deny4AllAgents.add(path); if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path); if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
} }

@ -59,6 +59,9 @@ import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
public class plasmaCrawlRobotsTxt { public class plasmaCrawlRobotsTxt {
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
kelondroMap robotsTable; kelondroMap robotsTable;
private final File robotsTableFile; private final File robotsTableFile;
private int bufferkb; private int bufferkb;
@ -168,7 +171,7 @@ public class plasmaCrawlRobotsTxt {
this.disallowPathList = new LinkedList(); this.disallowPathList = new LinkedList();
String csPl = (String) this.mem.get(DISALLOW_PATH_LIST); String csPl = (String) this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){ if (csPl.length() > 0){
String[] pathArray = csPl.split(";"); String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) { if ((pathArray != null)&&(pathArray.length > 0)) {
this.disallowPathList.addAll(Arrays.asList(pathArray)); this.disallowPathList.addAll(Arrays.asList(pathArray));
} }
@ -200,7 +203,7 @@ public class plasmaCrawlRobotsTxt {
StringBuffer pathListStr = new StringBuffer(); StringBuffer pathListStr = new StringBuffer();
for (int i=0; i<disallowPathList.size();i++) { for (int i=0; i<disallowPathList.size();i++) {
pathListStr.append(disallowPathList.get(i)) pathListStr.append(disallowPathList.get(i))
.append(";"); .append(ROBOTS_DB_PATH_SEPARATOR);
} }
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1)); this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
} }
@ -247,12 +250,25 @@ public class plasmaCrawlRobotsTxt {
/**
 * Checks a URL path against the stored robots.txt path list.
 *
 * NOTE: this span is a side-by-side diff rendering; the left half of a line is
 * the pre-commit text and the right half (or a line shown only once) is the
 * post-commit text. The description below covers the post-commit version.
 *
 * Returns false immediately when this.mem is null or the path list is empty.
 * A null or empty path is treated as "/"; otherwise every occurrence of
 * ROBOTS_DB_PATH_SEPARATOR in the path is replaced by "%3B" before matching.
 * Each list entry is compared as a prefix of the path:
 *   - an entry starting with "!" (and longer than one character) is an allow
 *     rule; if the text after the "!" is a prefix of the path, the result is false
 *   - any entry that is itself a prefix of the path is a disallow rule; the
 *     result is true
 * If no entry matches, the result is false.
 *
 * @param path the URL path to test; may be null or empty
 * @return true if a disallow entry matched first, false otherwise
 */
public boolean isDisallowed(String path) { public boolean isDisallowed(String path) {
if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false; if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
// if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/"; if ((path == null) || (path.length() == 0)) path = "/";
// escaping all occurrences of ; because this char is used as special char in the Robots DB
else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
Iterator pathIter = this.disallowPathList.iterator(); Iterator pathIter = this.disallowPathList.iterator();
// entries are tested in list order; the first entry that matches decides the result
while (pathIter.hasNext()) { while (pathIter.hasNext()) {
String nextPath = (String) pathIter.next(); String nextPath = (String) pathIter.next();
if (path.startsWith(nextPath)) return true; // allow rule
if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) {
return false;
}
// disallow rule
if (path.startsWith(nextPath)) {
return true;
}
} }
return false; return false;
} }

Loading…
Cancel
Save