*) Adding support for the robots.txt Allow directive

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1872 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 7fe0658f8b
commit d3da7c9a08

@ -75,6 +75,11 @@ import de.anomic.server.logging.serverLog;
*/
public final class robotsParser{
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
public static final String ROBOTS_COMMENT = "#";
/*public robotsParser(URL robotsUrl){
}*/
/*
@ -119,9 +124,9 @@ public final class robotsParser{
// rule4Yacy = false; inBlock = false;
// NEW: just ignore it
} else if (line.startsWith("#")) {
} else if (line.startsWith(ROBOTS_COMMENT)) {
// we can ignore this. Just a comment line
} else if (lineUpper.startsWith("User-agent:".toUpperCase())) {
} else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
if (inBlock) {
// we have detected the start of a new block
@ -131,7 +136,7 @@ public final class robotsParser{
}
// cutting off comments at the line end
pos = line.indexOf("#");
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// replacing all tabs with spaces
@ -145,12 +150,14 @@ public final class robotsParser{
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith("Disallow:".toUpperCase())) {
} else if (lineUpper.startsWith(ROBOTS_DISALLOW) ||
lineUpper.startsWith(ROBOTS_ALLOW)) {
inBlock = true;
boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf("#");
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
// cutting off trailing *
@ -176,9 +183,10 @@ public final class robotsParser{
}
// escaping all occurrences of ; because this char is used as special char in the Robots DB
path = path.replaceAll(";","%3B");
path = path.replaceAll(plasmaCrawlRobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
// adding it to the pathlist
if (!isDisallowRule) path = "!" + path;
if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
}

@ -59,6 +59,9 @@ import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroException;
public class plasmaCrawlRobotsTxt {
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
kelondroMap robotsTable;
private final File robotsTableFile;
private int bufferkb;
@ -168,7 +171,7 @@ public class plasmaCrawlRobotsTxt {
this.disallowPathList = new LinkedList();
String csPl = (String) this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){
String[] pathArray = csPl.split(";");
String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.disallowPathList.addAll(Arrays.asList(pathArray));
}
@ -200,7 +203,7 @@ public class plasmaCrawlRobotsTxt {
StringBuffer pathListStr = new StringBuffer();
for (int i=0; i<disallowPathList.size();i++) {
pathListStr.append(disallowPathList.get(i))
.append(";");
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}
@ -247,12 +250,25 @@ public class plasmaCrawlRobotsTxt {
/**
 * Checks the given path against the stored rule list of this entry.
 *
 * The list is scanned in order and the first matching entry decides:
 * an entry prefixed with "!" is an allow rule (the prefix is stripped
 * before comparing), every other entry is a disallow rule. Rules match
 * by plain prefix comparison.
 *
 * @param path the URL path to test; <code>null</code> or an empty string
 *             is treated as "/"
 * @return <code>true</code> if the first matching rule is a disallow rule,
 *         <code>false</code> if it is an allow rule, if no rule matches,
 *         or if there are no rules at all
 */
public boolean isDisallowed(String path) {
    if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;

    // if the path is null or empty we set it to /
    if ((path == null) || (path.length() == 0)) path = "/";
    // escaping all occurrences of ; because this char is used as special char in the Robots DB,
    // so the path is compared in the same escaped form in which the rules are stored
    else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");

    Iterator pathIter = this.disallowPathList.iterator();
    while (pathIter.hasNext()) {
        String nextPath = (String) pathIter.next();

        if (nextPath.startsWith("!")) {
            // allow rule: the marker is not part of the path, so it must never be
            // compared literally; a bare "!" (empty allow path) matches nothing
            if (nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) {
                return false;
            }
        } else if (path.startsWith(nextPath)) {
            // disallow rule
            return true;
        }
    }
    return false;
}

Loading…
Cancel
Save