- refactoring of robots parser (removed opaque Object[] result vector)

- added Allow-component to robots result object

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5016 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 17 years ago
parent 7913bdb75b
commit 50ef5c406f
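
The refactoring replaces the positional Object[] result with named accessors on a robotsParser instance. As a quick orientation before the hunks below, a minimal usage sketch; the robots.txt content and the example.net sitemap URL are made up for illustration:

byte[] robotsTxt = ("User-agent: *\n" +
        "Allow: /public/\n" +
        "Disallow: /private/\n" +
        "Crawl-delay: 5\n" +
        "Sitemap: http://example.net/sitemap.xml\n").getBytes();
robotsParser parserResult = new robotsParser(robotsTxt);
ArrayList<String> allow = parserResult.allowList();  // new: the Allow component
ArrayList<String> deny  = parserResult.denyList();   // was parserResult[0]
String sitemap = parserResult.sitemap();             // was parserResult[1]
int crawlDelay = parserResult.crawlDelay();          // was parserResult[2], now a plain int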

@@ -180,6 +180,7 @@ public class RobotsTxt {
robotsTxt4Host = new Entry(
urlHostPort,
new ArrayList<String>(),
new ArrayList<String>(),
new Date(),
new Date(),
null,
@@ -192,8 +193,8 @@ public class RobotsTxt {
// store the data into the robots DB
addEntry(robotsTxt4Host);
} else {
Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
ArrayList<String> denyPath = (ArrayList<String>) parserResult[0];
robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
ArrayList<String> denyPath = parserResult.denyList();
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
denyPath.add("/");
@@ -202,12 +203,13 @@ public class RobotsTxt {
// store the data into the robots DB
robotsTxt4Host = addEntry(
urlHostPort,
parserResult.allowList(),
denyPath,
new Date(),
(Date) result[DOWNLOAD_MODDATE],
(String) result[DOWNLOAD_ETAG],
(String) parserResult[1],
(Integer) parserResult[2]);
parserResult.sitemap(),
parserResult.crawlDelay());
}
}
}
@@ -223,15 +225,16 @@ public class RobotsTxt {
private Entry addEntry(
String hostName,
ArrayList<String> disallowPathList,
Date loadedDate,
ArrayList<String> allowPathList,
ArrayList<String> denyPathList,
Date loadedDate,
Date modDate,
String eTag,
String sitemap,
Integer crawlDelay
int crawlDelay
) {
Entry entry = new Entry(
hostName, disallowPathList, loadedDate, modDate,
hostName, allowPathList, denyPathList, loadedDate, modDate,
eTag, sitemap, crawlDelay);
addEntry(entry);
return entry;
@@ -248,16 +251,17 @@ public class RobotsTxt {
}
public class Entry {
public static final String ALLOW_PATH_LIST = "allow";
public static final String DISALLOW_PATH_LIST = "disallow";
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
public static final String LOADED_DATE = "date";
public static final String MOD_DATE = "modDate";
public static final String ETAG = "etag";
public static final String SITEMAP = "sitemap";
public static final String CRAWL_DELAY = "crawlDelay";
// this is a simple record structure that hold all properties of a single crawl start
// this is a simple record structure that holds all properties of a single crawl start
HashMap<String, String> mem;
private LinkedList<String> disallowPathList;
private LinkedList<String> allowPathList, denyPathList;
String hostName;
public Entry(String hostName, HashMap<String, String> mem) {
@@ -265,42 +269,67 @@ public class RobotsTxt {
this.mem = mem;
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.disallowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
String csPl = this.mem.get(DISALLOW_PATH_LIST);
if (csPl.length() > 0){
String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.disallowPathList.addAll(Arrays.asList(pathArray));
this.denyPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.denyPathList = new LinkedList<String>();
}
if (this.mem.containsKey(ALLOW_PATH_LIST)) {
this.allowPathList = new LinkedList<String>();
String csPl = this.mem.get(ALLOW_PATH_LIST);
if (csPl.length() > 0){
String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
this.allowPathList.addAll(Arrays.asList(pathArray));
}
}
} else {
this.disallowPathList = new LinkedList<String>();
this.allowPathList = new LinkedList<String>();
}
}
public Entry(
String hostName,
ArrayList<String> allowPathList,
ArrayList<String> disallowPathList,
Date loadedDate,
Date modDate,
String eTag,
String sitemap,
Integer crawlDelay
int crawlDelay
) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
this.hostName = hostName.trim().toLowerCase();
this.disallowPathList = new LinkedList<String>();
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.mem = new HashMap<String, String>(5);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if (crawlDelay != null) this.mem.put(CRAWL_DELAY,crawlDelay.toString());
if (crawlDelay != 0) this.mem.put(CRAWL_DELAY, Integer.toString(crawlDelay));
if ((allowPathList != null)&&(allowPathList.size()>0)) {
this.allowPathList.addAll(allowPathList);
StringBuffer pathListStr = new StringBuffer();
for (int i=0; i<allowPathList.size();i++) {
pathListStr.append(allowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
}
if ((disallowPathList != null)&&(disallowPathList.size()>0)) {
this.disallowPathList.addAll(disallowPathList);
this.denyPathList.addAll(disallowPathList);
StringBuffer pathListStr = new StringBuffer();
for (int i=0; i<disallowPathList.size();i++) {
@@ -364,21 +393,16 @@ public class RobotsTxt {
}
public boolean isDisallowed(String path) {
if ((this.mem == null) || (this.disallowPathList.size() == 0)) return false;
if ((this.mem == null) || (this.denyPathList.size() == 0)) return false;
// if the path is null or empty we set it to /
if ((path == null) || (path.length() == 0)) path = "/";
// escaping all occurrences of ; because this char is used as special char in the Robots DB
else path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
Iterator<String> pathIter = this.disallowPathList.iterator();
Iterator<String> pathIter = this.denyPathList.iterator();
while (pathIter.hasNext()) {
String nextPath = pathIter.next();
// allow rule
if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) {
return false;
}
// disallow rule
if (path.startsWith(nextPath)) {

@@ -22,13 +22,13 @@
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// extended to return structured objects instead of an Object[] and
// extended to return an Allow-List by Michael Christen, 21.07.2008
package de.anomic.crawler;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
@@ -52,7 +52,9 @@ import java.util.ArrayList;
* - Robot Exclusion Standard Revisited
* See: http://www.kollar.com/robots.html
*/
public final class robotsParser{
public final class robotsParser {
public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
@@ -60,41 +62,47 @@ public final class robotsParser{
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase();
/*public robotsParser(URL robotsUrl){
}*/
/*
* this parses the robots.txt.
* at the Moment it only creates a list of Deny Paths
*/
private ArrayList<String> allowList;
private ArrayList<String> denyList;
private String sitemap;
private int crawlDelay;
public static Object[] parse(File robotsFile) {
BufferedReader reader = null;
try {
reader = new BufferedReader(new FileReader(robotsFile));
if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */}
return parse(reader);
} catch (FileNotFoundException e1) {
public robotsParser(byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelay = 0;
} else {
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
parse(reader);
}
return new Object[]{new ArrayList<String>(), "", new Integer(0)};
}
@SuppressWarnings("unchecked")
public static Object[] parse(byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null,null};
ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
return parse(reader);
public robotsParser(BufferedReader reader) {
if (reader == null) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelay = 0;
} else {
parse(reader);
}
}
public static Object[] parse(BufferedReader reader) {
private void parse(BufferedReader reader) {
ArrayList<String> deny4AllAgents = new ArrayList<String>();
ArrayList<String> deny4YaCyAgent = new ArrayList<String>();
ArrayList<String> allow4AllAgents = new ArrayList<String>();
ArrayList<String> allow4YaCyAgent = new ArrayList<String>();
int pos;
String line = null, lineUpper = null, sitemap = null;
Integer crawlDelay = null;
boolean isRuleBlock4AllAgents = false,
isRuleBlock4YaCyAgent = false,
String line = null, lineUpper = null;
sitemap = null;
crawlDelay = 0;
boolean isRule4AllAgents = false,
isRule4YaCyAgent = false,
rule4YaCyFound = false,
inBlock = false;
@@ -120,9 +128,9 @@ public final class robotsParser{
if (inBlock) {
// we have detected the start of a new block
inBlock = false;
isRuleBlock4AllAgents = false;
isRuleBlock4YaCyAgent = false;
crawlDelay = null; // each block has a separate delay
isRule4AllAgents = false;
isRule4YaCyAgent = false;
crawlDelay = 0; // each block has a separate delay
}
// cutting off comments at the line end
@@ -136,15 +144,15 @@ public final class robotsParser{
pos = line.indexOf(" ");
if (pos != -1) {
String userAgent = line.substring(pos).trim();
isRuleBlock4AllAgents |= userAgent.equals("*");
isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
isRule4AllAgents |= userAgent.equals("*");
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
if (isRule4YaCyAgent) rule4YaCyFound = true;
}
} else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
pos = line.indexOf(" ");
if (pos != -1) {
try {
crawlDelay = Integer.valueOf(line.substring(pos).trim());
crawlDelay = Integer.parseInt(line.substring(pos).trim());
} catch (NumberFormatException e) {
// invalid crawling delay
}
@@ -154,7 +162,7 @@ public final class robotsParser{
inBlock = true;
boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
if (isRule4YaCyAgent || isRule4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
@@ -185,17 +193,36 @@ public final class robotsParser{
path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");
// adding it to the pathlist
if (!isDisallowRule) path = "!" + path;
if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
if (isDisallowRule) {
if (isRule4AllAgents) deny4AllAgents.add(path);
if (isRule4YaCyAgent) deny4YaCyAgent.add(path);
} else {
if (isRule4AllAgents) allow4AllAgents.add(path);
if (isRule4YaCyAgent) allow4YaCyAgent.add(path);
}
}
}
}
}
} catch (IOException e) {}
ArrayList<String> denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
return new Object[]{denyList, sitemap, crawlDelay};
allowList = (rule4YaCyFound) ? allow4YaCyAgent : allow4AllAgents;
denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
}
public int crawlDelay() {
return this.crawlDelay;
}
public String sitemap() {
return this.sitemap;
}
public ArrayList<String> allowList() {
return this.allowList;
}
public ArrayList<String> denyList() {
return this.denyList;
}
}
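
One behavior of parse() worth noting: once any User-agent block matching "yacy" has been seen (rule4YaCyFound), only the YaCy-specific allow/deny lists are returned and the wildcard block is ignored. A hypothetical illustration via the BufferedReader constructor; java.io.StringReader is assumed here purely as a test convenience:

String txt = "User-agent: *\n" +
        "Disallow: /\n" +
        "\n" +
        "User-agent: yacy\n" +
        "Disallow: /private/\n";
robotsParser p = new robotsParser(new BufferedReader(new StringReader(txt)));
// rule4YaCyFound is true, so the wildcard block is dropped:
// p.denyList() contains only "/private/", not "/"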

@@ -86,7 +86,6 @@
package de.anomic.plasma;
import java.awt.GraphicsEnvironment;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

@@ -38,15 +38,15 @@ public class consoleInterface extends Thread
/**
* FIXME just for debugging
*/
private final String name;
//private final String name;
private serverLog log;
public consoleInterface (final InputStream stream, String name, serverLog log)
public consoleInterface(final InputStream stream, String name, serverLog log)
{
this.log = log;
this.stream = stream;
this.name = name;
//this.name = name;
// block reading {@see getOutput()}
try {
dataIsRead.acquire();
