From 50ef5c406fdee5ed4fd75fdf6aa8faf3d6b2b831 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 24 Jul 2008 11:54:37 +0000 Subject: [PATCH] - refactoring of robots parser (removed opaque Objects[] result vector) - added Allow-component to robots result object git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5016 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/crawler/RobotsTxt.java | 82 ++++++++----- source/de/anomic/crawler/robotsParser.java | 111 +++++++++++------- .../de/anomic/plasma/plasmaSwitchboard.java | 1 - source/de/anomic/tools/consoleInterface.java | 6 +- 4 files changed, 125 insertions(+), 75 deletions(-) diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 3f6325acf..d616bb166 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -180,6 +180,7 @@ public class RobotsTxt { robotsTxt4Host = new Entry( urlHostPort, new ArrayList(), + new ArrayList(), new Date(), new Date(), null, @@ -192,8 +193,8 @@ public class RobotsTxt { // store the data into the robots DB addEntry(robotsTxt4Host); } else { - Object[] parserResult = robotsParser.parse((byte[]) result[DOWNLOAD_ROBOTS_TXT]); - ArrayList denyPath = (ArrayList) parserResult[0]; + robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT]); + ArrayList denyPath = parserResult.denyList(); if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) { denyPath = new ArrayList(); denyPath.add("/"); @@ -202,12 +203,13 @@ public class RobotsTxt { // store the data into the robots DB robotsTxt4Host = addEntry( urlHostPort, + parserResult.allowList(), denyPath, new Date(), (Date) result[DOWNLOAD_MODDATE], (String) result[DOWNLOAD_ETAG], - (String) parserResult[1], - (Integer) parserResult[2]); + parserResult.sitemap(), + parserResult.crawlDelay()); } } } @@ -223,15 +225,16 @@ public class RobotsTxt { private Entry addEntry( String hostName, - ArrayList disallowPathList, - Date loadedDate, + ArrayList allowPathList, + ArrayList denyPathList, + Date loadedDate, Date modDate, String eTag, String sitemap, - Integer crawlDelay + int crawlDelay ) { Entry entry = new Entry( - hostName, disallowPathList, loadedDate, modDate, + hostName, allowPathList, denyPathList, loadedDate, modDate, eTag, sitemap, crawlDelay); addEntry(entry); return entry; @@ -248,16 +251,17 @@ public class RobotsTxt { } public class Entry { + public static final String ALLOW_PATH_LIST = "allow"; public static final String DISALLOW_PATH_LIST = "disallow"; - public static final String LOADED_DATE = "date"; - public static final String MOD_DATE = "modDate"; - public static final String ETAG = "etag"; - public static final String SITEMAP = "sitemap"; - public static final String CRAWL_DELAY = "crawlDelay"; + public static final String LOADED_DATE = "date"; + public static final String MOD_DATE = "modDate"; + public static final String ETAG = "etag"; + public static final String SITEMAP = "sitemap"; + public static final String CRAWL_DELAY = "crawlDelay"; - // this is a simple record structure that hold all properties of a single crawl start + // this is a simple record structure that holds all properties of a single crawl start HashMap mem; - private LinkedList disallowPathList; + private LinkedList allowPathList, denyPathList; String hostName; public Entry(String hostName, HashMap mem) { @@ -265,42 +269,67 @@ public class RobotsTxt { this.mem = mem; if (this.mem.containsKey(DISALLOW_PATH_LIST)) { - this.disallowPathList = new LinkedList(); + this.denyPathList = new LinkedList(); String csPl = this.mem.get(DISALLOW_PATH_LIST); if (csPl.length() > 0){ String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { - this.disallowPathList.addAll(Arrays.asList(pathArray)); + this.denyPathList.addAll(Arrays.asList(pathArray)); + } + } + } else { + this.denyPathList = new LinkedList(); + } + if (this.mem.containsKey(ALLOW_PATH_LIST)) { + this.allowPathList = new LinkedList(); + String csPl = this.mem.get(ALLOW_PATH_LIST); + if (csPl.length() > 0){ + String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); + if ((pathArray != null)&&(pathArray.length > 0)) { + this.allowPathList.addAll(Arrays.asList(pathArray)); } } } else { - this.disallowPathList = new LinkedList(); + this.allowPathList = new LinkedList(); } } public Entry( String hostName, + ArrayList allowPathList, ArrayList disallowPathList, Date loadedDate, Date modDate, String eTag, String sitemap, - Integer crawlDelay + int crawlDelay ) { if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing"); this.hostName = hostName.trim().toLowerCase(); - this.disallowPathList = new LinkedList(); + this.allowPathList = new LinkedList(); + this.denyPathList = new LinkedList(); this.mem = new HashMap(5); if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime())); if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); if (eTag != null) this.mem.put(ETAG,eTag); if (sitemap != null) this.mem.put(SITEMAP,sitemap); - if (crawlDelay != null) this.mem.put(CRAWL_DELAY,crawlDelay.toString()); + if (crawlDelay != 0) this.mem.put(CRAWL_DELAY, Integer.toString(crawlDelay)); + + if ((allowPathList != null)&&(allowPathList.size()>0)) { + this.allowPathList.addAll(allowPathList); + + StringBuffer pathListStr = new StringBuffer(); + for (int i=0; i0)) { - this.disallowPathList.addAll(disallowPathList); + this.denyPathList.addAll(disallowPathList); StringBuffer pathListStr = new StringBuffer(); for (int i=0; i pathIter = this.disallowPathList.iterator(); + Iterator pathIter = this.denyPathList.iterator(); while (pathIter.hasNext()) { String nextPath = pathIter.next(); - // allow rule - if (nextPath.startsWith("!") && nextPath.length() > 1 && path.startsWith(nextPath.substring(1))) { - return false; - } // disallow rule if (path.startsWith(nextPath)) { diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java index 6276fada4..3914b477c 100644 --- a/source/de/anomic/crawler/robotsParser.java +++ b/source/de/anomic/crawler/robotsParser.java @@ -22,13 +22,13 @@ //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// extended to return structured objects instead of a Object[] and +// extended to return a Allow-List by Michael Christen, 21.07.2008 + package de.anomic.crawler; import java.io.BufferedReader; import java.io.ByteArrayInputStream; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URLDecoder; @@ -52,7 +52,9 @@ import java.util.ArrayList; * - Robot Exclusion Standard Revisited * See: http://www.kollar.com/robots.html */ -public final class robotsParser{ + +public final class robotsParser { + public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase(); public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase(); public static final String ROBOTS_ALLOW = "Allow:".toUpperCase(); @@ -60,41 +62,47 @@ public final class robotsParser{ public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase(); public static final String ROBOTS_CRAWL_DELAY = "Crawl-Delay:".toUpperCase(); - /*public robotsParser(URL robotsUrl){ - }*/ - /* - * this parses the robots.txt. - * at the Moment it only creates a list of Deny Paths - */ + private ArrayList allowList; + private ArrayList denyList; + private String sitemap; + private int crawlDelay; - public static Object[] parse(File robotsFile) { - BufferedReader reader = null; - try { - reader = new BufferedReader(new FileReader(robotsFile)); - if (reader != null) try{reader.close();}catch(Exception e){/* ignore this */} - return parse(reader); - } catch (FileNotFoundException e1) { + public robotsParser(byte[] robotsTxt) { + if ((robotsTxt == null)||(robotsTxt.length == 0)) { + allowList = new ArrayList(0); + denyList = new ArrayList(0); + sitemap = ""; + crawlDelay = 0; + } else { + ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt); + BufferedReader reader = new BufferedReader(new InputStreamReader(bin)); + parse(reader); } - return new Object[]{new ArrayList(), "", new Integer(0)}; } - @SuppressWarnings("unchecked") - public static Object[] parse(byte[] robotsTxt) { - if ((robotsTxt == null)||(robotsTxt.length == 0)) return new Object[]{new ArrayList(0),null,null}; - ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt); - BufferedReader reader = new BufferedReader(new InputStreamReader(bin)); - return parse(reader); + public robotsParser(BufferedReader reader) { + if (reader == null) { + allowList = new ArrayList(0); + denyList = new ArrayList(0); + sitemap = ""; + crawlDelay = 0; + } else { + parse(reader); + } } - public static Object[] parse(BufferedReader reader) { + private void parse(BufferedReader reader) { ArrayList deny4AllAgents = new ArrayList(); ArrayList deny4YaCyAgent = new ArrayList(); + ArrayList allow4AllAgents = new ArrayList(); + ArrayList allow4YaCyAgent = new ArrayList(); int pos; - String line = null, lineUpper = null, sitemap = null; - Integer crawlDelay = null; - boolean isRuleBlock4AllAgents = false, - isRuleBlock4YaCyAgent = false, + String line = null, lineUpper = null; + sitemap = null; + crawlDelay = 0; + boolean isRule4AllAgents = false, + isRule4YaCyAgent = false, rule4YaCyFound = false, inBlock = false; @@ -120,9 +128,9 @@ public final class robotsParser{ if (inBlock) { // we have detected the start of a new block inBlock = false; - isRuleBlock4AllAgents = false; - isRuleBlock4YaCyAgent = false; - crawlDelay = null; // each block has a separate delay + isRule4AllAgents = false; + isRule4YaCyAgent = false; + crawlDelay = 0; // each block has a separate delay } // cutting off comments at the line end @@ -136,15 +144,15 @@ public final class robotsParser{ pos = line.indexOf(" "); if (pos != -1) { String userAgent = line.substring(pos).trim(); - isRuleBlock4AllAgents |= userAgent.equals("*"); - isRuleBlock4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; - if (isRuleBlock4YaCyAgent) rule4YaCyFound = true; + isRule4AllAgents |= userAgent.equals("*"); + isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; + if (isRule4YaCyAgent) rule4YaCyFound = true; } } else if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) { pos = line.indexOf(" "); if (pos != -1) { try { - crawlDelay = Integer.valueOf(line.substring(pos).trim()); + crawlDelay = Integer.parseInt(line.substring(pos).trim()); } catch (NumberFormatException e) { // invalid crawling delay } @@ -154,7 +162,7 @@ public final class robotsParser{ inBlock = true; boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW); - if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) { + if (isRule4YaCyAgent || isRule4AllAgents) { // cutting off comments at the line end pos = line.indexOf(ROBOTS_COMMENT); if (pos != -1) line = line.substring(0,pos).trim(); @@ -185,17 +193,36 @@ public final class robotsParser{ path = path.replaceAll(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B"); // adding it to the pathlist - if (!isDisallowRule) path = "!" + path; - if (isRuleBlock4AllAgents) deny4AllAgents.add(path); - if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path); + if (isDisallowRule) { + if (isRule4AllAgents) deny4AllAgents.add(path); + if (isRule4YaCyAgent) deny4YaCyAgent.add(path); + } else { + if (isRule4AllAgents) allow4AllAgents.add(path); + if (isRule4YaCyAgent) allow4YaCyAgent.add(path); + } } } } } } catch (IOException e) {} - ArrayList denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents; - return new Object[]{denyList, sitemap, crawlDelay}; + allowList = (rule4YaCyFound) ? allow4YaCyAgent : allow4AllAgents; + denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents; } + public int crawlDelay() { + return this.crawlDelay; + } + + public String sitemap() { + return this.sitemap; + } + + public ArrayList allowList() { + return this.allowList; + } + + public ArrayList denyList() { + return this.denyList; + } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 5fbc6f32c..7e94fca34 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -86,7 +86,6 @@ package de.anomic.plasma; -import java.awt.GraphicsEnvironment; import java.io.File; import java.io.FileInputStream; import java.io.IOException; diff --git a/source/de/anomic/tools/consoleInterface.java b/source/de/anomic/tools/consoleInterface.java index 9b0749522..4695ac521 100755 --- a/source/de/anomic/tools/consoleInterface.java +++ b/source/de/anomic/tools/consoleInterface.java @@ -38,15 +38,15 @@ public class consoleInterface extends Thread /** * FIXME just for debugging */ - private final String name; + //private final String name; private serverLog log; - public consoleInterface (final InputStream stream, String name, serverLog log) + public consoleInterface(final InputStream stream, String name, serverLog log) { this.log = log; this.stream = stream; - this.name = name; + //this.name = name; // block reading {@see getOutput()} try { dataIsRead.acquire();