//RobotsEntry.java
//-------------------------------------
//part of YACY
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
//This file is contributed by Martin Thelian
// [MC] moved some methods from robotsParser file that had been created by Alexander Schier to this class
// [MC] redesign: removed entry object from RobotsTxt Class into this separate class
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
// //You should have received a copy of the GNU General Public License //along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package de.anomic.crawler; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.Map; public class RobotsEntry { public static final String ROBOTS_DB_PATH_SEPARATOR = ";"; public static final String ALLOW_PATH_LIST = "allow"; public static final String DISALLOW_PATH_LIST = "disallow"; public static final String LOADED_DATE = "date"; public static final String MOD_DATE = "modDate"; public static final String ETAG = "etag"; public static final String SITEMAP = "sitemap"; public static final String CRAWL_DELAY = "crawlDelay"; public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; // this is a simple record structure that holds all properties of a single crawl start Map mem; private LinkedList allowPathList, denyPathList; String hostName; public RobotsEntry(final String hostName, final Map mem) { this.hostName = hostName.toLowerCase(); this.mem = mem; if (this.mem.containsKey(DISALLOW_PATH_LIST)) { this.denyPathList = new LinkedList(); final String csPl = this.mem.get(DISALLOW_PATH_LIST); if (csPl.length() > 0){ final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { this.denyPathList.addAll(Arrays.asList(pathArray)); } } } else { this.denyPathList = new LinkedList(); } if (this.mem.containsKey(ALLOW_PATH_LIST)) { this.allowPathList = new LinkedList(); final String csPl = this.mem.get(ALLOW_PATH_LIST); if (csPl.length() > 0){ final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { this.allowPathList.addAll(Arrays.asList(pathArray)); } } } else { this.allowPathList = new LinkedList(); } } public RobotsEntry( final 
String hostName, final ArrayList allowPathList, final ArrayList disallowPathList, final Date loadedDate, final Date modDate, final String eTag, final String sitemap, final long crawlDelayMillis ) { if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing"); this.hostName = hostName.trim().toLowerCase(); this.allowPathList = new LinkedList(); this.denyPathList = new LinkedList(); this.mem = new HashMap(10); if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime())); if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); if (eTag != null) this.mem.put(ETAG,eTag); if (sitemap != null) this.mem.put(SITEMAP,sitemap); if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis)); if (allowPathList != null && !allowPathList.isEmpty()) { this.allowPathList.addAll(allowPathList); final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30); for (int i=0; i pathIter = this.denyPathList.iterator(); while (pathIter.hasNext()) { final String nextPath = pathIter.next(); // disallow rule if (path.startsWith(nextPath)) { return true; } } return false; } }