- fixed a bug in the robots.txt parser: the user-agent wildcard was compared as a char (userAgent.equals('*'), which is always false because the char autoboxes to a Character), so the rule for all agents never matched; it is now compared as the string "*"

- moved storage of robots.txt entries to WorkTables, so the robots entries can now be browsed with the table browser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6710 6c8d7289-2bf4-0310-a012-ef5d649a1542
author orbiter
parent 54af9e6b49
commit 727dd9b193

@@ -144,11 +144,7 @@ public class IndexControlRWIs_p {
segment.clear();
sb.crawlQueues.clear();
sb.crawlStacker.clear();
try {
sb.robots.clear();
} catch (final IOException e) {
Log.logException(e);
}
sb.robots.clear();
post.remove("deletecomplete");
}

@@ -7,7 +7,7 @@
//
//This file is contributed by Martin Thelian
// [MC] moved some methods from robotsParser file that had been created by Alexander Schier to this class
// [MC] redesign: removed entry object from RobotsTxt Class into ths separate class
// [MC] redesign: removed entry object from RobotsTxt Class into this separate class
//last major change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
@@ -31,14 +31,17 @@ package de.anomic.crawler;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
import net.yacy.kelondro.data.meta.DigestURI;
public class RobotsEntry {
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
public static final String HOST_NAME = "hostname";
public static final String ALLOW_PATH_LIST = "allow";
public static final String DISALLOW_PATH_LIST = "disallow";
public static final String LOADED_DATE = "date";
@@ -49,17 +52,17 @@ public class RobotsEntry {
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
// this is a simple record structure that holds all properties of a single crawl start
Map<String, String> mem;
private Map<String, byte[]> mem;
private LinkedList<String> allowPathList, denyPathList;
String hostName;
public RobotsEntry(final String hostName, final Map<String, String> mem) {
public RobotsEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase();
this.mem = mem;
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.denyPathList = new LinkedList<String>();
final String csPl = this.mem.get(DISALLOW_PATH_LIST);
final String csPl = new String(this.mem.get(DISALLOW_PATH_LIST));
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
@@ -71,7 +74,7 @@ public class RobotsEntry {
}
if (this.mem.containsKey(ALLOW_PATH_LIST)) {
this.allowPathList = new LinkedList<String>();
final String csPl = this.mem.get(ALLOW_PATH_LIST);
final String csPl = new String(this.mem.get(ALLOW_PATH_LIST));
if (csPl.length() > 0){
final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
if ((pathArray != null)&&(pathArray.length > 0)) {
@@ -84,7 +87,7 @@ public class RobotsEntry {
}
public RobotsEntry(
final String hostName,
final DigestURI theURL,
final ArrayList<String> allowPathList,
final ArrayList<String> disallowPathList,
final Date loadedDate,
@@ -93,18 +96,19 @@ public class RobotsEntry {
final String sitemap,
final long crawlDelayMillis
) {
if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing");
if (theURL == null) throw new IllegalArgumentException("The url is missing");
this.hostName = hostName.trim().toLowerCase();
this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.mem = new HashMap<String, String>(10);
if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime()));
if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime()));
if (eTag != null) this.mem.put(ETAG,eTag);
if (sitemap != null) this.mem.put(SITEMAP,sitemap);
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis));
this.mem = new LinkedHashMap<String, byte[]>(10);
this.mem.put(HOST_NAME, this.hostName.getBytes());
if (loadedDate != null) this.mem.put(LOADED_DATE, Long.toString(loadedDate.getTime()).getBytes());
if (modDate != null) this.mem.put(MOD_DATE, Long.toString(modDate.getTime()).getBytes());
if (eTag != null) this.mem.put(ETAG, eTag.getBytes());
if (sitemap != null) this.mem.put(SITEMAP, sitemap.getBytes());
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis).getBytes());
if (allowPathList != null && !allowPathList.isEmpty()) {
this.allowPathList.addAll(allowPathList);
@@ -114,7 +118,7 @@ public class RobotsEntry {
pathListStr.append(allowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
this.mem.put(ALLOW_PATH_LIST, pathListStr.substring(0,pathListStr.length()-1).getBytes());
}
if (disallowPathList != null && !disallowPathList.isEmpty()) {
@@ -125,61 +129,61 @@ public class RobotsEntry {
pathListStr.append(disallowPathList.get(i))
.append(ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0, pathListStr.length()-1).getBytes());
}
}
public Map<String, byte[]> getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, this.hostName.getBytes());
return this.mem;
}
public String toString() {
final StringBuilder str = new StringBuilder(6000);
str.append((this.hostName==null)?"null":this.hostName)
.append(": ");
if (this.mem != null) {
str.append(this.mem.toString());
}
str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
if (this.mem != null) str.append(this.mem.toString());
return str.toString();
}
public String getSitemap() {
return this.mem.containsKey(SITEMAP)? this.mem.get(SITEMAP): null;
return this.mem.containsKey(SITEMAP)? new String(this.mem.get(SITEMAP)): null;
}
public Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(Long.parseLong(this.mem.get(LOADED_DATE)));
return new Date(Long.parseLong(new String(this.mem.get(LOADED_DATE))));
}
return null;
}
public void setLoadedDate(final Date newLoadedDate) {
if (newLoadedDate != null) {
this.mem.put(LOADED_DATE,Long.toString(newLoadedDate.getTime()));
this.mem.put(LOADED_DATE, Long.toString(newLoadedDate.getTime()).getBytes());
}
}
public Date getModDate() {
if (this.mem.containsKey(MOD_DATE)) {
return new Date(Long.parseLong(this.mem.get(MOD_DATE)));
return new Date(Long.parseLong(new String(this.mem.get(MOD_DATE))));
}
return null;
}
public String getETag() {
if (this.mem.containsKey(ETAG)) {
return this.mem.get(ETAG);
return new String(this.mem.get(ETAG));
}
return null;
}
public long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return Long.parseLong(this.mem.get(CRAWL_DELAY_MILLIS));
return Long.parseLong(new String(this.mem.get(CRAWL_DELAY_MILLIS)));
} catch (final NumberFormatException e) {
return 0;
}
if (this.mem.containsKey(CRAWL_DELAY)) try {
return 1000 * Integer.parseInt(this.mem.get(CRAWL_DELAY));
return 1000 * Integer.parseInt(new String(this.mem.get(CRAWL_DELAY)));
} catch (final NumberFormatException e) {
return 0;
}
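The RobotsEntry hunks above switch the backing map from Map<String, String> to Map<String, byte[]> so an entry can be stored directly in a BEncodedHeap; values are round-tripped with getBytes() and new String(byte[]), which rely on the platform default charset. A minimal standalone sketch of that round-trip pattern (the class name and the explicit UTF-8 charset are my additions, not part of the commit; an explicit charset would make the encoding deterministic across platforms):

import java.nio.charset.StandardCharsets;
import java.util.LinkedHashMap;
import java.util.Map;

public class ByteMapRoundTrip {
    public static void main(final String[] args) {
        // properties are stored as raw bytes, as in the new RobotsEntry
        final Map<String, byte[]> mem = new LinkedHashMap<String, byte[]>();
        mem.put("date", Long.toString(System.currentTimeMillis())
                            .getBytes(StandardCharsets.UTF_8));
        // reading a property back requires decoding with the same charset
        final long loaded = Long.parseLong(
                new String(mem.get("date"), StandardCharsets.UTF_8));
        System.out.println(loaded);
    }
}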

@@ -28,7 +28,6 @@ package de.anomic.crawler;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
@@ -36,14 +35,12 @@ import java.util.Date;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.blob.BEncodedHeap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.http.client.Client;
@@ -56,8 +53,7 @@ public class RobotsTxt {
public static final String ROBOTS_DB_PATH_SEPARATOR = ";";
private static final Log log = new Log("ROBOTS");
MapHeap robotsTable;
private final File robotsTableFile;
BEncodedHeap robotsTable;
private final ConcurrentHashMap<String, DomSync> syncObjects;
//private static final HashSet<String> loadedRobots = new HashSet<String>(); // only for debugging
@@ -65,53 +61,29 @@ public class RobotsTxt {
public DomSync() {}
}
public RobotsTxt(final File robotsTableFile) {
this.robotsTableFile = robotsTableFile;
robotsTableFile.getParentFile().mkdirs();
try {
robotsTable = new MapHeap(robotsTableFile, 64, NaturalOrder.naturalOrder, 1024 * 1024, 100, '_');
} catch (final IOException e) {
Log.logException(e);
}
public RobotsTxt(final BEncodedHeap robotsTable) {
this.robotsTable = robotsTable;
syncObjects = new ConcurrentHashMap<String, DomSync>();
}
private void resetDatabase() {
// deletes the robots.txt database and creates a new one
if (robotsTable != null) robotsTable.close();
FileUtils.deletedelete(robotsTableFile);
robotsTableFile.getParentFile().mkdirs();
public void clear() {
try {
robotsTable = new MapHeap(robotsTableFile, 64, NaturalOrder.naturalOrder, 1024 * 1024, 100, '_');
} catch (final IOException e) {
Log.logException(e);
this.robotsTable.clear();
} catch (IOException e) {
}
syncObjects.clear();
}
public void clear() throws IOException {
this.robotsTable.clear();
}
public void close() {
this.robotsTable.close();
}
public int size() {
return this.robotsTable.size();
}
private RobotsEntry getEntry(final String urlHostPort, final boolean fetchOnlineIfNotAvailableOrNotFresh) {
private RobotsEntry getEntry(final DigestURI theURL, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
// this method will always return a non-null value
String urlHostPort = getHostPort(theURL);
RobotsEntry robotsTxt4Host = null;
try {
final Map<String, String> record = this.robotsTable.get(urlHostPort);
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
} catch (final kelondroException e) {
resetDatabase();
} catch (final IOException e) {
resetDatabase();
}
Map<String, byte[]> record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
if (fetchOnlineIfNotAvailableOrNotFresh && (
robotsTxt4Host == null ||
@@ -133,14 +105,8 @@ public class RobotsTxt {
// check the robots table again for all threads that come here because they waited for another one
// to complete a download
try {
final Map<String, String> record = this.robotsTable.get(urlHostPort);
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
} catch (final kelondroException e) {
resetDatabase();
} catch (final IOException e) {
resetDatabase();
}
record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort));
if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record);
if (robotsTxt4Host != null &&
robotsTxt4Host.getLoadedDate() != null &&
System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) {
@@ -178,7 +144,7 @@ public class RobotsTxt {
if (robotsTxt4Host == null) {
// generate artificial entry
robotsTxt4Host = new RobotsEntry(
urlHostPort,
robotsURL,
new ArrayList<String>(),
new ArrayList<String>(),
new Date(),
@@ -195,7 +161,7 @@ public class RobotsTxt {
addEntry(robotsTxt4Host);
if (this.robotsTable.size() <= sz) {
Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, resetting database");
this.resetDatabase();
this.clear();
addEntry(robotsTxt4Host);
}
} else {
@@ -208,7 +174,7 @@ public class RobotsTxt {
// store the data into the robots DB
robotsTxt4Host = addEntry(
urlHostPort,
robotsURL,
parserResult.allowList(),
denyPath,
new Date(),
@@ -224,7 +190,7 @@ public class RobotsTxt {
}
private RobotsEntry addEntry(
final String hostName,
final DigestURI theURL,
final ArrayList<String> allowPathList,
final ArrayList<String> denyPathList,
final Date loadedDate,
@@ -234,8 +200,9 @@ public class RobotsTxt {
final long crawlDelayMillis
) {
final RobotsEntry entry = new RobotsEntry(
hostName, allowPathList, denyPathList, loadedDate, modDate,
eTag, sitemap, crawlDelayMillis);
theURL, allowPathList, denyPathList,
loadedDate, modDate,
eTag, sitemap, crawlDelayMillis);
addEntry(entry);
return entry;
}
@@ -243,7 +210,7 @@ public class RobotsTxt {
private String addEntry(final RobotsEntry entry) {
// writes a new page and returns key
try {
this.robotsTable.put(entry.hostName, entry.mem);
this.robotsTable.put(this.robotsTable.encodedKey(entry.hostName), entry.getMem());
return entry.hostName;
} catch (final Exception e) {
Log.logException(e);
@@ -258,7 +225,7 @@ public class RobotsTxt {
public static final int DOWNLOAD_ETAG = 2;
public static final int DOWNLOAD_MODDATE = 3;
private static final String getHostPort(final DigestURI theURL) {
static final String getHostPort(final DigestURI theURL) {
String urlHostPort = null;
final int port = getPort(theURL);
urlHostPort = theURL.getHost() + ":" + port;
@@ -285,8 +252,12 @@ public class RobotsTxt {
DigestURI sitemapURL = null;
// generating the hostname:port string needed to do a DB lookup
final String urlHostPort = getHostPort(theURL);
final RobotsEntry robotsTxt4Host = this.getEntry(urlHostPort, true);
RobotsEntry robotsTxt4Host;
try {
robotsTxt4Host = this.getEntry(theURL, true);
} catch (IOException e1) {
return null;
}
try {
final String sitemapUrlStr = robotsTxt4Host.getSitemap();
@@ -297,9 +268,14 @@ public class RobotsTxt {
}
public Long getCrawlDelayMillis(final DigestURI theURL) {
if (theURL == null) throw new IllegalArgumentException();
final String urlHostPort = getHostPort(theURL);
final RobotsEntry robotsEntry = getEntry(urlHostPort, true);
if (theURL == null) throw new IllegalArgumentException();
RobotsEntry robotsEntry;
try {
robotsEntry = getEntry(theURL, true);
} catch (IOException e) {
Log.logException(e);
return new Long(0);
}
return robotsEntry.getCrawlDelayMillis();
}
@@ -307,9 +283,13 @@ public class RobotsTxt {
if (nexturl == null) throw new IllegalArgumentException();
// generating the hostname:port string needed to do a DB lookup
final String urlHostPort = getHostPort(nexturl);
RobotsEntry robotsTxt4Host = null;
robotsTxt4Host = getEntry(urlHostPort, true);
try {
robotsTxt4Host = getEntry(nexturl, true);
} catch (IOException e) {
Log.logException(e);
return false;
}
return robotsTxt4Host.isDisallowed(nexturl.getFile());
}

@@ -163,7 +163,7 @@ public final class robotsParser {
pos = line.indexOf(' ');
if (pos != -1) {
final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals('*');
isRule4AllAgents |= userAgent.equals("*");
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0;
if (isRule4YaCyAgent) rule4YaCyFound = true;
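The one-character change above is the parser bug from the commit message: String.equals takes an Object, so the char literal '*' is autoboxed to a Character, and a String never equals a Character, meaning isRule4AllAgents could never become true. A self-contained demonstration (the class name is mine):

public class EqualsCharPitfall {
    public static void main(final String[] args) {
        final String userAgent = "*";
        System.out.println(userAgent.equals('*')); // false: '*' autoboxes to Character
        System.out.println(userAgent.equals("*")); // true: the corrected comparison
    }
}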

@@ -47,6 +47,8 @@ public class WorkTables extends Tables {
public final static String TABLE_API_COL_DATE = "date";
public final static String TABLE_API_COL_URL = "url";
public final static String TABLE_ROBOTS_NAME = "robots";
public WorkTables(File workPath) {
super(workPath, 12);

@@ -217,7 +217,7 @@ public final class Switchboard extends serverSwitch {
public Dispatcher dhtDispatcher;
public List<String> trail;
public yacySeedDB peers;
public WorkTables tables;
public WorkTables tables;
public WorkflowProcessor<indexingQueueEntry> indexingDocumentProcessor;
public WorkflowProcessor<indexingQueueEntry> indexingCondensementProcessor;
@@ -419,7 +419,7 @@ public final class Switchboard extends serverSwitch {
// loading the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
final File robotsDBFile = new File(queuesRoot, "crawlRobotsTxt.heap");
robots = new RobotsTxt(robotsDBFile);
robots = new RobotsTxt(this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME));
this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() +
", " + robots.size() + " entries" +
", " + ppRamString(robotsDBFile.length()/1024));
@@ -775,7 +775,6 @@ public final class Switchboard extends serverSwitch {
this.crawlStacker.announceClose();
this.crawlStacker.close();
this.webStructure.close();
this.robots.close();
log.logInfo("SWITCH NETWORK: START UP OF NEW INDEX DATABASE...");
@@ -833,7 +832,6 @@ public final class Switchboard extends serverSwitch {
// load the robots.txt database
this.log.logConfig("Initializing robots.txt DB");
final File robotsDBFile = new File(this.queuesRoot, "crawlRobotsTxt.heap");
this.robots = new RobotsTxt(robotsDBFile);
this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() +
", " + robots.size() + " entries" +
", " + ppRamString(robotsDBFile.length()/1024));
@@ -1105,7 +1103,6 @@ public final class Switchboard extends serverSwitch {
userDB.close();
bookmarksDB.close();
messageDB.close();
robots.close();
webStructure.flushCitationReference("crg");
webStructure.close();
crawlQueues.close();

@@ -36,7 +36,9 @@ import java.util.Map.Entry;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.ByteOrder;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.BDecoder;
import net.yacy.kelondro.util.BEncoder;
@@ -81,6 +83,10 @@ public class BEncodedHeap implements Iterable<Map.Entry<byte[], Map<String, byte[]>>> {
this.columnames = new LinkedHashSet<String>();
}
public byte[] encodedKey(String key) {
return Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(key), this.table.keylength);
}
public File getFile() {
return this.table.heapFile;
}
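The new encodedKey method hashes the row key with MD5 and Base64-encodes the digest down to the table's fixed key length, which is why RobotsTxt now wraps every hostname lookup in robotsTable.encodedKey(...). A rough standalone approximation, assuming standard MD5 and RFC 4648 Base64 (YaCy's Base64Order.enhancedCoder uses a custom alphabet, so the actual bytes differ; class and method names here are hypothetical):

import java.security.MessageDigest;
import java.util.Base64;

public class KeyEncodingSketch {
    static byte[] encodedKey(final String key, final int keylength) throws Exception {
        // hash the key, then Base64-encode and truncate to the heap's key length
        final byte[] md5 = MessageDigest.getInstance("MD5").digest(key.getBytes("UTF-8"));
        final String b64 = Base64.getEncoder().withoutPadding().encodeToString(md5);
        return b64.substring(0, keylength).getBytes("UTF-8");
    }

    public static void main(final String[] args) throws Exception {
        System.out.println(new String(encodedKey("www.example.org:80", 12)));
    }
}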

@@ -112,7 +112,7 @@ public class Tables {
}
}
BEncodedHeap getHeap(final String tablename) throws IOException {
public BEncodedHeap getHeap(final String tablename) throws IOException {
final String table = tablename + suffix;
BEncodedHeap heap = this.tables.get(tablename);
if (heap != null) return heap;
