diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index c5dd47de9..de0f7b4de 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -144,11 +144,7 @@ public class IndexControlRWIs_p { segment.clear(); sb.crawlQueues.clear(); sb.crawlStacker.clear(); - try { - sb.robots.clear(); - } catch (final IOException e) { - Log.logException(e); - } + sb.robots.clear(); post.remove("deletecomplete"); } diff --git a/source/de/anomic/crawler/RobotsEntry.java b/source/de/anomic/crawler/RobotsEntry.java index 47029824e..9185f3e6b 100644 --- a/source/de/anomic/crawler/RobotsEntry.java +++ b/source/de/anomic/crawler/RobotsEntry.java @@ -7,7 +7,7 @@ // //This file is contributed by Martin Thelian // [MC] moved some methods from robotsParser file that had been created by Alexander Schier to this class -// [MC] redesign: removed entry object from RobotsTxt Class into ths separate class +// [MC] redesign: removed entry object from RobotsTxt Class into this separate class //last major change: $LastChangedDate$ by $LastChangedBy$ //Revision: $LastChangedRevision$ @@ -31,14 +31,17 @@ package de.anomic.crawler; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; -import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.Map; +import net.yacy.kelondro.data.meta.DigestURI; + public class RobotsEntry { public static final String ROBOTS_DB_PATH_SEPARATOR = ";"; + public static final String HOST_NAME = "hostname"; public static final String ALLOW_PATH_LIST = "allow"; public static final String DISALLOW_PATH_LIST = "disallow"; public static final String LOADED_DATE = "date"; @@ -49,17 +52,17 @@ public class RobotsEntry { public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; // this is a simple record structure that holds all properties of a single crawl start - Map mem; + private Map mem; private LinkedList allowPathList, denyPathList; String hostName; - public RobotsEntry(final String hostName, final Map mem) { + public RobotsEntry(final String hostName, final Map mem) { this.hostName = hostName.toLowerCase(); this.mem = mem; if (this.mem.containsKey(DISALLOW_PATH_LIST)) { this.denyPathList = new LinkedList(); - final String csPl = this.mem.get(DISALLOW_PATH_LIST); + final String csPl = new String(this.mem.get(DISALLOW_PATH_LIST)); if (csPl.length() > 0){ final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { @@ -71,7 +74,7 @@ public class RobotsEntry { } if (this.mem.containsKey(ALLOW_PATH_LIST)) { this.allowPathList = new LinkedList(); - final String csPl = this.mem.get(ALLOW_PATH_LIST); + final String csPl = new String(this.mem.get(ALLOW_PATH_LIST)); if (csPl.length() > 0){ final String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR); if ((pathArray != null)&&(pathArray.length > 0)) { @@ -84,7 +87,7 @@ public class RobotsEntry { } public RobotsEntry( - final String hostName, + final DigestURI theURL, final ArrayList allowPathList, final ArrayList disallowPathList, final Date loadedDate, @@ -93,18 +96,19 @@ public class RobotsEntry { final String sitemap, final long crawlDelayMillis ) { - if ((hostName == null) || (hostName.length() == 0)) throw new IllegalArgumentException("The hostname is missing"); + if (theURL == null) throw new IllegalArgumentException("The url is missing"); - this.hostName = hostName.trim().toLowerCase(); + this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase(); this.allowPathList = new LinkedList(); this.denyPathList = new LinkedList(); - this.mem = new HashMap(10); - if (loadedDate != null) this.mem.put(LOADED_DATE,Long.toString(loadedDate.getTime())); - if (modDate != null) this.mem.put(MOD_DATE,Long.toString(modDate.getTime())); - if (eTag != null) this.mem.put(ETAG,eTag); - if (sitemap != null) this.mem.put(SITEMAP,sitemap); - if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis)); + this.mem = new LinkedHashMap(10); + this.mem.put(HOST_NAME, this.hostName.getBytes()); + if (loadedDate != null) this.mem.put(LOADED_DATE, Long.toString(loadedDate.getTime()).getBytes()); + if (modDate != null) this.mem.put(MOD_DATE, Long.toString(modDate.getTime()).getBytes()); + if (eTag != null) this.mem.put(ETAG, eTag.getBytes()); + if (sitemap != null) this.mem.put(SITEMAP, sitemap.getBytes()); + if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, Long.toString(crawlDelayMillis).getBytes()); if (allowPathList != null && !allowPathList.isEmpty()) { this.allowPathList.addAll(allowPathList); @@ -114,7 +118,7 @@ public class RobotsEntry { pathListStr.append(allowPathList.get(i)) .append(ROBOTS_DB_PATH_SEPARATOR); } - this.mem.put(ALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1)); + this.mem.put(ALLOW_PATH_LIST, pathListStr.substring(0,pathListStr.length()-1).getBytes()); } if (disallowPathList != null && !disallowPathList.isEmpty()) { @@ -125,61 +129,61 @@ public class RobotsEntry { pathListStr.append(disallowPathList.get(i)) .append(ROBOTS_DB_PATH_SEPARATOR); } - this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1)); + this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0, pathListStr.length()-1).getBytes()); } } + public Map getMem() { + if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, this.hostName.getBytes()); + return this.mem; + } + public String toString() { final StringBuilder str = new StringBuilder(6000); - str.append((this.hostName==null)?"null":this.hostName) - .append(": "); - - if (this.mem != null) { - str.append(this.mem.toString()); - } - + str.append((this.hostName == null) ? "null" : this.hostName).append(": "); + if (this.mem != null) str.append(this.mem.toString()); return str.toString(); } public String getSitemap() { - return this.mem.containsKey(SITEMAP)? this.mem.get(SITEMAP): null; + return this.mem.containsKey(SITEMAP)? new String(this.mem.get(SITEMAP)): null; } public Date getLoadedDate() { if (this.mem.containsKey(LOADED_DATE)) { - return new Date(Long.parseLong(this.mem.get(LOADED_DATE))); + return new Date(Long.parseLong(new String(this.mem.get(LOADED_DATE)))); } return null; } public void setLoadedDate(final Date newLoadedDate) { if (newLoadedDate != null) { - this.mem.put(LOADED_DATE,Long.toString(newLoadedDate.getTime())); + this.mem.put(LOADED_DATE, Long.toString(newLoadedDate.getTime()).getBytes()); } } public Date getModDate() { if (this.mem.containsKey(MOD_DATE)) { - return new Date(Long.parseLong(this.mem.get(MOD_DATE))); + return new Date(Long.parseLong(new String(this.mem.get(MOD_DATE)))); } return null; } public String getETag() { if (this.mem.containsKey(ETAG)) { - return this.mem.get(ETAG); + return new String(this.mem.get(ETAG)); } return null; } public long getCrawlDelayMillis() { if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try { - return Long.parseLong(this.mem.get(CRAWL_DELAY_MILLIS)); + return Long.parseLong(new String(this.mem.get(CRAWL_DELAY_MILLIS))); } catch (final NumberFormatException e) { return 0; } if (this.mem.containsKey(CRAWL_DELAY)) try { - return 1000 * Integer.parseInt(this.mem.get(CRAWL_DELAY)); + return 1000 * Integer.parseInt(new String(this.mem.get(CRAWL_DELAY))); } catch (final NumberFormatException e) { return 0; } diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 7e9a42e64..3d1d7b092 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -28,7 +28,6 @@ package de.anomic.crawler; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; -import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; @@ -36,14 +35,12 @@ import java.util.Date; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.kelondro.blob.MapHeap; +import net.yacy.kelondro.blob.BEncodedHeap; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; -import net.yacy.kelondro.util.kelondroException; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.http.client.Client; @@ -56,8 +53,7 @@ public class RobotsTxt { public static final String ROBOTS_DB_PATH_SEPARATOR = ";"; private static final Log log = new Log("ROBOTS"); - MapHeap robotsTable; - private final File robotsTableFile; + BEncodedHeap robotsTable; private final ConcurrentHashMap syncObjects; //private static final HashSet loadedRobots = new HashSet(); // only for debugging @@ -65,53 +61,29 @@ public class RobotsTxt { public DomSync() {} } - public RobotsTxt(final File robotsTableFile) { - this.robotsTableFile = robotsTableFile; - robotsTableFile.getParentFile().mkdirs(); - try { - robotsTable = new MapHeap(robotsTableFile, 64, NaturalOrder.naturalOrder, 1024 * 1024, 100, '_'); - } catch (final IOException e) { - Log.logException(e); - } + public RobotsTxt(final BEncodedHeap robotsTable) { + this.robotsTable = robotsTable; syncObjects = new ConcurrentHashMap(); } - private void resetDatabase() { - // deletes the robots.txt database and creates a new one - if (robotsTable != null) robotsTable.close(); - FileUtils.deletedelete(robotsTableFile); - robotsTableFile.getParentFile().mkdirs(); + public void clear() { try { - robotsTable = new MapHeap(robotsTableFile, 64, NaturalOrder.naturalOrder, 1024 * 1024, 100, '_'); - } catch (final IOException e) { - Log.logException(e); + this.robotsTable.clear(); + } catch (IOException e) { } syncObjects.clear(); } - public void clear() throws IOException { - this.robotsTable.clear(); - } - - public void close() { - this.robotsTable.close(); - } - public int size() { return this.robotsTable.size(); } - private RobotsEntry getEntry(final String urlHostPort, final boolean fetchOnlineIfNotAvailableOrNotFresh) { + private RobotsEntry getEntry(final DigestURI theURL, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException { // this method will always return a non-null value + String urlHostPort = getHostPort(theURL); RobotsEntry robotsTxt4Host = null; - try { - final Map record = this.robotsTable.get(urlHostPort); - if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record); - } catch (final kelondroException e) { - resetDatabase(); - } catch (final IOException e) { - resetDatabase(); - } + Map record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort)); + if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record); if (fetchOnlineIfNotAvailableOrNotFresh && ( robotsTxt4Host == null || @@ -133,14 +105,8 @@ public class RobotsTxt { // check the robots table again for all threads that come here because they waited for another one // to complete a download - try { - final Map record = this.robotsTable.get(urlHostPort); - if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record); - } catch (final kelondroException e) { - resetDatabase(); - } catch (final IOException e) { - resetDatabase(); - } + record = this.robotsTable.get(this.robotsTable.encodedKey(urlHostPort)); + if (record != null) robotsTxt4Host = new RobotsEntry(urlHostPort, record); if (robotsTxt4Host != null && robotsTxt4Host.getLoadedDate() != null && System.currentTimeMillis() - robotsTxt4Host.getLoadedDate().getTime() <= 1*24*60*60*1000) { @@ -178,7 +144,7 @@ public class RobotsTxt { if (robotsTxt4Host == null) { // generate artificial entry robotsTxt4Host = new RobotsEntry( - urlHostPort, + robotsURL, new ArrayList(), new ArrayList(), new Date(), @@ -195,7 +161,7 @@ public class RobotsTxt { addEntry(robotsTxt4Host); if (this.robotsTable.size() <= sz) { Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, resetting database"); - this.resetDatabase(); + this.clear(); addEntry(robotsTxt4Host); } } else { @@ -208,7 +174,7 @@ public class RobotsTxt { // store the data into the robots DB robotsTxt4Host = addEntry( - urlHostPort, + robotsURL, parserResult.allowList(), denyPath, new Date(), @@ -224,7 +190,7 @@ public class RobotsTxt { } private RobotsEntry addEntry( - final String hostName, + final DigestURI theURL, final ArrayList allowPathList, final ArrayList denyPathList, final Date loadedDate, @@ -234,8 +200,9 @@ public class RobotsTxt { final long crawlDelayMillis ) { final RobotsEntry entry = new RobotsEntry( - hostName, allowPathList, denyPathList, loadedDate, modDate, - eTag, sitemap, crawlDelayMillis); + theURL, allowPathList, denyPathList, + loadedDate, modDate, + eTag, sitemap, crawlDelayMillis); addEntry(entry); return entry; } @@ -243,7 +210,7 @@ public class RobotsTxt { private String addEntry(final RobotsEntry entry) { // writes a new page and returns key try { - this.robotsTable.put(entry.hostName, entry.mem); + this.robotsTable.put(this.robotsTable.encodedKey(entry.hostName), entry.getMem()); return entry.hostName; } catch (final Exception e) { Log.logException(e); @@ -258,7 +225,7 @@ public class RobotsTxt { public static final int DOWNLOAD_ETAG = 2; public static final int DOWNLOAD_MODDATE = 3; - private static final String getHostPort(final DigestURI theURL) { + static final String getHostPort(final DigestURI theURL) { String urlHostPort = null; final int port = getPort(theURL); urlHostPort = theURL.getHost() + ":" + port; @@ -285,8 +252,12 @@ public class RobotsTxt { DigestURI sitemapURL = null; // generating the hostname:poart string needed to do a DB lookup - final String urlHostPort = getHostPort(theURL); - final RobotsEntry robotsTxt4Host = this.getEntry(urlHostPort, true); + RobotsEntry robotsTxt4Host; + try { + robotsTxt4Host = this.getEntry(theURL, true); + } catch (IOException e1) { + return null; + } try { final String sitemapUrlStr = robotsTxt4Host.getSitemap(); @@ -297,9 +268,14 @@ public class RobotsTxt { } public Long getCrawlDelayMillis(final DigestURI theURL) { - if (theURL == null) throw new IllegalArgumentException(); - final String urlHostPort = getHostPort(theURL); - final RobotsEntry robotsEntry = getEntry(urlHostPort, true); + if (theURL == null) throw new IllegalArgumentException(); + RobotsEntry robotsEntry; + try { + robotsEntry = getEntry(theURL, true); + } catch (IOException e) { + Log.logException(e); + return new Long(0); + } return robotsEntry.getCrawlDelayMillis(); } @@ -307,9 +283,13 @@ public class RobotsTxt { if (nexturl == null) throw new IllegalArgumentException(); // generating the hostname:port string needed to do a DB lookup - final String urlHostPort = getHostPort(nexturl); RobotsEntry robotsTxt4Host = null; - robotsTxt4Host = getEntry(urlHostPort, true); + try { + robotsTxt4Host = getEntry(nexturl, true); + } catch (IOException e) { + Log.logException(e); + return false; + } return robotsTxt4Host.isDisallowed(nexturl.getFile()); } diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java index dd5dc536b..61db83193 100644 --- a/source/de/anomic/crawler/robotsParser.java +++ b/source/de/anomic/crawler/robotsParser.java @@ -163,7 +163,7 @@ public final class robotsParser { pos = line.indexOf(' '); if (pos != -1) { final String userAgent = line.substring(pos).trim(); - isRule4AllAgents |= userAgent.equals('*'); + isRule4AllAgents |= userAgent.equals("*"); isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0; if (isRule4YaCyAgent) rule4YaCyFound = true; diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java index 4420ea2bc..7b499736f 100644 --- a/source/de/anomic/data/WorkTables.java +++ b/source/de/anomic/data/WorkTables.java @@ -47,6 +47,8 @@ public class WorkTables extends Tables { public final static String TABLE_API_COL_DATE = "date"; public final static String TABLE_API_COL_URL = "url"; + public final static String TABLE_ROBOTS_NAME = "robots"; + public WorkTables(File workPath) { super(workPath, 12); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 75ddffd04..34f63c569 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -217,7 +217,7 @@ public final class Switchboard extends serverSwitch { public Dispatcher dhtDispatcher; public List trail; public yacySeedDB peers; - public WorkTables tables; + public WorkTables tables; public WorkflowProcessor indexingDocumentProcessor; public WorkflowProcessor indexingCondensementProcessor; @@ -419,7 +419,7 @@ public final class Switchboard extends serverSwitch { // loading the robots.txt db this.log.logConfig("Initializing robots.txt DB"); final File robotsDBFile = new File(queuesRoot, "crawlRobotsTxt.heap"); - robots = new RobotsTxt(robotsDBFile); + robots = new RobotsTxt(this.tables.getHeap(WorkTables.TABLE_ROBOTS_NAME)); this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() + ", " + robots.size() + " entries" + ", " + ppRamString(robotsDBFile.length()/1024)); @@ -775,7 +775,6 @@ public final class Switchboard extends serverSwitch { this.crawlStacker.announceClose(); this.crawlStacker.close(); this.webStructure.close(); - this.robots.close(); log.logInfo("SWITCH NETWORK: START UP OF NEW INDEX DATABASE..."); @@ -833,7 +832,6 @@ public final class Switchboard extends serverSwitch { // load the robots.txt database this.log.logConfig("Initializing robots.txt DB"); final File robotsDBFile = new File(this.queuesRoot, "crawlRobotsTxt.heap"); - this.robots = new RobotsTxt(robotsDBFile); this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() + ", " + robots.size() + " entries" + ", " + ppRamString(robotsDBFile.length()/1024)); @@ -1105,7 +1103,6 @@ public final class Switchboard extends serverSwitch { userDB.close(); bookmarksDB.close(); messageDB.close(); - robots.close(); webStructure.flushCitationReference("crg"); webStructure.close(); crawlQueues.close(); diff --git a/source/net/yacy/kelondro/blob/BEncodedHeap.java b/source/net/yacy/kelondro/blob/BEncodedHeap.java index 8a4df1a2d..59033e8fa 100644 --- a/source/net/yacy/kelondro/blob/BEncodedHeap.java +++ b/source/net/yacy/kelondro/blob/BEncodedHeap.java @@ -36,7 +36,9 @@ import java.util.Map.Entry; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.ByteOrder; +import net.yacy.kelondro.order.Digest; import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.BDecoder; import net.yacy.kelondro.util.BEncoder; @@ -81,6 +83,10 @@ public class BEncodedHeap implements Iterable(); } + public byte[] encodedKey(String key) { + return Base64Order.enhancedCoder.encodeSubstring(Digest.encodeMD5Raw(key), this.table.keylength); + } + public File getFile() { return this.table.heapFile; } diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index 9e4dac9f3..6d2bee0c2 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -112,7 +112,7 @@ public class Tables { } } - BEncodedHeap getHeap(final String tablename) throws IOException { + public BEncodedHeap getHeap(final String tablename) throws IOException { final String table = tablename + suffix; BEncodedHeap heap = this.tables.get(tablename); if (heap != null) return heap;