Added handling of YaCy bot entries in robots.txt: if such an entry addresses the YaCy peer
(directly or indirectly) and grants a crawl-delay of 0, then all forced pause mechanisms in YaCy are switched off and the domain is crawled at full speed.

Crawl-delay values can be assigned to either
- all YaCy peers, using the user-agent yacybot,
- a specific peer with peer name <peer-name>.yacy, or
- a specific peer with peer hash <peer-hash>.yacyh
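
For illustration, a hypothetical robots.txt that grants such an exception (the peer name "myPeer" is made up). Since the parser stops after the first rule block that matches one of its own agent names, the peer-specific block is placed before the generic yacybot block:

# let the YaCy peer named "myPeer" crawl at full speed
User-agent: myPeer.yacy
Crawl-delay: 0

# throttle all other YaCy peers
User-agent: yacybot
Crawl-delay: 10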


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7639 6c8d7289-2bf4-0310-a012-ef5d649a1542
branch pull/1/head
orbiter committed 14 years ago
parent 21fe5e6c6a
commit b2fe4b7b1a

@@ -106,7 +106,7 @@ public class ConfigBasic {
if (oldSeed == null &&
!peerName.equals(sb.peers.mySeed().getName()) &&
Pattern.compile("[A-Za-z0-9\\-_]{3,80}").matcher(peerName).matches()) {
sb.peers.mySeed().setName(peerName);
sb.peers.setMyName(peerName);
sb.peers.saveMySeed();
}

@@ -9,6 +9,7 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.RobotsEntry;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -105,11 +106,17 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
prop.put("robots-allowed", sb.robots.isDisallowed(theURL) ? "0" : "1");
RobotsEntry robotsEntry;
try {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (IOException e) {
robotsEntry = null;
}
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
// get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = sb.robots.getSitemapURL(theURL);
prop.putXML("sitemap", (sitemapURL==null) ? "" : sitemapURL.toString());
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
} catch (final MalformedURLException e) {}
}

@@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
@@ -73,12 +74,14 @@ public class Balancer {
private long minimumGlobalDelta;
private long lastDomainStackFill;
private int domStackInitSize;
private Set<String> myAgentIDs;
public Balancer(
final File cachePath,
final String stackname,
final long minimumLocalDelta,
final long minimumGlobalDelta,
final Set<String> myAgentIDs,
final boolean useTailCache,
final boolean exceed134217727) {
this.cacheStacksPath = cachePath;
@@ -87,6 +90,7 @@ public class Balancer {
this.delayed = new TreeMap<Long, byte[]>();
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
this.myAgentIDs = myAgentIDs;
this.domStackInitSize = Integer.MAX_VALUE;
this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.double_push_check = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
@@ -411,7 +415,7 @@ public class Balancer {
sleeptime = (
profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), myAgentIDs, minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + UTF8.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + UTF8.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + UTF8.String(nexthash) + ", crawlEntry.url().hash() = " + UTF8.String(crawlEntry.url().hash());
@@ -450,7 +454,7 @@ public class Balancer {
// in the best case, this should never happen if the balancer works properly
// this is only a protection against the worst case, where the crawler could
// behave in a DoS-manner
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), myAgentIDs, minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 3) {

@@ -78,7 +78,7 @@ public class CrawlQueues {
// start crawling management
log.logConfig("Starting Crawling Management");
noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727);
noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
errorURL = new ZURL(queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
@@ -90,7 +90,7 @@ public class CrawlQueues {
this.workers = new ConcurrentHashMap<Integer, Loader>();
this.remoteCrawlProviderHashes.clear();
noticeURL = new NoticedURL(newQueuePath, sb.useTailCache, sb.exceed134217727);
noticeURL = new NoticedURL(newQueuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
errorURL = new ZURL(newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
@@ -571,7 +571,10 @@ public class CrawlQueues {
try {
// checking robots.txt for http(s) resources
this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) {
RobotsEntry robotsEntry;
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) &&
(robotsEntry = sb.robots.getEntry(request.url(), sb.peers.myBotIDs())) != null &&
robotsEntry.isDisallowed(request.url())) {
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
errorURL.push(
this.request,

@@ -23,8 +23,10 @@
package de.anomic.crawler;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
@@ -159,7 +161,7 @@ public class Latency {
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public static long waitingRemaining(MultiProtocolURI url, final long minimumLocalDelta, final long minimumGlobalDelta) {
public static long waitingRemaining(MultiProtocolURI url, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
Host host = host(url);
@@ -171,7 +173,7 @@ public class Latency {
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
@@ -182,13 +184,23 @@ public class Latency {
if (!local && host != null) waiting += host.flux(waiting);
// find the delay as given by robots.txt on target site
long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
long robotsDelay = 0;
if (!local) {
RobotsEntry robotsEntry;
try {
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
} catch (IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0) return 0; // no limits if granted exclusively for this peer
}
waiting = Math.max(waiting, robotsDelay);
// use the access latency as a rule for how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
if (host != null) waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
// prevent that a robots file can stop our indexer completely
waiting = Math.min(60000, waiting);
@@ -199,7 +211,7 @@ public class Latency {
}
public static String waitingRemainingExplain(MultiProtocolURI url, final long minimumLocalDelta, final long minimumGlobalDelta) {
public static String waitingRemainingExplain(MultiProtocolURI url, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
Host host = host(url);
@@ -225,7 +237,17 @@ public class Latency {
if (!local && host != null) s.append(", flux = ").append(host.flux(waiting));
// find the delay as given by robots.txt on target site
long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
long robotsDelay = 0;
if (!local) {
RobotsEntry robotsEntry;
try {
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
} catch (IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
}
s.append(", robots.delay = ").append(robotsDelay);
// use the access latency as a rule for how fast we can access the server
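
Condensed, the new delay resolution in waitingRemaining amounts to the sketch below. This is a paraphrase of the hunks above for the remote (non-local) case, with the CGI and flux adjustments left out; it is a sketch, not the actual method:

// minimal sketch of the delay precedence, assuming all values are already looked up
public final class LatencySketch {
    public static long waitingRemaining(boolean robotsEntryPresent, long robotsDelayMillis,
                                        long minimumGlobalDelta, long hostAverage,
                                        long timeSinceLastAccess) {
        // a robots.txt entry addressing this peer with crawl-delay 0 lifts all forced pauses
        if (robotsEntryPresent && robotsDelayMillis == 0) return 0;
        long waiting = minimumGlobalDelta;              // configured minimum for remote hosts
        waiting = Math.max(waiting, robotsDelayMillis); // respect the robots.txt crawl-delay
        waiting = Math.max(waiting, hostAverage * 2);   // adapt to the observed server latency
        waiting = Math.min(60000, waiting);             // a robots file must not stall the indexer
        return waiting - timeSinceLastAccess;           // may be <= 0, i.e. no further waiting needed
    }
}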

@@ -32,6 +32,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@@ -56,14 +57,15 @@ public class NoticedURL {
public NoticedURL(
final File cachePath,
final Set<String> myAgentIDs,
final boolean useTailCache,
final boolean exceed134217727) {
Log.logInfo("NoticedURL", "CREATING STACKS at " + cachePath.toString());
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
}
public long getMinimumLocalDelta() {

@@ -28,6 +28,7 @@
package de.anomic.crawler;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedHashMap;
@@ -53,9 +54,9 @@ public class RobotsEntry {
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
// this is a simple record structure that holds all properties of a single crawl start
private Map<String, byte[]> mem;
private List<String> allowPathList, denyPathList;
String hostName;
private final Map<String, byte[]> mem;
private final List<String> allowPathList, denyPathList;
private final String hostName;
public RobotsEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase();
@@ -134,6 +135,10 @@ public class RobotsEntry {
}
}
public String getHostName() {
return this.hostName;
}
public Map<String, byte[]> getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, this.hostName.getBytes());
return this.mem;
@@ -147,8 +152,18 @@ public class RobotsEntry {
return str.toString();
}
public String getSitemap() {
return this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
/**
* get the sitemap url
* @return the sitemap url or null if no sitemap url is given
*/
public MultiProtocolURI getSitemap() {
String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
if (url == null) return null;
try {
return new MultiProtocolURI(url);
} catch (MalformedURLException e) {
return null;
}
}
public Date getLoadedDate() {
@@ -192,7 +207,8 @@ public class RobotsEntry {
return 0;
}
public boolean isDisallowed(String path) {
public boolean isDisallowed(MultiProtocolURI subpathURL) {
String path = subpathURL.getFile();
if ((this.mem == null) || (this.denyPathList.isEmpty())) return false;
// if the path is null or empty we set it to /

@@ -31,6 +31,7 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
@@ -76,8 +77,14 @@ public class RobotsTxt {
return this.robotsTable.size();
}
private RobotsEntry getEntry(final MultiProtocolURI theURL, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
// this method will always return a non-null value
public RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(theURL, thisAgents, true);
}
private RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
// this method will always return a non-null value
String urlHostPort = getHostPort(theURL);
RobotsEntry robotsTxt4Host = null;
Map<String, byte[]> record;
@@ -174,7 +181,7 @@ public class RobotsTxt {
addEntry(robotsTxt4Host);
}
} else {
final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
ArrayList<String> denyPath = parserResult.denyList();
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
@@ -219,8 +226,8 @@ public class RobotsTxt {
private String addEntry(final RobotsEntry entry) {
// writes a new page and returns key
try {
this.robotsTable.insert(this.robotsTable.encodedKey(entry.hostName), entry.getMem());
return entry.hostName;
this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());
return entry.getHostName();
} catch (final Exception e) {
log.warn("cannot write robots.txt entry", e);
return null;
@@ -255,57 +262,7 @@ public class RobotsTxt {
}
return port;
}
public MultiProtocolURI getSitemapURL(final MultiProtocolURI theURL) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
MultiProtocolURI sitemapURL = null;
// generating the hostname:port string needed to do a DB lookup
RobotsEntry robotsTxt4Host;
try {
robotsTxt4Host = this.getEntry(theURL, true);
} catch (IOException e1) {
return null;
}
try {
final String sitemapUrlStr = robotsTxt4Host.getSitemap();
if (sitemapUrlStr != null) sitemapURL = new MultiProtocolURI(sitemapUrlStr);
} catch (final MalformedURLException e) {/* ignore this */}
return sitemapURL;
}
public long getCrawlDelayMillis(final MultiProtocolURI theURL) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return 0;
RobotsEntry robotsEntry;
try {
robotsEntry = getEntry(theURL, true);
} catch (IOException e) {
log.warn("cannot load robots.txt entry", e);
return 0;
}
return robotsEntry.getCrawlDelayMillis();
}
public boolean isDisallowed(final MultiProtocolURI nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
if (!nexturl.getProtocol().startsWith("http")) return false;
// generating the hostname:port string needed to do a DB lookup
RobotsEntry robotsTxt4Host = null;
try {
robotsTxt4Host = getEntry(nexturl, true);
} catch (IOException e) {
log.warn("cannot load robots.txt entry", e);
return false;
}
return robotsTxt4Host.isDisallowed(nexturl.getFile());
}
private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception {
if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;

@@ -1,29 +1,33 @@
//robotsParser.java
//-------------------------------------
//part of YACY
//
//(C) 2005, 2006 by Alexander Schier
// Martin Thelian
//
//last change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/*
robotsParser.java
-------------------------------------
part of YACY
(C) 2005, 2006 by Alexander Schier
Martin Thelian
last change: $LastChangedDate$LastChangedBy: orbiter $
Revision: $LastChangedRevision$
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// extended to return structured objects instead of a Object[] and
// extended to return a Allow-List by Michael Christen, 21.07.2008
extended to return structured objects instead of an Object[] and
extended to return an Allow-List by Michael Christen, 21.07.2008
extended to allow multiple user agents given by definition and
returning the used user agent by Michael Christen, 3.4.2011
*/
package de.anomic.crawler;
@@ -33,6 +37,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
/*
@@ -65,48 +70,48 @@ public final class robotsParser {
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
private ArrayList<String> allowList;
private ArrayList<String> denyList;
private String sitemap;
private long crawlDelayMillis;
private final ArrayList<String> allowList;
private final ArrayList<String> denyList;
private String sitemap;
private long crawlDelayMillis;
private final Set<String> myNames; // a list of own name lists
private String agentName; // the name of the agent that was used to return the result
public robotsParser(final byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelayMillis = 0;
} else {
public robotsParser(final byte[] robotsTxt, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
this.sitemap = "";
this.crawlDelayMillis = 0;
this.myNames = myNames;
this.agentName = null;
if (robotsTxt != null && robotsTxt.length != 0) {
final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
final BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
parse(reader);
}
}
public robotsParser(final BufferedReader reader) {
if (reader == null) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelayMillis = 0;
} else {
parse(reader);
}
public robotsParser(final BufferedReader reader, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
this.sitemap = "";
this.crawlDelayMillis = 0;
this.myNames = myNames;
this.agentName = null;
if (reader != null) parse(reader);
}
private void parse(final BufferedReader reader) {
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
final ArrayList<String> deny4YaCyAgent = new ArrayList<String>();
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();
final ArrayList<String> allow4AllAgents = new ArrayList<String>();
final ArrayList<String> allow4YaCyAgent = new ArrayList<String>();
final ArrayList<String> allow4ThisAgents = new ArrayList<String>();
int pos;
String line = null, lineUpper = null;
sitemap = null;
crawlDelayMillis = 0;
boolean isRule4AllAgents = false,
isRule4YaCyAgent = false,
rule4YaCyFound = false,
isRule4ThisAgents = false,
rule4ThisAgentsFound = false,
inBlock = false;
try {
@@ -118,7 +123,7 @@ public final class robotsParser {
// parse empty line
if (line.length() == 0) {
// we have reached the end of the rule block
if (rule4YaCyFound) {
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
@@ -147,7 +152,7 @@ public final class robotsParser {
if (inBlock) {
// we have detected the start of a new block
if (rule4YaCyFound) {
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
@@ -155,7 +160,7 @@ public final class robotsParser {
inBlock = false;
isRule4AllAgents = false;
isRule4YaCyAgent = false;
isRule4ThisAgents = false;
crawlDelayMillis = 0; // each block has a separate delay
}
@@ -168,9 +173,14 @@ public final class robotsParser {
if (pos != -1) {
final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals("*");
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0;
if (isRule4YaCyAgent) rule4YaCyFound = true;
for (String agent: this.myNames) {
if (userAgent.toLowerCase().indexOf(agent) >= 0) {
this.agentName = agent;
isRule4ThisAgents = true;
break;
}
}
if (isRule4ThisAgents) rule4ThisAgentsFound = true;
}
continue lineparser;
}
@@ -178,7 +188,7 @@ public final class robotsParser {
// parse crawl delay
if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
inBlock = true;
if (isRule4YaCyAgent || isRule4AllAgents) {
if (isRule4ThisAgents || isRule4AllAgents) {
pos = line.indexOf(' ');
if (pos != -1) {
try {
@@ -197,7 +207,7 @@ public final class robotsParser {
inBlock = true;
final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRule4YaCyAgent || isRule4AllAgents) {
if (isRule4ThisAgents || isRule4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
@@ -227,10 +237,10 @@ public final class robotsParser {
// adding it to the pathlist
if (isDisallowRule) {
if (isRule4AllAgents) deny4AllAgents.add(path);
if (isRule4YaCyAgent) deny4YaCyAgent.add(path);
if (isRule4ThisAgents) deny4ThisAgents.add(path);
} else {
if (isRule4AllAgents) allow4AllAgents.add(path);
if (isRule4YaCyAgent) allow4YaCyAgent.add(path);
if (isRule4ThisAgents) allow4ThisAgents.add(path);
}
}
}
@@ -239,14 +249,32 @@ public final class robotsParser {
}
} catch (final IOException e) {}
allowList = (rule4YaCyFound) ? allow4YaCyAgent : allow4AllAgents;
denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
}
/**
* a crawl delay can be assigned to a specific agent or to all agents;
* a special case is when the user agent of this yacy peer is given explicitly
* using the peer name: if the crawl delay is then given as '0', the crawler
* does not force any anti-DoS crawl pause.
* @return the crawl delay between two crawl access times in milliseconds
*/
public long crawlDelayMillis() {
return this.crawlDelayMillis;
}
/**
* the user agent that was applied to get the crawl properties is recorded,
* because this robots.txt parser may apply to several user agents,
* e.g. 'yacy', 'yacybot', '<peer-name>.yacy' or '<peer-hash>.yacyh'
* Effects: see also the comment to crawlDelayMillis()
* @return the name of the user agent that was used for the result properties, or null if no user agent name was used to identify the agent
*/
public String agentName() {
return this.agentName;
}
public String sitemap() {
return this.sitemap;
}
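
A minimal usage sketch of the extended parser. The agent set is built by hand here; in YaCy it comes from yacySeedDB.myBotIDs(), and the demo class and the peer name "mypeer" are made up:

import java.util.HashSet;
import java.util.Set;
import de.anomic.crawler.robotsParser;

public final class RobotsParserDemo {
    public static void main(String[] args) {
        // a robots.txt that addresses one peer directly and grants full speed
        byte[] robotsTxt = "User-agent: mypeer.yacy\nCrawl-delay: 0\n".getBytes();
        Set<String> myNames = new HashSet<String>();
        myNames.add("yacybot");
        myNames.add("mypeer.yacy"); // names are matched against the lower-cased user-agent line
        robotsParser parser = new robotsParser(robotsTxt, myNames);
        System.out.println(parser.crawlDelayMillis()); // should print 0: full speed granted
        System.out.println(parser.agentName());        // the agent name that matched the block
    }
}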

@@ -99,7 +99,7 @@ import de.anomic.server.serverObjects;
public final class HTTPDProxyHandler {
public static final String yacyUserAgent = "yacy (" + MultiProtocolURI.systemOST +") yacy.net";
public static final String yacyUserAgent = "yacyproxy (" + MultiProtocolURI.systemOST +") http://yacy.net/bot.html";
// static variables
// can only be instantiated upon first instantiation of this class object

@@ -32,9 +32,11 @@ import java.io.PrintWriter;
import java.lang.ref.SoftReference;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
@@ -95,7 +97,7 @@ public final class yacySeedDB implements AlternativeDomainNames {
public PartitionScheme scheme;
private yacySeed mySeed; // my own seed
private Set<String> myBotIDs; // list of id's that this bot accepts as robots.txt identification
private final Hashtable<String, String> nameLookupCache; // a name-to-hash relation
private final Hashtable<InetAddress, SoftReference<yacySeed>> ipLookupCache;
@@ -114,6 +116,9 @@ public final class yacySeedDB implements AlternativeDomainNames {
this.seedPotentialDBFile = new File(networkRoot, seedPotentialDBFileName);
this.mySeed = null; // my own seed
this.myOwnSeedFile = myOwnSeedFile;
this.myBotIDs = new HashSet<String>();
this.myBotIDs.add("yacy");
this.myBotIDs.add("yacybot");
this.netRedundancy = redundancy;
this.scheme = new VerticalWordPartitionScheme(partitionExponent);
@@ -161,13 +166,15 @@ public final class yacySeedDB implements AlternativeDomainNames {
this.seedActiveDBFile = new File(newNetworkRoot, seedActiveDBFile.getName());
this.seedPassiveDBFile = new File(newNetworkRoot, seedPassiveDBFile.getName());
this.seedPotentialDBFile = new File(newNetworkRoot, seedPotentialDBFile.getName());
// read current peer name
String peername = this.myName();
// replace my (old) seed with new seed definition from other network
// but keep the seed name
String peername = this.myName();
this.mySeed = null; // my own seed
this.myOwnSeedFile = new File(newNetworkRoot, yacySeedDB.DBFILE_OWN_SEED);
initMySeed();
this.mySeed.setName(peername);
this.netRedundancy = redundancy;
this.scheme = new VerticalWordPartitionScheme(partitionExponent);
@@ -228,11 +235,16 @@ public final class yacySeedDB implements AlternativeDomainNames {
System.exit(-1);
}
}
this.myBotIDs.add(this.mySeed.getName() + ".yacy");
this.myBotIDs.add(this.mySeed.hash + ".yacyh");
mySeed.setIP(""); // we delete the old information to see what we have now
mySeed.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN); // markup startup condition
}
public Set<String> myBotIDs() {
return this.myBotIDs;
}
public int redundancy() {
if (this.mySeed.isJunior()) return 1;
return this.netRedundancy;
@@ -250,6 +262,12 @@ public final class yacySeedDB implements AlternativeDomainNames {
return this.mySeed;
}
public void setMyName(String name) {
this.myBotIDs.remove(this.mySeed.getName() + ".yacy");
this.mySeed.setName(name);
this.myBotIDs.add(name + ".yacy");
}
public String myAlternativeAddress() {
return mySeed().getName() + ".yacy";
}
