diff --git a/htroot/ConfigBasic.java b/htroot/ConfigBasic.java index 408a2d28d..f397cae34 100644 --- a/htroot/ConfigBasic.java +++ b/htroot/ConfigBasic.java @@ -106,7 +106,7 @@ public class ConfigBasic { if (oldSeed == null && !peerName.equals(sb.peers.mySeed().getName()) && Pattern.compile("[A-Za-z0-9\\-_]{3,80}").matcher(peerName).matches()) { - sb.peers.mySeed().setName(peerName); + sb.peers.setMyName(peerName); sb.peers.saveMySeed(); } diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index 026769b9c..5e2662334 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -9,6 +9,7 @@ import net.yacy.document.parser.html.ContentScraper; import net.yacy.kelondro.data.meta.DigestURI; import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.RobotsEntry; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -105,11 +106,17 @@ public class getpageinfo_p { final DigestURI theURL = new DigestURI(url); // determine if crawling of the current URL is allowed - prop.put("robots-allowed", sb.robots.isDisallowed(theURL) ? "0" : "1"); - + RobotsEntry robotsEntry; + try { + robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs()); + } catch (IOException e) { + robotsEntry = null; + } + prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1); + // get the sitemap URL of the domain - final MultiProtocolURI sitemapURL = sb.robots.getSitemapURL(theURL); - prop.putXML("sitemap", (sitemapURL==null) ? "" : sitemapURL.toString()); + final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); + prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString()); } catch (final MalformedURLException e) {} } diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index 0ab788020..ae75fa612 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; @@ -73,12 +74,14 @@ public class Balancer { private long minimumGlobalDelta; private long lastDomainStackFill; private int domStackInitSize; + private Set myAgentIDs; public Balancer( final File cachePath, final String stackname, final long minimumLocalDelta, final long minimumGlobalDelta, + final Set myAgentIDs, final boolean useTailCache, final boolean exceed134217727) { this.cacheStacksPath = cachePath; @@ -87,6 +90,7 @@ public class Balancer { this.delayed = new TreeMap(); this.minimumLocalDelta = minimumLocalDelta; this.minimumGlobalDelta = minimumGlobalDelta; + this.myAgentIDs = myAgentIDs; this.domStackInitSize = Integer.MAX_VALUE; this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); this.double_push_check = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); @@ -411,7 +415,7 @@ public class Balancer { sleeptime = ( profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.CACHEONLY || (profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.IFEXIST && Cache.has(crawlEntry.url())) - ) ? 
0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server + ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), myAgentIDs, minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + UTF8.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + UTF8.String(rowEntry.getPrimaryKeyBytes()); assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + UTF8.String(nexthash) + ", crawlEntry.url().hash() = " + UTF8.String(crawlEntry.url().hash()); @@ -450,7 +454,7 @@ public class Balancer { // in best case, this should never happen if the balancer works propertly // this is only to protection against the worst case, where the crawler could // behave in a DoS-manner - Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize); + Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), myAgentIDs, minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize); long loops = sleeptime / 1000; long rest = sleeptime % 1000; if (loops < 3) { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 9b0d795f1..13fcd7b65 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -78,7 +78,7 @@ public class CrawlQueues { // start crawling management log.logConfig("Starting Crawling Management"); - noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727); + noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727); FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME)); errorURL = new ZURL(queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); delegatedURL = new ZURL(queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); @@ -90,7 +90,7 @@ public class CrawlQueues { this.workers = new ConcurrentHashMap(); this.remoteCrawlProviderHashes.clear(); - noticeURL = new NoticedURL(newQueuePath, sb.useTailCache, sb.exceed134217727); + noticeURL = new NoticedURL(newQueuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727); FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME)); errorURL = new ZURL(newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727); delegatedURL = new ZURL(newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727); @@ -571,7 +571,10 @@ public class CrawlQueues { try { // checking robots.txt for http(s) resources this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED); - if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) { + 
RobotsEntry robotsEntry; + if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && + (robotsEntry = sb.robots.getEntry(request.url(), sb.peers.myBotIDs())) != null && + robotsEntry.isDisallowed(request.url())) { //if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt."); errorURL.push( this.request, diff --git a/source/de/anomic/crawler/Latency.java b/source/de/anomic/crawler/Latency.java index 156699dc2..3989209da 100644 --- a/source/de/anomic/crawler/Latency.java +++ b/source/de/anomic/crawler/Latency.java @@ -23,8 +23,10 @@ package de.anomic.crawler; +import java.io.IOException; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.document.MultiProtocolURI; @@ -159,7 +161,7 @@ public class Latency { * @param minimumGlobalDelta * @return the remaining waiting time in milliseconds */ - public static long waitingRemaining(MultiProtocolURI url, final long minimumLocalDelta, final long minimumGlobalDelta) { + public static long waitingRemaining(MultiProtocolURI url, final Set thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) { // first check if the domain was _ever_ accessed before Host host = host(url); @@ -171,7 +173,7 @@ public class Latency { long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta; // the time since last access to the domain is the basis of the remaining calculation - final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc(); + final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc(); // for CGI accesses, we double the minimum time // mostly there is a database access in the background @@ -182,13 +184,23 @@ public class Latency { if (!local && host != null) waiting += host.flux(waiting); // find the delay as given by robots.txt on target site - long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url); + long robotsDelay = 0; + if (!local) { + RobotsEntry robotsEntry; + try { + robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents); + } catch (IOException e) { + robotsEntry = null; + } + robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); + if (robotsEntry != null && robotsDelay == 0) return 0; // no limits if granted exclusively for this peer + } waiting = Math.max(waiting, robotsDelay); // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses - if (host != null) waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2); + waiting = Math.max(waiting, (local) ? 
host.average() / 2 : host.average() * 2); // prevent that that a robots file can stop our indexer completely waiting = Math.min(60000, waiting); @@ -199,7 +211,7 @@ public class Latency { } - public static String waitingRemainingExplain(MultiProtocolURI url, final long minimumLocalDelta, final long minimumGlobalDelta) { + public static String waitingRemainingExplain(MultiProtocolURI url, final Set thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) { // first check if the domain was _ever_ accessed before Host host = host(url); @@ -225,7 +237,17 @@ public class Latency { if (!local && host != null) s.append(", flux = ").append(host.flux(waiting)); // find the delay as given by robots.txt on target site - long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url); + long robotsDelay = 0; + if (!local) { + RobotsEntry robotsEntry; + try { + robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents); + } catch (IOException e) { + robotsEntry = null; + } + robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis(); + if (robotsEntry != null && robotsDelay == 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer + } s.append(", robots.delay = ").append(robotsDelay); // use the access latency as rule how fast we can access the server diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index 51bcb16f8..2cc6bbf99 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -32,6 +32,7 @@ import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Set; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -56,14 +57,15 @@ public class NoticedURL { public NoticedURL( final File cachePath, + final Set myAgentIDs, final boolean useTailCache, final boolean exceed134217727) { Log.logInfo("NoticedURL", "CREATING STACKS at " + cachePath.toString()); - this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727); - this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727); + this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727); + this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727); //overhangStack = new plasmaCrawlBalancer(overhangStackFile); - this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727); - this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727); + this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727); + this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727); } public long getMinimumLocalDelta() { diff --git a/source/de/anomic/crawler/RobotsEntry.java b/source/de/anomic/crawler/RobotsEntry.java index dc6a89d79..8cf02a238 100644 
--- a/source/de/anomic/crawler/RobotsEntry.java +++ b/source/de/anomic/crawler/RobotsEntry.java @@ -28,6 +28,7 @@ package de.anomic.crawler; +import java.net.MalformedURLException; import java.util.Arrays; import java.util.Date; import java.util.LinkedHashMap; @@ -53,9 +54,9 @@ public class RobotsEntry { public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; // this is a simple record structure that holds all properties of a single crawl start - private Map mem; - private List allowPathList, denyPathList; - String hostName; + private final Map mem; + private final List allowPathList, denyPathList; + private final String hostName; public RobotsEntry(final String hostName, final Map mem) { this.hostName = hostName.toLowerCase(); @@ -134,6 +135,10 @@ public class RobotsEntry { } } + public String getHostName() { + return this.hostName; + } + public Map getMem() { if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, this.hostName.getBytes()); return this.mem; @@ -147,8 +152,18 @@ public class RobotsEntry { return str.toString(); } - public String getSitemap() { - return this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null; + /** + * get the sitemap url + * @return the sitemap url or null if no sitemap url is given + */ + public MultiProtocolURI getSitemap() { + String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null; + if (url == null) return null; + try { + return new MultiProtocolURI(url); + } catch (MalformedURLException e) { + return null; + } } public Date getLoadedDate() { @@ -192,7 +207,8 @@ public class RobotsEntry { return 0; } - public boolean isDisallowed(String path) { + public boolean isDisallowed(MultiProtocolURI subpathURL) { + String path = subpathURL.getFile(); if ((this.mem == null) || (this.denyPathList.isEmpty())) return false; // if the path is null or empty we set it to / diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index 1d06ea023..bbb461bf9 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -31,6 +31,7 @@ import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Date; import java.util.Map; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; @@ -76,8 +77,14 @@ public class RobotsTxt { return this.robotsTable.size(); } - private RobotsEntry getEntry(final MultiProtocolURI theURL, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException { - // this method will always return a non-null value + public RobotsEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents) throws IOException { + if (theURL == null) throw new IllegalArgumentException(); + if (!theURL.getProtocol().startsWith("http")) return null; + return getEntry(theURL, thisAgents, true); + } + + private RobotsEntry getEntry(final MultiProtocolURI theURL, final Set thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException { + // this method will always return a non-null value String urlHostPort = getHostPort(theURL); RobotsEntry robotsTxt4Host = null; Map record; @@ -174,7 +181,7 @@ public class RobotsTxt { addEntry(robotsTxt4Host); } } else { - final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT]); + final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents); ArrayList denyPath = parserResult.denyList(); if (((Boolean) 
result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) { denyPath = new ArrayList(); @@ -219,8 +226,8 @@ public class RobotsTxt { private String addEntry(final RobotsEntry entry) { // writes a new page and returns key try { - this.robotsTable.insert(this.robotsTable.encodedKey(entry.hostName), entry.getMem()); - return entry.hostName; + this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem()); + return entry.getHostName(); } catch (final Exception e) { log.warn("cannot write robots.txt entry", e); return null; @@ -255,57 +262,7 @@ public class RobotsTxt { } return port; } - - public MultiProtocolURI getSitemapURL(final MultiProtocolURI theURL) { - if (theURL == null) throw new IllegalArgumentException(); - if (!theURL.getProtocol().startsWith("http")) return null; - MultiProtocolURI sitemapURL = null; - - // generating the hostname:poart string needed to do a DB lookup - RobotsEntry robotsTxt4Host; - try { - robotsTxt4Host = this.getEntry(theURL, true); - } catch (IOException e1) { - return null; - } - - try { - final String sitemapUrlStr = robotsTxt4Host.getSitemap(); - if (sitemapUrlStr != null) sitemapURL = new MultiProtocolURI(sitemapUrlStr); - } catch (final MalformedURLException e) {/* ignore this */} - - return sitemapURL; - } - - public long getCrawlDelayMillis(final MultiProtocolURI theURL) { - if (theURL == null) throw new IllegalArgumentException(); - if (!theURL.getProtocol().startsWith("http")) return 0; - - RobotsEntry robotsEntry; - try { - robotsEntry = getEntry(theURL, true); - } catch (IOException e) { - log.warn("cannot load robots.txt entry", e); - return 0; - } - return robotsEntry.getCrawlDelayMillis(); - } - - public boolean isDisallowed(final MultiProtocolURI nexturl) { - if (nexturl == null) throw new IllegalArgumentException(); - if (!nexturl.getProtocol().startsWith("http")) return false; - - // generating the hostname:port string needed to do a DB lookup - RobotsEntry robotsTxt4Host = null; - try { - robotsTxt4Host = getEntry(nexturl, true); - } catch (IOException e) { - log.warn("cannot load robots.txt entry", e); - return false; - } - return robotsTxt4Host.isDisallowed(nexturl.getFile()); - } - + private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception { if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null; diff --git a/source/de/anomic/crawler/robotsParser.java b/source/de/anomic/crawler/robotsParser.java index 68bbb4129..0e0282ecb 100644 --- a/source/de/anomic/crawler/robotsParser.java +++ b/source/de/anomic/crawler/robotsParser.java @@ -1,29 +1,33 @@ -//robotsParser.java -//------------------------------------- -//part of YACY -// -//(C) 2005, 2006 by Alexander Schier -// Martin Thelian -// -//last change: $LastChangedDate$ by $LastChangedBy$ -//Revision: $LastChangedRevision$ -// -//This program is free software; you can redistribute it and/or modify -//it under the terms of the GNU General Public License as published by -//the Free Software Foundation; either version 2 of the License, or -//(at your option) any later version. -// -//This program is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. 
-// -//You should have received a copy of the GNU General Public License -//along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/* + robotsParser.java + ------------------------------------- + part of YACY + + (C) 2005, 2006 by Alexander Schier + Martin Thelian + + last change: $LastChangedDate$LastChangedBy: orbiter $ + Revision: $LastChangedRevision$ + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// extended to return structured objects instead of a Object[] and -// extended to return a Allow-List by Michael Christen, 21.07.2008 + extended to return structured objects instead of a Object[] and + extended to return a Allow-List by Michael Christen, 21.07.2008 + extended to allow multiple user agents given by definition and + returning the used user agent my Michael Christen 3.4.2011 +*/ package de.anomic.crawler; @@ -33,6 +37,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.net.URLDecoder; import java.util.ArrayList; +import java.util.Set; import java.util.regex.Pattern; /* @@ -65,48 +70,48 @@ public final class robotsParser { public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase(); public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase(); - private ArrayList allowList; - private ArrayList denyList; - private String sitemap; - private long crawlDelayMillis; + private final ArrayList allowList; + private final ArrayList denyList; + private String sitemap; + private long crawlDelayMillis; + private final Set myNames; // a list of own name lists + private String agentName; // the name of the agent that was used to return the result - public robotsParser(final byte[] robotsTxt) { - if ((robotsTxt == null)||(robotsTxt.length == 0)) { - allowList = new ArrayList(0); - denyList = new ArrayList(0); - sitemap = ""; - crawlDelayMillis = 0; - } else { + public robotsParser(final byte[] robotsTxt, final Set myNames) { + this.allowList = new ArrayList(0); + this.denyList = new ArrayList(0); + this.sitemap = ""; + this.crawlDelayMillis = 0; + this.myNames = myNames; + this.agentName = null; + if (robotsTxt != null && robotsTxt.length != 0) { final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt); final BufferedReader reader = new BufferedReader(new InputStreamReader(bin)); parse(reader); } } - public robotsParser(final BufferedReader reader) { - if (reader == null) { - allowList = new ArrayList(0); - denyList = new ArrayList(0); - sitemap = ""; - crawlDelayMillis = 0; - } else { - parse(reader); - } + public robotsParser(final BufferedReader reader, final Set myNames) { + this.allowList = new ArrayList(0); + this.denyList = new ArrayList(0); + this.sitemap = ""; + this.crawlDelayMillis = 0; + this.myNames = myNames; + this.agentName = null; + if (reader != null) parse(reader); } private void parse(final 
BufferedReader reader) { final ArrayList deny4AllAgents = new ArrayList(); - final ArrayList deny4YaCyAgent = new ArrayList(); + final ArrayList deny4ThisAgents = new ArrayList(); final ArrayList allow4AllAgents = new ArrayList(); - final ArrayList allow4YaCyAgent = new ArrayList(); + final ArrayList allow4ThisAgents = new ArrayList(); int pos; String line = null, lineUpper = null; - sitemap = null; - crawlDelayMillis = 0; boolean isRule4AllAgents = false, - isRule4YaCyAgent = false, - rule4YaCyFound = false, + isRule4ThisAgents = false, + rule4ThisAgentsFound = false, inBlock = false; try { @@ -118,7 +123,7 @@ public final class robotsParser { // parse empty line if (line.length() == 0) { // we have reached the end of the rule block - if (rule4YaCyFound) { + if (rule4ThisAgentsFound) { // stop here because other robot blocks are either not for YaCy // or global settings which shall not overwrite YaCys settings. break lineparser; @@ -147,7 +152,7 @@ public final class robotsParser { if (inBlock) { // we have detected the start of a new block - if (rule4YaCyFound) { + if (rule4ThisAgentsFound) { // stop here because other robot blocks are either not for YaCy // or global settings which shall not overwrite YaCys settings. break lineparser; @@ -155,7 +160,7 @@ public final class robotsParser { inBlock = false; isRule4AllAgents = false; - isRule4YaCyAgent = false; + isRule4ThisAgents = false; crawlDelayMillis = 0; // each block has a separate delay } @@ -168,9 +173,14 @@ public final class robotsParser { if (pos != -1) { final String userAgent = line.substring(pos).trim(); isRule4AllAgents |= userAgent.equals("*"); - isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0; - isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0; - if (isRule4YaCyAgent) rule4YaCyFound = true; + for (String agent: this.myNames) { + if (userAgent.toLowerCase().indexOf(agent) >= 0) { + this.agentName = agent; + isRule4ThisAgents = true; + break; + } + } + if (isRule4ThisAgents) rule4ThisAgentsFound = true; } continue lineparser; } @@ -178,7 +188,7 @@ public final class robotsParser { // parse crawl delay if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) { inBlock = true; - if (isRule4YaCyAgent || isRule4AllAgents) { + if (isRule4ThisAgents || isRule4AllAgents) { pos = line.indexOf(' '); if (pos != -1) { try { @@ -197,7 +207,7 @@ public final class robotsParser { inBlock = true; final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW); - if (isRule4YaCyAgent || isRule4AllAgents) { + if (isRule4ThisAgents || isRule4AllAgents) { // cutting off comments at the line end pos = line.indexOf(ROBOTS_COMMENT); if (pos != -1) line = line.substring(0,pos).trim(); @@ -227,10 +237,10 @@ public final class robotsParser { // adding it to the pathlist if (isDisallowRule) { if (isRule4AllAgents) deny4AllAgents.add(path); - if (isRule4YaCyAgent) deny4YaCyAgent.add(path); + if (isRule4ThisAgents) deny4ThisAgents.add(path); } else { if (isRule4AllAgents) allow4AllAgents.add(path); - if (isRule4YaCyAgent) allow4YaCyAgent.add(path); + if (isRule4ThisAgents) allow4ThisAgents.add(path); } } } @@ -239,14 +249,32 @@ public final class robotsParser { } } catch (final IOException e) {} - allowList = (rule4YaCyFound) ? allow4YaCyAgent : allow4AllAgents; - denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents; + allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents); + denyList.addAll(rule4ThisAgentsFound ? 
deny4ThisAgents : deny4AllAgents); } + /** + * a crawl delay can be assigned to every agent or for all agents + * a special case is where the user agent of this yacy peer is given explicitely + * using the peer name and then if the crawl delay is given as '0' the crawler + * does not make any no-DOS-forced crawl pause. + * @return the crawl delay between two crawl access times in milliseconds + */ public long crawlDelayMillis() { return this.crawlDelayMillis; } + /** + * the user agent that was applied to get the crawl properties is recorded + * because it is possible that this robots.txt parser applies to several user agents + * which may be i.e. 'yacy', 'yacybot', '.yacy' or '.yacyh' + * Effects: see also comment to crawlDelayMillis() + * @return the name of the user agent that was used for the result properties or null if no user agent name was used to identify the agent + */ + public String agentName() { + return this.agentName; + } + public String sitemap() { return this.sitemap; } diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 04975cc17..ca326b3e9 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -99,7 +99,7 @@ import de.anomic.server.serverObjects; public final class HTTPDProxyHandler { - public static final String yacyUserAgent = "yacy (" + MultiProtocolURI.systemOST +") yacy.net"; + public static final String yacyUserAgent = "yacyproxy (" + MultiProtocolURI.systemOST +") http://yacy.net/bot.html"; // static variables // can only be instantiated upon first instantiation of this class object diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index ca3e98ff4..359726d41 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -32,9 +32,11 @@ import java.io.PrintWriter; import java.lang.ref.SoftReference; import java.net.InetAddress; import java.util.ArrayList; +import java.util.HashSet; import java.util.Hashtable; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; @@ -95,7 +97,7 @@ public final class yacySeedDB implements AlternativeDomainNames { public PartitionScheme scheme; private yacySeed mySeed; // my own seed - + private Set myBotIDs; // list of id's that this bot accepts as robots.txt identification private final Hashtable nameLookupCache; // a name-to-hash relation private final Hashtable> ipLookupCache; @@ -114,6 +116,9 @@ public final class yacySeedDB implements AlternativeDomainNames { this.seedPotentialDBFile = new File(networkRoot, seedPotentialDBFileName); this.mySeed = null; // my own seed this.myOwnSeedFile = myOwnSeedFile; + this.myBotIDs = new HashSet(); + this.myBotIDs.add("yacy"); + this.myBotIDs.add("yacybot"); this.netRedundancy = redundancy; this.scheme = new VerticalWordPartitionScheme(partitionExponent); @@ -161,13 +166,15 @@ public final class yacySeedDB implements AlternativeDomainNames { this.seedActiveDBFile = new File(newNetworkRoot, seedActiveDBFile.getName()); this.seedPassiveDBFile = new File(newNetworkRoot, seedPassiveDBFile.getName()); this.seedPotentialDBFile = new File(newNetworkRoot, seedPotentialDBFile.getName()); - - // read current peer name - String peername = this.myName(); - + // replace my (old) seed with new seed definition from other network + // but keep the seed name + String peername = this.myName(); this.mySeed 
= null; // my own seed this.myOwnSeedFile = new File(newNetworkRoot, yacySeedDB.DBFILE_OWN_SEED); + initMySeed(); + this.mySeed.setName(peername); + this.netRedundancy = redundancy; this.scheme = new VerticalWordPartitionScheme(partitionExponent); @@ -228,11 +235,16 @@ public final class yacySeedDB implements AlternativeDomainNames { System.exit(-1); } } - + this.myBotIDs.add(this.mySeed.getName() + ".yacy"); + this.myBotIDs.add(this.mySeed.hash + ".yacyh"); mySeed.setIP(""); // we delete the old information to see what we have now mySeed.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN); // markup startup condition } + public Set myBotIDs() { + return this.myBotIDs; + } + public int redundancy() { if (this.mySeed.isJunior()) return 1; return this.netRedundancy; @@ -250,6 +262,12 @@ public final class yacySeedDB implements AlternativeDomainNames { return this.mySeed; } + public void setMyName(String name) { + this.myBotIDs.remove(this.mySeed.getName() + ".yacy"); + this.mySeed.setName(name); + this.myBotIDs.add(name + ".yacy"); + } + public String myAlternativeAddress() { return mySeed().getName() + ".yacy"; }
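
The sketches below are reviewer notes appended after the patch, not part of the diff. Class and method signatures are taken from the hunks above; helper class names, local variables and the generic parameter of the agent set (the diff shows a raw Set, assumed here to be Set<String>) are illustrative. First, the new caller pattern: RobotsTxt.isDisallowed(), getSitemapURL() and getCrawlDelayMillis() are removed, and callers obtain a RobotsEntry via getEntry(url, agentIDs), typically passing sb.peers.myBotIDs(), then query it directly, as getpageinfo_p and CrawlQueues now do.

    import java.io.IOException;
    import java.util.Set;

    import net.yacy.cora.document.MultiProtocolURI;

    import de.anomic.crawler.RobotsEntry;
    import de.anomic.search.Switchboard;

    public class RobotsCheckSketch {

        // true if the URL may be fetched; a missing entry (non-http URL or failed
        // lookup) is treated as "allowed", which is how getpageinfo_p handles it
        public static boolean isAllowed(final MultiProtocolURI url, final Set<String> botIDs) {
            final Switchboard sb = Switchboard.getSwitchboard();
            RobotsEntry robotsEntry;
            try {
                // may trigger a download of robots.txt from the target host
                robotsEntry = sb.robots.getEntry(url, botIDs);
            } catch (final IOException e) {
                robotsEntry = null;
            }
            return robotsEntry == null || !robotsEntry.isDisallowed(url);
        }

        // the sitemap URL is now read from the RobotsEntry as well
        public static MultiProtocolURI sitemapOf(final RobotsEntry robotsEntry) {
            return robotsEntry == null ? null : robotsEntry.getSitemap();
        }
    }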
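
The parser is now driven with a caller-supplied set of agent names instead of the hard-coded "yacy"/"yacybot" checks. A minimal usage sketch, assuming the constructor and accessors shown in the hunk; the robots.txt body and agent names are invented, and the commented outputs are what the patch suggests rather than verified values.

    import java.util.HashSet;
    import java.util.Set;

    import de.anomic.crawler.robotsParser;

    public class RobotsParserSketch {

        public static void main(final String[] args) {
            // a hypothetical robots.txt body; real input comes from the downloaded file
            final byte[] robotsTxt = ("User-agent: yacybot\n" +
                                      "Disallow: /private/\n" +
                                      "Crawl-delay: 10\n").getBytes();

            // the peer's own identifiers; in production this is sb.peers.myBotIDs()
            final Set<String> myNames = new HashSet<String>();
            myNames.add("yacy");
            myNames.add("yacybot");

            final robotsParser parser = new robotsParser(robotsTxt, myNames);
            System.out.println("deny        : " + parser.denyList());         // e.g. [/private/]
            System.out.println("crawl delay : " + parser.crawlDelayMillis()); // delay in milliseconds
            // the agent name that matched the User-agent line; with a HashSet either
            // "yacy" or "yacybot" can win here, since both are substrings of "yacybot"
            System.out.println("agent       : " + parser.agentName());
        }
    }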
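
A simplified model of the waiting-time rule in Latency.waitingRemaining() after this change, mainly to show the new special case: an explicit Crawl-delay of 0 in a robots.txt entry that matched one of our agent names lifts all limits for this peer. The CGI doubling and the host flux term of the real method are omitted, and the final rounding is assumed.

    public class WaitingSketch {

        /**
         * Simplified Latency.waitingRemaining(); robotsDelayMillis is null when no
         * robots.txt entry could be obtained for the host.
         */
        public static long waitingRemaining(final boolean local,
                                            final long minimumLocalDelta,
                                            final long minimumGlobalDelta,
                                            final long timeSinceLastAccess,
                                            final long hostAverageLatency,
                                            final Long robotsDelayMillis) {
            long waiting = local ? minimumLocalDelta : minimumGlobalDelta;

            if (!local && robotsDelayMillis != null) {
                // new in this patch: a Crawl-delay of 0 in a matching entry is read as
                // an exclusive grant for this peer and disables all waiting
                if (robotsDelayMillis.longValue() == 0) return 0;
                waiting = Math.max(waiting, robotsDelayMillis.longValue());
            }

            // never be much faster than the server itself responds
            waiting = Math.max(waiting, local ? hostAverageLatency / 2 : hostAverageLatency * 2);

            // prevent that a robots.txt file can stop the indexer completely
            waiting = Math.min(60000, waiting);

            // subtract the time that has already passed since the last access
            final long remaining = waiting - timeSinceLastAccess;
            return remaining < 0 ? 0 : remaining;
        }
    }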
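
How yacySeedDB composes and maintains the set of bot IDs that robotsParser matches against User-agent lines. The peer name and seed hash below are placeholders; in the patch the generic entries are added in the constructor, the peer-specific ones once the seed is initialized, and setMyName() keeps the set in sync when the peer is renamed via ConfigBasic.

    import java.util.HashSet;
    import java.util.Set;

    public class BotIDsSketch {

        private final Set<String> myBotIDs = new HashSet<String>();
        private String peerName = "Peer0815";           // placeholder for the real peer name
        private final String seedHash = "xxxxxxxxxxxx"; // placeholder for the real seed hash

        public BotIDsSketch() {
            this.myBotIDs.add("yacy");                  // generic agent names, always present
            this.myBotIDs.add("yacybot");
            this.myBotIDs.add(this.peerName + ".yacy"); // peer-specific names, added once the seed is known
            this.myBotIDs.add(this.seedHash + ".yacyh");
        }

        // mirrors yacySeedDB.setMyName(): keep the bot-ID set in sync with a renamed peer
        public void setMyName(final String name) {
            this.myBotIDs.remove(this.peerName + ".yacy");
            this.peerName = name;
            this.myBotIDs.add(name + ".yacy");
        }

        public Set<String> myBotIDs() {
            return this.myBotIDs;
        }
    }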
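
Finally, the wiring: the agent-ID set travels from yacySeedDB through NoticedURL into each Balancer stack, which hands it to Latency for robots.txt lookups. The constructor call matches the one in CrawlQueues; the surrounding helper class and method are illustrative only.

    import java.io.File;

    import de.anomic.crawler.NoticedURL;
    import de.anomic.search.Switchboard;

    public class QueueWiringSketch {

        public static NoticedURL createNoticeQueue(final Switchboard sb, final File queuePath) {
            // NoticedURL forwards the agent-ID set to its four Balancer stacks; each
            // Balancer passes it on to Latency.waitingRemaining() so that the
            // robots.txt lookup happens under the peer's own user-agent names
            return new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
        }
    }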