Added handling of YaCy bot entries in robots.txt: if such an entry addresses the YaCy peer
(directly or indirectly) and grants a crawl-delay of 0, then all forced pause mechanisms in YaCy are switched off and the domain is crawled at full speed.

Crawl-delay values can be assigned to either
- all YaCy peers, using the user-agent yacybot,
- a specific peer with peer name <peer-name>.yacy, or
- a specific peer with peer hash <peer-hash>.yacyh
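
For illustration, a hypothetical robots.txt that grants such an exception (the peer name "myPeer" is made up). Since the parser stops after the first rule block that matches one of its own agent names, the peer-specific block is placed before the generic yacybot block:

# let the YaCy peer named "myPeer" crawl at full speed
User-agent: myPeer.yacy
Crawl-delay: 0

# throttle all other YaCy peers
User-agent: yacybot
Crawl-delay: 10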


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7639 6c8d7289-2bf4-0310-a012-ef5d649a1542
branch pull/1/head
orbiter committed 14 years ago
parent 21fe5e6c6a
commit b2fe4b7b1a

@@ -106,7 +106,7 @@ public class ConfigBasic {
if (oldSeed == null &&
!peerName.equals(sb.peers.mySeed().getName()) &&
Pattern.compile("[A-Za-z0-9\\-_]{3,80}").matcher(peerName).matches()) {
sb.peers.mySeed().setName(peerName);
sb.peers.setMyName(peerName);
sb.peers.saveMySeed();
}

@@ -9,6 +9,7 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.RobotsEntry;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -105,11 +106,17 @@ public class getpageinfo_p {
final DigestURI theURL = new DigestURI(url);
// determine if crawling of the current URL is allowed
prop.put("robots-allowed", sb.robots.isDisallowed(theURL) ? "0" : "1");
RobotsEntry robotsEntry;
try {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (IOException e) {
robotsEntry = null;
}
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
// get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = sb.robots.getSitemapURL(theURL);
prop.putXML("sitemap", (sitemapURL==null) ? "" : sitemapURL.toString());
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
} catch (final MalformedURLException e) {}
}

@@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
@@ -73,12 +74,14 @@ public class Balancer {
private long minimumGlobalDelta;
private long lastDomainStackFill;
private int domStackInitSize;
private Set<String> myAgentIDs;
public Balancer(
final File cachePath,
final String stackname,
final long minimumLocalDelta,
final long minimumGlobalDelta,
final Set<String> myAgentIDs,
final boolean useTailCache,
final boolean exceed134217727) {
this.cacheStacksPath = cachePath;
@@ -87,6 +90,7 @@ public class Balancer {
this.delayed = new TreeMap<Long, byte[]>();
this.minimumLocalDelta = minimumLocalDelta;
this.minimumGlobalDelta = minimumGlobalDelta;
this.myAgentIDs = myAgentIDs;
this.domStackInitSize = Integer.MAX_VALUE;
this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
this.double_push_check = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
@@ -411,7 +415,7 @@ public class Balancer {
sleeptime = (
profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), myAgentIDs, minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + UTF8.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + UTF8.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + UTF8.String(nexthash) + ", crawlEntry.url().hash() = " + UTF8.String(crawlEntry.url().hash());
@@ -450,7 +454,7 @@ public class Balancer {
// in the best case, this should never happen if the balancer works properly
// this is only a protection against the worst case, where the crawler could
// behave in a DoS-manner
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), myAgentIDs, minimumLocalDelta, minimumGlobalDelta) + ", top.size() = " + top.size() + ", delayed.size() = " + delayed.size() + ", domainStacks.size() = " + domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
long loops = sleeptime / 1000;
long rest = sleeptime % 1000;
if (loops < 3) {

@@ -78,7 +78,7 @@ public class CrawlQueues {
// start crawling management
log.logConfig("Starting Crawling Management");
noticeURL = new NoticedURL(queuePath, sb.useTailCache, sb.exceed134217727);
noticeURL = new NoticedURL(queuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
errorURL = new ZURL(queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
@@ -90,7 +90,7 @@ public class CrawlQueues {
this.workers = new ConcurrentHashMap<Integer, Loader>();
this.remoteCrawlProviderHashes.clear();
noticeURL = new NoticedURL(newQueuePath, sb.useTailCache, sb.exceed134217727);
noticeURL = new NoticedURL(newQueuePath, sb.peers.myBotIDs(), sb.useTailCache, sb.exceed134217727);
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
errorURL = new ZURL(newQueuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
delegatedURL = new ZURL(newQueuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
@@ -571,7 +571,10 @@ public class CrawlQueues {
try {
// checking robots.txt for http(s) resources
this.request.setStatus("worker-checkingrobots", WorkflowJob.STATUS_STARTED);
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) {
RobotsEntry robotsEntry;
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) &&
(robotsEntry = sb.robots.getEntry(request.url(), sb.peers.myBotIDs())) != null &&
robotsEntry.isDisallowed(request.url())) {
//if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
errorURL.push(
this.request,

@@ -23,8 +23,10 @@
package de.anomic.crawler;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.MultiProtocolURI;
@@ -159,7 +161,7 @@ public class Latency {
* @param minimumGlobalDelta
* @return the remaining waiting time in milliseconds
*/
public static long waitingRemaining(MultiProtocolURI url, final long minimumLocalDelta, final long minimumGlobalDelta) {
public static long waitingRemaining(MultiProtocolURI url, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
Host host = host(url);
@@ -171,7 +173,7 @@ public class Latency {
long waiting = (local) ? minimumLocalDelta : minimumGlobalDelta;
// the time since last access to the domain is the basis of the remaining calculation
final long timeSinceLastAccess = (host == null) ? 0 : System.currentTimeMillis() - host.lastacc();
final long timeSinceLastAccess = System.currentTimeMillis() - host.lastacc();
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
@@ -182,13 +184,23 @@ public class Latency {
if (!local && host != null) waiting += host.flux(waiting);
// find the delay as given by robots.txt on target site
long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
long robotsDelay = 0;
if (!local) {
RobotsEntry robotsEntry;
try {
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
} catch (IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0) return 0; // no limits if granted exclusively for this peer
}
waiting = Math.max(waiting, robotsDelay);
// use the access latency as a rule for how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
if (host != null) waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
waiting = Math.max(waiting, (local) ? host.average() / 2 : host.average() * 2);
// prevent that a robots file can stop our indexer completely
waiting = Math.min(60000, waiting);
@@ -199,7 +211,7 @@ public class Latency {
}
public static String waitingRemainingExplain(MultiProtocolURI url, final long minimumLocalDelta, final long minimumGlobalDelta) {
public static String waitingRemainingExplain(MultiProtocolURI url, final Set<String> thisAgents, final long minimumLocalDelta, final long minimumGlobalDelta) {
// first check if the domain was _ever_ accessed before
Host host = host(url);
@@ -225,7 +237,17 @@ public class Latency {
if (!local && host != null) s.append(", flux = ").append(host.flux(waiting));
// find the delay as given by robots.txt on target site
long robotsDelay = (local) ? 0 : Switchboard.getSwitchboard().robots.getCrawlDelayMillis(url);
long robotsDelay = 0;
if (!local) {
RobotsEntry robotsEntry;
try {
robotsEntry = Switchboard.getSwitchboard().robots.getEntry(url, thisAgents);
} catch (IOException e) {
robotsEntry = null;
}
robotsDelay = (robotsEntry == null) ? 0 : robotsEntry.getCrawlDelayMillis();
if (robotsEntry != null && robotsDelay == 0) return "no waiting for exclusive granted peer"; // no limits if granted exclusively for this peer
}
s.append(", robots.delay = ").append(robotsDelay);
// use the access latency as a rule for how fast we can access the server
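
Condensed, the new delay resolution in waitingRemaining amounts to the sketch below. This is a paraphrase of the hunks above for the remote (non-local) case, with the CGI and flux adjustments left out; it is a sketch, not the actual method:

// minimal sketch of the delay precedence, assuming all values are already looked up
public final class LatencySketch {
    public static long waitingRemaining(boolean robotsEntryPresent, long robotsDelayMillis,
                                        long minimumGlobalDelta, long hostAverage,
                                        long timeSinceLastAccess) {
        // a robots.txt entry addressing this peer with crawl-delay 0 lifts all forced pauses
        if (robotsEntryPresent && robotsDelayMillis == 0) return 0;
        long waiting = minimumGlobalDelta;              // configured minimum for remote hosts
        waiting = Math.max(waiting, robotsDelayMillis); // respect the robots.txt crawl-delay
        waiting = Math.max(waiting, hostAverage * 2);   // adapt to the observed server latency
        waiting = Math.min(60000, waiting);             // a robots file must not stall the indexer
        return waiting - timeSinceLastAccess;           // may be <= 0, i.e. no further waiting needed
    }
}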

@@ -32,6 +32,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
@@ -56,14 +57,15 @@ public class NoticedURL {
public NoticedURL(
final File cachePath,
final Set<String> myAgentIDs,
final boolean useTailCache,
final boolean exceed134217727) {
Log.logInfo("NoticedURL", "CREATING STACKS at " + cachePath.toString());
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.coreStack = new Balancer(cachePath, "urlNoticeCoreStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.limitStack = new Balancer(cachePath, "urlNoticeLimitStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, useTailCache, exceed134217727);
this.remoteStack = new Balancer(cachePath, "urlNoticeRemoteStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
this.noloadStack = new Balancer(cachePath, "urlNoticeNoLoadStack", minimumLocalDeltaInit, minimumGlobalDeltaInit, myAgentIDs, useTailCache, exceed134217727);
}
public long getMinimumLocalDelta() {

@@ -28,6 +28,7 @@
package de.anomic.crawler;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedHashMap;
@@ -53,9 +54,9 @@ public class RobotsEntry {
public static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
// this is a simple record structure that holds all properties of a single crawl start
private Map<String, byte[]> mem;
private List<String> allowPathList, denyPathList;
String hostName;
private final Map<String, byte[]> mem;
private final List<String> allowPathList, denyPathList;
private final String hostName;
public RobotsEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase();
@@ -134,6 +135,10 @@ public class RobotsEntry {
}
}
public String getHostName() {
return this.hostName;
}
public Map<String, byte[]> getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, this.hostName.getBytes());
return this.mem;
@@ -147,8 +152,18 @@ public class RobotsEntry {
return str.toString();
}
public String getSitemap() {
return this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
/**
* get the sitemap url
* @return the sitemap url or null if no sitemap url is given
*/
public MultiProtocolURI getSitemap() {
String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
if (url == null) return null;
try {
return new MultiProtocolURI(url);
} catch (MalformedURLException e) {
return null;
}
}
public Date getLoadedDate() {
@@ -192,7 +207,8 @@ public class RobotsEntry {
return 0;
}
public boolean isDisallowed(String path) {
public boolean isDisallowed(MultiProtocolURI subpathURL) {
String path = subpathURL.getFile();
if ((this.mem == null) || (this.denyPathList.isEmpty())) return false;
// if the path is null or empty we set it to /

@@ -31,6 +31,7 @@ import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
@@ -76,8 +77,14 @@ public class RobotsTxt {
return this.robotsTable.size();
}
private RobotsEntry getEntry(final MultiProtocolURI theURL, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
// this method will always return a non-null value
public RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents) throws IOException {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
return getEntry(theURL, thisAgents, true);
}
private RobotsEntry getEntry(final MultiProtocolURI theURL, final Set<String> thisAgents, final boolean fetchOnlineIfNotAvailableOrNotFresh) throws IOException {
// this method will always return a non-null value
String urlHostPort = getHostPort(theURL);
RobotsEntry robotsTxt4Host = null;
Map<String, byte[]> record;
@@ -174,7 +181,7 @@ public class RobotsTxt {
addEntry(robotsTxt4Host);
}
} else {
final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT]);
final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT], thisAgents);
ArrayList<String> denyPath = parserResult.denyList();
if (((Boolean) result[DOWNLOAD_ACCESS_RESTRICTED]).booleanValue()) {
denyPath = new ArrayList<String>();
@@ -219,8 +226,8 @@ public class RobotsTxt {
private String addEntry(final RobotsEntry entry) {
// writes a new page and returns key
try {
this.robotsTable.insert(this.robotsTable.encodedKey(entry.hostName), entry.getMem());
return entry.hostName;
this.robotsTable.insert(this.robotsTable.encodedKey(entry.getHostName()), entry.getMem());
return entry.getHostName();
} catch (final Exception e) {
log.warn("cannot write robots.txt entry", e);
return null;
@@ -255,57 +262,7 @@ public class RobotsTxt {
}
return port;
}
public MultiProtocolURI getSitemapURL(final MultiProtocolURI theURL) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return null;
MultiProtocolURI sitemapURL = null;
// generating the hostname:port string needed to do a DB lookup
RobotsEntry robotsTxt4Host;
try {
robotsTxt4Host = this.getEntry(theURL, true);
} catch (IOException e1) {
return null;
}
try {
final String sitemapUrlStr = robotsTxt4Host.getSitemap();
if (sitemapUrlStr != null) sitemapURL = new MultiProtocolURI(sitemapUrlStr);
} catch (final MalformedURLException e) {/* ignore this */}
return sitemapURL;
}
public long getCrawlDelayMillis(final MultiProtocolURI theURL) {
if (theURL == null) throw new IllegalArgumentException();
if (!theURL.getProtocol().startsWith("http")) return 0;
RobotsEntry robotsEntry;
try {
robotsEntry = getEntry(theURL, true);
} catch (IOException e) {
log.warn("cannot load robots.txt entry", e);
return 0;
}
return robotsEntry.getCrawlDelayMillis();
}
public boolean isDisallowed(final MultiProtocolURI nexturl) {
if (nexturl == null) throw new IllegalArgumentException();
if (!nexturl.getProtocol().startsWith("http")) return false;
// generating the hostname:port string needed to do a DB lookup
RobotsEntry robotsTxt4Host = null;
try {
robotsTxt4Host = getEntry(nexturl, true);
} catch (IOException e) {
log.warn("cannot load robots.txt entry", e);
return false;
}
return robotsTxt4Host.isDisallowed(nexturl.getFile());
}
private static Object[] downloadRobotsTxt(final MultiProtocolURI robotsURL, int redirectionCount, final RobotsEntry entry) throws Exception {
if (robotsURL == null || !robotsURL.getProtocol().startsWith("http")) return null;

@@ -1,29 +1,33 @@
//robotsParser.java
//-------------------------------------
//part of YACY
//
//(C) 2005, 2006 by Alexander Schier
// Martin Thelian
//
//last change: $LastChangedDate$ by $LastChangedBy$
//Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/*
robotsParser.java
-------------------------------------
part of YACY
(C) 2005, 2006 by Alexander Schier
Martin Thelian
last change: $LastChangedDate$LastChangedBy: orbiter $
Revision: $LastChangedRevision$
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// extended to return structured objects instead of a Object[] and
// extended to return a Allow-List by Michael Christen, 21.07.2008
extended to return structured objects instead of an Object[] and
extended to return an Allow-List by Michael Christen, 21.07.2008
extended to allow multiple user agents given by definition and
returning the used user agent by Michael Christen, 3.4.2011
*/
package de.anomic.crawler;
@@ -33,6 +37,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Pattern;
/*
@@ -65,48 +70,48 @@ public final class robotsParser {
public static final String ROBOTS_SITEMAP = "Sitemap:".toUpperCase();
public static final String ROBOTS_CRAWL_DELAY = "Crawl-delay:".toUpperCase();
private ArrayList<String> allowList;
private ArrayList<String> denyList;
private String sitemap;
private long crawlDelayMillis;
private final ArrayList<String> allowList;
private final ArrayList<String> denyList;
private String sitemap;
private long crawlDelayMillis;
private final Set<String> myNames; // a list of own name lists
private String agentName; // the name of the agent that was used to return the result
public robotsParser(final byte[] robotsTxt) {
if ((robotsTxt == null)||(robotsTxt.length == 0)) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelayMillis = 0;
} else {
public robotsParser(final byte[] robotsTxt, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
this.sitemap = "";
this.crawlDelayMillis = 0;
this.myNames = myNames;
this.agentName = null;
if (robotsTxt != null && robotsTxt.length != 0) {
final ByteArrayInputStream bin = new ByteArrayInputStream(robotsTxt);
final BufferedReader reader = new BufferedReader(new InputStreamReader(bin));
parse(reader);
}
}
public robotsParser(final BufferedReader reader) {
if (reader == null) {
allowList = new ArrayList<String>(0);
denyList = new ArrayList<String>(0);
sitemap = "";
crawlDelayMillis = 0;
} else {
parse(reader);
}
public robotsParser(final BufferedReader reader, final Set<String> myNames) {
this.allowList = new ArrayList<String>(0);
this.denyList = new ArrayList<String>(0);
this.sitemap = "";
this.crawlDelayMillis = 0;
this.myNames = myNames;
this.agentName = null;
if (reader != null) parse(reader);
}
private void parse(final BufferedReader reader) {
final ArrayList<String> deny4AllAgents = new ArrayList<String>();
final ArrayList<String> deny4YaCyAgent = new ArrayList<String>();
final ArrayList<String> deny4ThisAgents = new ArrayList<String>();
final ArrayList<String> allow4AllAgents = new ArrayList<String>();
final ArrayList<String> allow4YaCyAgent = new ArrayList<String>();
final ArrayList<String> allow4ThisAgents = new ArrayList<String>();
int pos;
String line = null, lineUpper = null;
sitemap = null;
crawlDelayMillis = 0;
boolean isRule4AllAgents = false,
isRule4YaCyAgent = false,
rule4YaCyFound = false,
isRule4ThisAgents = false,
rule4ThisAgentsFound = false,
inBlock = false;
try {
@@ -118,7 +123,7 @@ public final class robotsParser {
// parse empty line
if (line.length() == 0) {
// we have reached the end of the rule block
if (rule4YaCyFound) {
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
@@ -147,7 +152,7 @@ public final class robotsParser {
if (inBlock) {
// we have detected the start of a new block
if (rule4YaCyFound) {
if (rule4ThisAgentsFound) {
// stop here because other robot blocks are either not for YaCy
// or global settings which shall not overwrite YaCy's settings.
break lineparser;
@@ -155,7 +160,7 @@ public final class robotsParser {
inBlock = false;
isRule4AllAgents = false;
isRule4YaCyAgent = false;
isRule4ThisAgents = false;
crawlDelayMillis = 0; // each block has a separate delay
}
@@ -168,9 +173,14 @@ public final class robotsParser {
if (pos != -1) {
final String userAgent = line.substring(pos).trim();
isRule4AllAgents |= userAgent.equals("*");
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacy") >=0;
isRule4YaCyAgent |= userAgent.toLowerCase().indexOf("yacybot") >=0;
if (isRule4YaCyAgent) rule4YaCyFound = true;
for (String agent: this.myNames) {
if (userAgent.toLowerCase().indexOf(agent) >= 0) {
this.agentName = agent;
isRule4ThisAgents = true;
break;
}
}
if (isRule4ThisAgents) rule4ThisAgentsFound = true;
}
continue lineparser;
}
@@ -178,7 +188,7 @@ public final class robotsParser {
// parse crawl delay
if (lineUpper.startsWith(ROBOTS_CRAWL_DELAY)) {
inBlock = true;
if (isRule4YaCyAgent || isRule4AllAgents) {
if (isRule4ThisAgents || isRule4AllAgents) {
pos = line.indexOf(' ');
if (pos != -1) {
try {
@@ -197,7 +207,7 @@ public final class robotsParser {
inBlock = true;
final boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
if (isRule4YaCyAgent || isRule4AllAgents) {
if (isRule4ThisAgents || isRule4AllAgents) {
// cutting off comments at the line end
pos = line.indexOf(ROBOTS_COMMENT);
if (pos != -1) line = line.substring(0,pos).trim();
@@ -227,10 +237,10 @@ public final class robotsParser {
// adding it to the pathlist
if (isDisallowRule) {
if (isRule4AllAgents) deny4AllAgents.add(path);
if (isRule4YaCyAgent) deny4YaCyAgent.add(path);
if (isRule4ThisAgents) deny4ThisAgents.add(path);
} else {
if (isRule4AllAgents) allow4AllAgents.add(path);
if (isRule4YaCyAgent) allow4YaCyAgent.add(path);
if (isRule4ThisAgents) allow4ThisAgents.add(path);
}
}
}
@@ -239,14 +249,32 @@ public final class robotsParser {
}
} catch (final IOException e) {}
allowList = (rule4YaCyFound) ? allow4YaCyAgent : allow4AllAgents;
denyList = (rule4YaCyFound) ? deny4YaCyAgent : deny4AllAgents;
allowList.addAll(rule4ThisAgentsFound ? allow4ThisAgents : allow4AllAgents);
denyList.addAll(rule4ThisAgentsFound ? deny4ThisAgents : deny4AllAgents);
}
/**
* a crawl delay can be assigned to a specific agent or to all agents;
* a special case is when the user agent of this yacy peer is given explicitly
* using the peer name: if the crawl delay is then given as '0', the crawler
* does not force any anti-DoS crawl pause.
* @return the crawl delay between two crawl access times in milliseconds
*/
public long crawlDelayMillis() {
return this.crawlDelayMillis;
}
/**
* the user agent that was applied to get the crawl properties is recorded,
* because this robots.txt parser may apply to several user agents,
* e.g. 'yacy', 'yacybot', '<peer-name>.yacy' or '<peer-hash>.yacyh'
* Effects: see also the comment to crawlDelayMillis()
* @return the name of the user agent that was used for the result properties, or null if no user agent name was used to identify the agent
*/
public String agentName() {
return this.agentName;
}
public String sitemap() {
return this.sitemap;
}
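
A minimal usage sketch of the extended parser. The agent set is built by hand here; in YaCy it comes from yacySeedDB.myBotIDs(), and the demo class and the peer name "mypeer" are made up:

import java.util.HashSet;
import java.util.Set;
import de.anomic.crawler.robotsParser;

public final class RobotsParserDemo {
    public static void main(String[] args) {
        // a robots.txt that addresses one peer directly and grants full speed
        byte[] robotsTxt = "User-agent: mypeer.yacy\nCrawl-delay: 0\n".getBytes();
        Set<String> myNames = new HashSet<String>();
        myNames.add("yacybot");
        myNames.add("mypeer.yacy"); // names are matched against the lower-cased user-agent line
        robotsParser parser = new robotsParser(robotsTxt, myNames);
        System.out.println(parser.crawlDelayMillis()); // should print 0: full speed granted
        System.out.println(parser.agentName());        // the agent name that matched the block
    }
}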

@@ -99,7 +99,7 @@ import de.anomic.server.serverObjects;
public final class HTTPDProxyHandler {
public static final String yacyUserAgent = "yacy (" + MultiProtocolURI.systemOST +") yacy.net";
public static final String yacyUserAgent = "yacyproxy (" + MultiProtocolURI.systemOST +") http://yacy.net/bot.html";
// static variables
// can only be instantiated upon first instantiation of this class object

@@ -32,9 +32,11 @@ import java.io.PrintWriter;
import java.lang.ref.SoftReference;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
@@ -95,7 +97,7 @@ public final class yacySeedDB implements AlternativeDomainNames {
public PartitionScheme scheme;
private yacySeed mySeed; // my own seed
private Set<String> myBotIDs; // list of id's that this bot accepts as robots.txt identification
private final Hashtable<String, String> nameLookupCache; // a name-to-hash relation
private final Hashtable<InetAddress, SoftReference<yacySeed>> ipLookupCache;
@@ -114,6 +116,9 @@ public final class yacySeedDB implements AlternativeDomainNames {
this.seedPotentialDBFile = new File(networkRoot, seedPotentialDBFileName);
this.mySeed = null; // my own seed
this.myOwnSeedFile = myOwnSeedFile;
this.myBotIDs = new HashSet<String>();
this.myBotIDs.add("yacy");
this.myBotIDs.add("yacybot");
this.netRedundancy = redundancy;
this.scheme = new VerticalWordPartitionScheme(partitionExponent);
@@ -161,13 +166,15 @@ public final class yacySeedDB implements AlternativeDomainNames {
this.seedActiveDBFile = new File(newNetworkRoot, seedActiveDBFile.getName());
this.seedPassiveDBFile = new File(newNetworkRoot, seedPassiveDBFile.getName());
this.seedPotentialDBFile = new File(newNetworkRoot, seedPotentialDBFile.getName());
// read current peer name
String peername = this.myName();
// replace my (old) seed with new seed definition from other network
// but keep the seed name
String peername = this.myName();
this.mySeed = null; // my own seed
this.myOwnSeedFile = new File(newNetworkRoot, yacySeedDB.DBFILE_OWN_SEED);
initMySeed();
this.mySeed.setName(peername);
this.netRedundancy = redundancy;
this.scheme = new VerticalWordPartitionScheme(partitionExponent);
@@ -228,11 +235,16 @@ public final class yacySeedDB implements AlternativeDomainNames {
System.exit(-1);
}
}
this.myBotIDs.add(this.mySeed.getName() + ".yacy");
this.myBotIDs.add(this.mySeed.hash + ".yacyh");
mySeed.setIP(""); // we delete the old information to see what we have now
mySeed.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN); // markup startup condition
}
public Set<String> myBotIDs() {
return this.myBotIDs;
}
public int redundancy() {
if (this.mySeed.isJunior()) return 1;
return this.netRedundancy;
@@ -250,6 +262,12 @@ public final class yacySeedDB implements AlternativeDomainNames {
return this.mySeed;
}
public void setMyName(String name) {
this.myBotIDs.remove(this.mySeed.getName() + ".yacy");
this.mySeed.setName(name);
this.myBotIDs.add(name + ".yacy");
}
public String myAlternativeAddress() {
return mySeed().getName() + ".yacy";
}
