added option to configure a custom user agent within allip networks

pull/1/head
Michael Peter Christen 11 years ago
parent 20cffa34bf
commit f23c4142e0

@ -1231,3 +1231,17 @@ greedylearning.active = true
# postprocessing steering
postprocessing.maximum_load = 2.5
postprocessing.minimum_ram = 536870912
# Custom user agents for 'allip' networks:
# This user agent is only available if the network is set to 'allip' (which is a non-limited domain 'network'
# without p2p options). Changing this will NOT change the default YaCy user agent, it will only provide an
# agent which is available at crawl start within 'allip'. The userAgent.name is the identifier that is
# matched against robots.txt; YaCy always obeys the rules given there for this name or for the wildcard agent.
# If any part of this custom user agent name or string includes the phrase 'yacy', it will be IGNORED
# to prevent fraud, DoS or bad behavior in the name of YaCy.
# To use this user agent option, you must define completely different names and strings
# and remove the given example here, which will be ignored by default.
crawler.userAgent.name = yacybot
crawler.userAgent.string = yacybot ($$SYSTEM$$) http://yacy.net/bot.html
crawler.userAgent.minimumdelta = 500
crawler.userAgent.clienttimeout = 10000

@ -494,6 +494,7 @@ public class CrawlStartExpert {
agentNames.add(ClientIdentification.googleAgentName);
if (sb.isAllIPMode()) {
agentNames.add(ClientIdentification.browserAgentName);
if (ClientIdentification.getAgent(ClientIdentification.customAgentName) != null) agentNames.add(ClientIdentification.customAgentName);
}
String defaultAgentName = agentNames.get(0);
if (post != null && post.containsKey("agentName")) {

@ -319,8 +319,10 @@ public class Crawler_p {
env.setConfig("storeHTCache", storeHTCache);
String agentName = post.get("agentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
String defaultAgentName = sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName;
String agentName = post.get("agentName", defaultAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
if (agent == null) agent = ClientIdentification.getAgent(defaultAgentName);
CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh"));
if (cachePolicy == null) cachePolicy = CacheStrategy.IFFRESH;

@ -77,6 +77,7 @@ public class ClientIdentification {
public final static Agent browserAgent = new Agent(browserAgents[random.nextInt(browserAgents.length)], new String[]{"Mozilla"}, minimumLocalDeltaInit, clientTimeoutInit);
public final static String yacyProxyAgentName = "YaCyProxy";
public final static Agent yacyProxyAgent = new Agent("yacy - this is a proxy access through YaCy from a browser, not a robot (the yacy bot user agent is 'yacybot')", new String[]{"yacy"}, minimumGlobalDeltaInit, clientTimeoutInit);
// key under which generateCustomBot registers the user-configured agent in the agents map;
// getAgent(customAgentName) yields null as long as no custom agent was registered
public final static String customAgentName = "Custom Agent";
static {
generateYaCyBot("new");
@ -104,6 +105,12 @@ public class ClientIdentification {
agents.put(yacyInternetCrawlerAgentName, yacyInternetCrawlerAgent);
agents.put(yacyIntranetCrawlerAgentName, yacyIntranetCrawlerAgent);
}
/**
 * Register the user-configured custom crawler agent under {@link #customAgentName}.
 * <p>
 * Nothing is registered (and an already registered custom agent is left untouched) when
 * <ul>
 * <li>{@code name} or {@code string} is {@code null} or blank - this is what an unset
 *     configuration delivers, and an agent without identification must not be offered
 *     for crawl starts;</li>
 * <li>{@code name} or {@code string} contains the phrase 'yacy' in any letter case -
 *     custom agents must not act in the name of YaCy.</li>
 * </ul>
 *
 * @param name          the agent name; becomes the only robots identifier of the agent
 * @param string        the user agent string; every occurrence of the placeholder
 *                      {@code $$SYSTEM$$} is replaced with the system description
 * @param minimumdelta  handed to the {@code Agent} as its minimum delta
 *                      (presumably milliseconds between two accesses - confirm against Agent)
 * @param clienttimeout handed to the {@code Agent} as its client timeout
 *                      (presumably milliseconds - confirm against Agent)
 */
public static void generateCustomBot(String name, String string, int minimumdelta, int clienttimeout) {
    // an unset or blank configuration must not produce an agent with an empty identification
    if (name == null || string == null) return;
    if (name.trim().isEmpty() || string.trim().isEmpty()) return;

    // don't allow 'yacy' in custom bot strings
    if (name.toLowerCase().contains("yacy") || string.toLowerCase().contains("yacy")) return;

    // fill in the system placeholder; the "java" -> "O" substitution on yacySystem is kept
    // exactly as it was (NOTE(review): its intent is not visible from here)
    String agentString = string.replace("$$SYSTEM$$", yacySystem.replace("java", "O"));
    agents.put(customAgentName, new Agent(agentString, new String[]{name}, minimumdelta, clienttimeout));
}
/**
* get the default agent

@ -395,13 +395,19 @@ public final class Switchboard extends serverSwitch {
// load the network definition
try {
overwriteNetworkDefinition();
overwriteNetworkDefinition(getSysinfo());
} catch (final FileNotFoundException e) {
ConcurrentLog.logException(e);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
// create custom user agent
ClientIdentification.generateCustomBot(
getConfig(SwitchboardConstants.CRAWLER_USER_AGENT_NAME, ""),
getConfig(SwitchboardConstants.CRAWLER_USER_AGENT_STRING, ""),
(int) getConfigLong(SwitchboardConstants.CRAWLER_USER_AGENT_MINIMUMDELTA, 500),
(int) getConfigLong(SwitchboardConstants.CRAWLER_USER_AGENT_CLIENTTIMEOUT , 1000));
// start indexing management
this.log.config("Starting Indexing Management");
final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
@ -1140,6 +1146,10 @@ public final class Switchboard extends serverSwitch {
this.log.config("Finished Switchboard Initialization");
}
/**
 * Compose the network identification of this peer: the configured network name and the
 * network domain (fallback "global"), joined by "-" in robinson mode and by "/" otherwise.
 * The result is what overwriteNetworkDefinition hands to ClientIdentification.generateYaCyBot.
 *
 * @return the identification in the form name-domain or name/domain
 */
final String getSysinfo() {
    final String networkName = getConfig(SwitchboardConstants.NETWORK_NAME, "");
    final String separator = isRobinsonMode() ? "-" : "/";
    final String networkDomain = getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global");
    return networkName + separator + networkDomain;
}
@Override
public void setHttpServer(YaCyHttpServer server) {
super.setHttpServer(server);
@ -1162,7 +1172,7 @@ public final class Switchboard extends serverSwitch {
+ this.indexingStorageProcessor.getQueueSize();
}
public void overwriteNetworkDefinition() throws FileNotFoundException, IOException {
public void overwriteNetworkDefinition(final String sysinfo) throws FileNotFoundException, IOException {
// load network configuration into settings
String networkUnitDefinition =
@ -1264,9 +1274,7 @@ public final class Switchboard extends serverSwitch {
}
*/
// write the YaCy network identification inside the yacybot client user agent to distinguish networks
ClientIdentification.generateYaCyBot(getConfig(SwitchboardConstants.NETWORK_NAME, "")
+ (isRobinsonMode() ? "-" : "/")
+ getConfig(SwitchboardConstants.NETWORK_DOMAIN, "global"));
ClientIdentification.generateYaCyBot(sysinfo);
}
public void switchNetwork(final String networkDefinition) throws FileNotFoundException, IOException {
@ -1309,7 +1317,7 @@ public final class Switchboard extends serverSwitch {
// new properties
setConfig("network.unit.definition", networkDefinition);
overwriteNetworkDefinition();
overwriteNetworkDefinition(getSysinfo());
final File indexPrimaryPath =
getDataPath(SwitchboardConstants.INDEX_PRIMARY_PATH, SwitchboardConstants.INDEX_PATH_DEFAULT);
final int wordCacheMaxCount =

@ -340,6 +340,10 @@ public final class SwitchboardConstants {
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store
// configuration keys of the custom crawler user agent; the Switchboard reads them at start-up
// and passes the values to ClientIdentification.generateCustomBot
public static final String CRAWLER_USER_AGENT_NAME = "crawler.userAgent.name"; // agent name, used as its robots identifier
public static final String CRAWLER_USER_AGENT_STRING = "crawler.userAgent.string"; // user agent string, may contain the $$SYSTEM$$ placeholder
public static final String CRAWLER_USER_AGENT_MINIMUMDELTA = "crawler.userAgent.minimumdelta"; // minimum delta handed to the agent
public static final String CRAWLER_USER_AGENT_CLIENTTIMEOUT = "crawler.userAgent.clienttimeout"; // client timeout handed to the agent
/**
* debug flags

Loading…
Cancel
Save