enabling all crawl profiles in all network modes

also: increased the default internet crawl speed to
4 URLs/s per host
pull/402/head
Michael Peter Christen 4 years ago
parent 32ca669bfb
commit d0abb0cedb

@ -576,15 +576,15 @@
</dd>
</dl>
</fieldset>
#(agentSelect)#<input type="hidden" name="agentName" id="agentName" value="#[defaultAgentName]#" />::
<fieldset>
<legend>Robot Behaviour</legend>
<dl>
<dt><label>Use Special User Agent and robot identification</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances
(like the GSA) the user must be able to crawl all web pages that are granted to such commercial plattforms.
Because YaCy can be used as replacement for commercial search appliances
(like the Google Search Appliance aka GSA) the user must be able to crawl all web pages that are granted to such commercial platforms.
Not having this option would be a strong handicap for professional usage of this software. Therefore you are able to select
alternative user agents here which have different crawl timings and also identify themselves with another user agent and obey the corresponding robots rules.
</span></span>
@ -596,7 +596,6 @@
</dd>
</dl>
</fieldset>
#(/agentSelect)#
#(vocabularySelect)#::
<fieldset>

@ -145,10 +145,8 @@ public class CrawlStartExpert {
prop.put("bookmarkTitle", "");
}
// ---------- Crawling filter
final int crawlingDomMaxPages = env.getConfigInt(
"crawlingDomMaxPages", -1);
final int crawlingDomMaxPages = env.getConfigInt("crawlingDomMaxPages", -1);
// crawling depth
if (post != null && post.containsKey("crawlingDepth")) {
@ -434,7 +432,6 @@ public class CrawlStartExpert {
prop.put("deleteIfOlderUnitSelect_list_2_default", 1);
}
// clean up search events cache ?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
@ -565,10 +562,6 @@ public class CrawlStartExpert {
}
// ---------- Agent name
if (sb.isP2PMode()) {
prop.put("agentSelect", 0);
} else {
prop.put("agentSelect", 1);
List<String> agentNames = new ArrayList<String>();
if (sb.isIntranetMode()) {
agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
@ -587,13 +580,11 @@ public class CrawlStartExpert {
if (agentNames.contains(agentName)) defaultAgentName = agentName;
}
for (int i = 0; i < agentNames.size(); i++) {
prop.put("agentSelect_list_" + i + "_name", agentNames.get(i));
prop.put("agentSelect_list_" + i + "_default", agentNames.get(i).equals(defaultAgentName) ? 1 : 0);
}
prop.put("agentSelect_list", agentNames.size());
prop.put("list_" + i + "_name", agentNames.get(i));
prop.put("list_" + i + "_default", agentNames.get(i).equals(defaultAgentName) ? 1 : 0);
}
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
prop.put("list", agentNames.size());
prop.put("defaultAgentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Ignore Class Name
if (post != null && post.containsKey("ignoreclassname")) {

@ -33,7 +33,7 @@ public class ClientIdentification {
public static final int clientTimeoutInit = 10000;
public static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
public static final int minimumGlobalDeltaInit = 250; // the minimum time difference between access of the same global domain
public static class Agent {
public final String userAgent; // the name that is send in http request to identify the agent

Loading…
Cancel
Save