Merge pull request #40 from Scarfmonster/autocrawl

Automatic crawling
pull/41/merge
Michael Peter Christen 9 years ago
commit 5d635879f8

@ -538,6 +538,17 @@ proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1
proxyURL.rewriteURLs=domainlist proxyURL.rewriteURLs=domainlist
proxyURL.useforresults=false proxyURL.useforresults=false
# Autocrawl configuration
autocrawl=false
autocrawl.index.text=true
autocrawl.index.media=true
autocrawl.ratio=50
autocrawl.rows=100
autocrawl.days=1
autocrawl.query=*:*
autocrawl.deep.depth=3
autocrawl.shallow.depth=1
# From the 'IndexCreate' menu point you can also define a crawling start point. # From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to # The crawling works the same way as the prefetch, but it is possible to
# assign a different crawling depth. # assign a different crawling depth.
@ -627,6 +638,10 @@ collection=user
50_localcrawl_memprereq=25165824 50_localcrawl_memprereq=25165824
50_localcrawl_loadprereq=6.0 50_localcrawl_loadprereq=6.0
50_localcrawl_isPaused=false 50_localcrawl_isPaused=false
55_autocrawl_idlesleep=10000
55_autocrawl_busysleep=10000
55_autocrawl_memprereq=25165824
55_autocrawl_loadprereq=6.0
60_remotecrawlloader_idlesleep=4000 60_remotecrawlloader_idlesleep=4000
60_remotecrawlloader_busysleep=800 60_remotecrawlloader_busysleep=800
60_remotecrawlloader_memprereq=12582912 60_remotecrawlloader_memprereq=12582912

@ -0,0 +1,50 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
<head>
<title>YaCy '#[clientname]#': Crawl Start</title>
#%env/templates/metas.template%#
</head>
<body id="Autocrawl">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Autocrawler</h2>
<p>Autocrawler automatically selects and adds tasks to the local crawl queue. This will work best when there are already quite a few domains in the index.</p>
<fieldset>
<legend>Autocrawler Configuration</legend>
<!-- Posts back to this page; the field names below are the keys read by the Autocrawl_p servlet. -->
<form id="ConfigForm" method="post" action="Autocrawl_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<dl>
#(changed)#::<dt></dt><dd><span class="error">You need to restart for some settings to be applied</span></dd>#(/changed)#
<dt>Enable Autocrawler:</dt>
<dd><input id="autocrawlEnable" name="autocrawlEnable" type="checkbox" #(autocrawlEnable)#::checked="checked"#(/autocrawlEnable)# /></dd>
<dt>Deep crawl every:</dt>
<dd>
<input id="autocrawlRatio" name="autocrawlRatio" type="number" min="1" max="500" step="1" size="3" maxlength="3" value="#[autocrawlRatio]#" />
Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.
</dd>
<dt>Rows to fetch at once:</dt>
<dd><input id="autocrawlRows" name="autocrawlRows" type="number" min="1" max="500" step="1" size="3" maxlength="3" value="#[autocrawlRows]#" /></dd>
<dt>Recrawl only older than # days:</dt>
<!-- The servlet clamps this value to 1..60, so the form advertises the same range. -->
<dd><input id="autocrawlDays" name="autocrawlDays" type="number" min="1" max="60" step="1" size="3" maxlength="2" value="#[autocrawlDays]#" /></dd>
<dt>Get hosts by query:</dt>
<dd>
<input id="autocrawlQuery" name="autocrawlQuery" type="text" size="50" value="#[autocrawlQuery]#" />
Can be any valid Solr query.
</dd>
<dt>Shallow crawl depth (0 to 2):</dt>
<dd><input id="autocrawlShallow" name="autocrawlShallow" type="number" min="0" max="2" step="1" size="1" maxlength="1" value="#[autocrawlShallow]#" /></dd>
<dt>Deep crawl depth (1 to 5):</dt>
<dd><input id="autocrawlDeep" name="autocrawlDeep" type="number" min="1" max="5" step="1" size="1" maxlength="1" value="#[autocrawlDeep]#" /></dd>
<dt>Index text:</dt>
<dd><input id="autocrawlText" name="autocrawlText" type="checkbox" #(autocrawlText)#::checked="checked"#(/autocrawlText)# /></dd>
<dt>Index media:</dt>
<dd><input id="autocrawlMedia" name="autocrawlMedia" type="checkbox" #(autocrawlMedia)#::checked="checked"#(/autocrawlMedia)# /></dd>
<dt><input type="submit" name="save" class="btn btn-primary" value="Save" /></dt>
</dl>
</form>
</fieldset>
</body>
</html>

@ -0,0 +1,102 @@
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
 * Servlet behind Autocrawl_p.html: shows the current autocrawl settings and,
 * when the form is posted, validates and stores the submitted values.
 */
public class Autocrawl_p {

    // Accepted ranges for the numeric settings; out-of-range values are clamped, not rejected.
    private static final int RATIO_MIN = 1;
    private static final int RATIO_MAX = 500;
    private static final int ROWS_MIN = 1;
    private static final int ROWS_MAX = 500;
    private static final int DAYS_MIN = 1;
    private static final int DAYS_MAX = 60;
    private static final int SHALLOW_MIN = 0;
    private static final int SHALLOW_MAX = 2;
    private static final int DEEP_MIN = 1;
    private static final int DEEP_MAX = 5;

    /**
     * Reads the autocrawl configuration, overlays the posted form values (if any),
     * clamps the numeric values to their accepted ranges and fills the template properties.
     * When {@code post} is not null the resulting values are written back to the
     * configuration and {@code Switchboard.initAutocrawl} is called with the enable flag.
     *
     * @param header request header, not used
     * @param post   posted form fields, or null when the page is only displayed
     * @param env    the server switch; cast to {@link Switchboard}
     * @return the properties used to render Autocrawl_p.html
     */
    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        // start from the stored configuration
        boolean autocrawlEnable = sb.getConfigBool(SwitchboardConstants.AUTOCRAWL, false);
        int autocrawlRatio = configInt(sb, SwitchboardConstants.AUTOCRAWL_RATIO, 50);
        int autocrawlRows = configInt(sb, SwitchboardConstants.AUTOCRAWL_ROWS, 100);
        int autocrawlDays = configInt(sb, SwitchboardConstants.AUTOCRAWL_DAYS, 1);
        String autocrawlQuery = sb.getConfig(SwitchboardConstants.AUTOCRAWL_QUERY, "*:*");
        int autocrawlShallow = configInt(sb, SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, 1);
        int autocrawlDeep = configInt(sb, SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, 3);
        boolean autocrawlText = sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true);
        boolean autocrawlMedia = sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true);

        if (post != null) {
            // checkboxes are read unconditionally: an absent key means "unchecked"
            autocrawlEnable = post.getBoolean("autocrawlEnable");
            if (post.containsKey("autocrawlRatio")) {
                autocrawlRatio = post.getInt("autocrawlRatio", 50);
            }
            if (post.containsKey("autocrawlRows")) {
                // fixed: the key was misspelled "autocralwRows", so the posted value was never read
                autocrawlRows = post.getInt("autocrawlRows", 100);
            }
            if (post.containsKey("autocrawlDays")) {
                autocrawlDays = post.getInt("autocrawlDays", 1);
            }
            if (post.containsKey("autocrawlQuery")) {
                autocrawlQuery = post.get("autocrawlQuery", "*:*");
            }
            if (post.containsKey("autocrawlShallow")) {
                autocrawlShallow = post.getInt("autocrawlShallow", 1);
            }
            if (post.containsKey("autocrawlDeep")) {
                autocrawlDeep = post.getInt("autocrawlDeep", 3);
            }
            autocrawlText = post.getBoolean("autocrawlText");
            autocrawlMedia = post.getBoolean("autocrawlMedia");
        }

        // clamp regardless of origin, so hand-edited config values are displayed in range too
        autocrawlRatio = clamp(autocrawlRatio, RATIO_MIN, RATIO_MAX);
        autocrawlRows = clamp(autocrawlRows, ROWS_MIN, ROWS_MAX);
        autocrawlDays = clamp(autocrawlDays, DAYS_MIN, DAYS_MAX);
        autocrawlShallow = clamp(autocrawlShallow, SHALLOW_MIN, SHALLOW_MAX);
        autocrawlDeep = clamp(autocrawlDeep, DEEP_MIN, DEEP_MAX);

        if (post != null) {
            sb.setConfig(SwitchboardConstants.AUTOCRAWL, autocrawlEnable);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_RATIO, autocrawlRatio);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_ROWS, autocrawlRows);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_DAYS, autocrawlDays);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_QUERY, autocrawlQuery);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, autocrawlShallow);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, autocrawlDeep);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, autocrawlText);
            sb.setConfig(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, autocrawlMedia);

            sb.initAutocrawl(autocrawlEnable);
            // makes the template show its "restart needed" notice
            prop.put("changed", true);
        }

        prop.put("autocrawlEnable", autocrawlEnable);
        prop.put("autocrawlRatio", autocrawlRatio);
        prop.put("autocrawlRows", autocrawlRows);
        prop.put("autocrawlDays", autocrawlDays);
        prop.put("autocrawlQuery", autocrawlQuery);
        prop.put("autocrawlShallow", autocrawlShallow);
        prop.put("autocrawlDeep", autocrawlDeep);
        prop.put("autocrawlText", autocrawlText);
        prop.put("autocrawlMedia", autocrawlMedia);

        return prop;
    }

    /**
     * Reads an integer setting, falling back to {@code dflt} when the stored
     * value is missing or not a valid integer (e.g. after a manual config edit).
     */
    private static int configInt(final Switchboard sb, final String key, final int dflt) {
        final String raw = sb.getConfig(key, Integer.toString(dflt));
        if (raw == null) {
            return dflt;
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (final NumberFormatException e) {
            return dflt;
        }
    }

    /** Limits {@code value} to the inclusive range {@code [min, max]}. */
    private static int clamp(final int value, final int min, final int max) {
        return Math.max(min, Math.min(max, value));
    }
}

@ -17,6 +17,7 @@
<li><a href="CrawlStartScanner_p.html" class="MenuItemLink #(authorized)#lock::unlock#(/authorized)#">Network Scanner</a></li> <li><a href="CrawlStartScanner_p.html" class="MenuItemLink #(authorized)#lock::unlock#(/authorized)#">Network Scanner</a></li>
<li><a href="RemoteCrawl_p.html" class="MenuItemLink #(authorized)#lock::unlock#(/authorized)#">Remote Crawling</a></li> <li><a href="RemoteCrawl_p.html" class="MenuItemLink #(authorized)#lock::unlock#(/authorized)#">Remote Crawling</a></li>
<li><a href="ProxyIndexingMonitor_p.html" class="MenuItemLink #(authorized)#lock::unlock#(/authorized)#">Scraping Proxy</a></li> <li><a href="ProxyIndexingMonitor_p.html" class="MenuItemLink #(authorized)#lock::unlock#(/authorized)#">Scraping Proxy</a></li>
<li><a href="Autocrawl_p.html" class="MenuItemLink #(authorized)#lock::unlock#(/authorized)#">Autocrawl</a></li>
</ul> </ul>
</div> </div>

@ -58,7 +58,9 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
public final class CrawlSwitchboard { public final class CrawlSwitchboard {
/** Name given to the default deep autocrawl profile; also registered in DEFAULT_PROFILES. */
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
/** Name given to the default shallow autocrawl profile; also registered in DEFAULT_PROFILES. */
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_PROXY = "proxy"; public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote"; public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
@ -70,6 +72,8 @@ public final class CrawlSwitchboard {
public static Set<String> DEFAULT_PROFILES = new HashSet<String>(); public static Set<String> DEFAULT_PROFILES = new HashSet<String>();
static { static {
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP);
DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW);
DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY); DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE); DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE);
DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT); DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
@ -98,6 +102,7 @@ public final class CrawlSwitchboard {
private final Map<String, RowHandleSet> profilesActiveCrawlsCounter; private final Map<String, RowHandleSet> profilesActiveCrawlsCounter;
public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile; public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile;
// Default profiles for autocrawl jobs; created in initActiveCrawlProfiles() and
// read by CrawlQueues.autocrawlJob() to obtain the profile handle for each enqueued host.
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile;
private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile
private final File queuesRoot; private final File queuesRoot;
private Switchboard switchboard; private Switchboard switchboard;
@ -268,8 +273,75 @@ public final class CrawlSwitchboard {
private void initActiveCrawlProfiles() { private void initActiveCrawlProfiles() {
// generate new default entry for proxy crawling
final Switchboard sb = Switchboard.getSwitchboard(); final Switchboard sb = Switchboard.getSwitchboard();
// generate new default entry for deep auto crawl
this.defaultAutocrawlDeepProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_DEEP,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
-1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
false,
false,
-1,
false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()),
this.defaultAutocrawlDeepProfile);
// generate new default entry for shallow auto crawl
this.defaultAutocrawlShallowProfile =
new CrawlProfile(
CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
-1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true),
false,
false,
-1,
false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW,
ClientIdentification.yacyInternetCrawlerAgentName,
null,
0);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()),
this.defaultAutocrawlShallowProfile);
// generate new default entry for proxy crawling
this.defaultProxyProfile = this.defaultProxyProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_PROXY, CRAWL_PROFILE_PROXY,

@ -29,6 +29,7 @@ package net.yacy.crawler.data;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
@ -39,12 +40,18 @@ import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ConnectionInfo; import net.yacy.cora.protocol.ConnectionInfo;
@ -548,6 +555,78 @@ public class CrawlQueues {
} }
return true; return true;
} }
/**
 * Autocrawl job: queries the local Solr index for hosts whose documents were
 * loaded longer ago than the configured number of days (one document per host,
 * oldest first) and enqueues the root URL of each such host for crawling.
 * Every {@code autocrawl.ratio}-th host of a batch is enqueued with the deep
 * autocrawl profile, all others with the shallow one.
 *
 * @return false if autocrawl is disabled, the local crawl is paused or
 *         coreCrawlJobSize() exceeds 200; true once the index query was
 *         attempted, even if it failed
 */
public boolean autocrawlJob() {
    if (!this.sb.getConfigBool(SwitchboardConstants.AUTOCRAWL, false)) {
        return false;
    }

    if (isPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
        return false;
    }

    // do not pile up more work while enough jobs are already waiting
    if (coreCrawlJobSize() > 200) {
        return false;
    }

    // A ratio of 0 (or garbage) in a hand-edited config would make the modulo below
    // throw ArithmeticException / NumberFormatException, so sanitise it up front.
    int deepRatio;
    try {
        deepRatio = Integer.parseInt(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_RATIO, "50").trim());
    } catch (final NumberFormatException e) {
        deepRatio = 50;
    }
    if (deepRatio < 1) {
        deepRatio = 1;
    }

    String rows = this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_ROWS, "100");

    String dateQuery = String.format("load_date_dt:[* TO NOW-%sDAY]", this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"));

    // grouped by host with a group limit of 1; group.main flattens the groups into a plain result list
    final SolrQuery query = new SolrQuery();
    query.add("group", "true");
    query.add("group.field", "host_s");
    query.add("group.limit", "1");
    query.add("group.main", "true");
    query.add("rows", rows);
    query.setQuery(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_QUERY, "*:*"));
    query.setFields("host_s,url_protocol_s");
    query.addSort("load_date_dt", SolrQuery.ORDER.asc);
    query.addFilterQuery(dateQuery);

    try {
        QueryResponse resp = sb.index.fulltext().getDefaultConnector().getResponseByParams(query);

        int i = 0;
        for (SolrDocument doc : resp.getResults()) {
            i++;
            final boolean deep = (i % deepRatio == 0);

            // documents lacking either field cannot be turned into a start URL
            final Object protocol = doc.getFieldValue("url_protocol_s");
            final Object host = doc.getFieldValue("host_s");
            if (protocol == null || host == null) {
                continue;
            }

            DigestURL url;
            final String u = protocol.toString() + "://" + host.toString();
            try {
                url = new DigestURL(u);
            } catch (final MalformedURLException e) {
                continue;
            }

            final String urlRejectReason = this.sb.crawlStacker.urlInAcceptedDomain(url);
            if (urlRejectReason == null) {
                final CrawlProfile profile = deep
                        ? this.sb.crawler.defaultAutocrawlDeepProfile
                        : this.sb.crawler.defaultAutocrawlShallowProfile;
                this.sb.crawlStacker.enqueueEntry(new Request(
                        ASCII.getBytes(this.sb.peers.mySeed().hash),
                        url,
                        null,
                        "CRAWLING-ROOT",
                        new Date(),
                        profile.handle(),
                        0,
                        profile.timezoneOffset()
                        ));
            } else {
                CrawlQueues.log.warn("autocrawl: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
            }
        }

    } catch (SolrException e) {
        // report through the crawler log instead of stderr
        CrawlQueues.log.warn("autocrawl: Solr query failed", e);
    } catch (IOException e) {
        CrawlQueues.log.warn("autocrawl: could not query the index", e);
    }

    return true;
}
/** /**
* @param url * @param url

@ -1069,6 +1069,7 @@ public final class Switchboard extends serverSwitch {
10000); 10000);
this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false)); this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
this.initAutocrawl(this.getConfigBool(SwitchboardConstants.AUTOCRAWL, false));
deployThread( deployThread(
SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL,
@ -1537,6 +1538,37 @@ public final class Switchboard extends serverSwitch {
rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000)); rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
} }
} }
/**
 * Stores the autocrawl switch in the configuration and, when enabling,
 * makes sure the autocrawl busy thread exists and uses the configured
 * busy/idle sleep times. Disabling only updates the configuration.
 * @param activate true=enable, false=disable
 */
public void initAutocrawl(final boolean activate) {
    this.setConfig(SwitchboardConstants.AUTOCRAWL, activate);
    if (!activate) {
        return;
    }

    BusyThread autocrawlThread = getThread(SwitchboardConstants.CRAWLJOB_AUTOCRAWL);
    if (autocrawlThread == null) {
        // not deployed yet: create the job wrapper and register it
        final InstantBusyThread autocrawlJob = new InstantBusyThread(
                this.crawlQueues,
                SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_START,
                SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_JOBCOUNT,
                SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_FREEMEM,
                10000,
                10000);
        deployThread(
                SwitchboardConstants.CRAWLJOB_AUTOCRAWL,
                "Autocrawl",
                "Thread that selects and automatically adds crawling jobs to the local queue",
                null,
                autocrawlJob,
                10000);
        autocrawlThread = getThread(SwitchboardConstants.CRAWLJOB_AUTOCRAWL);
    }

    // apply the configured sleep times to the new or already running thread
    final long busySleep = getConfigLong(SwitchboardConstants.CRAWLJOB_AUTOCRAWL_BUSYSLEEP, 10000);
    autocrawlThread.setBusySleep(busySleep);
    final long idleSleep = getConfigLong(SwitchboardConstants.CRAWLJOB_AUTOCRAWL_IDLESLEEP, 10000);
    autocrawlThread.setIdleSleep(idleSleep);
}
public void initMessages() throws IOException { public void initMessages() throws IOException {
this.log.config("Starting Message Board"); this.log.config("Starting Message Board");

@ -106,6 +106,17 @@ public final class SwitchboardConstants {
public static final String CRAWLJOB_LOCAL_CRAWL_IDLESLEEP = "50_localcrawl_idlesleep"; public static final String CRAWLJOB_LOCAL_CRAWL_IDLESLEEP = "50_localcrawl_idlesleep";
public static final String CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP = "50_localcrawl_busysleep"; public static final String CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP = "50_localcrawl_busysleep";
public static final String CRAWLJOB_LOCAL_CRAWL_LOADPREREQ = "50_localcrawl_loadprereq"; public static final String CRAWLJOB_LOCAL_CRAWL_LOADPREREQ = "50_localcrawl_loadprereq";
// 55_autocrawl
/**
* <p><code>public static final String <strong>CRAWLJOB_AUTOCRAWL</strong> = "55_autocrawl"</code></p>
* <p>Name of the autocrawl thread</p>
*/
public static final String CRAWLJOB_AUTOCRAWL = "55_autocrawl";
/** Name of the job method handed to the autocrawl InstantBusyThread together with the CrawlQueues instance. */
public static final String CRAWLJOB_AUTOCRAWL_METHOD_START = "autocrawlJob";
/** No job-count method name is supplied for the autocrawl thread. */
public static final String CRAWLJOB_AUTOCRAWL_METHOD_JOBCOUNT = null;
/** No free-memory method name is supplied for the autocrawl thread. */
public static final String CRAWLJOB_AUTOCRAWL_METHOD_FREEMEM = null;
/** Config key holding the idle sleep value of the autocrawl thread. */
public static final String CRAWLJOB_AUTOCRAWL_IDLESLEEP = "55_autocrawl_idlesleep";
/** Config key holding the busy sleep value of the autocrawl thread. */
public static final String CRAWLJOB_AUTOCRAWL_BUSYSLEEP = "55_autocrawl_busysleep";
// 60_remotecrawlloader // 60_remotecrawlloader
/** /**
* <p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p> * <p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p>
@ -308,6 +319,17 @@ public final class SwitchboardConstants {
* @see Switchboard#PROXY_CACHE_LAYOUT_HASH * @see Switchboard#PROXY_CACHE_LAYOUT_HASH
*/ */
public static final String PROXY_YACY_ONLY = "proxyYacyOnly"; public static final String PROXY_YACY_ONLY = "proxyYacyOnly";
// Autocrawl configuration keys
/** Boolean switch: the autocrawl job only runs while this is true. */
public static final String AUTOCRAWL = "autocrawl";
/** Boolean passed to both autocrawl profiles (presumably whether text is indexed — confirm against CrawlProfile). */
public static final String AUTOCRAWL_INDEX_TEXT = "autocrawl.index.text";
/** Boolean passed to both autocrawl profiles (presumably whether media is indexed — confirm against CrawlProfile). */
public static final String AUTOCRAWL_INDEX_MEDIA = "autocrawl.index.media";
/** Every n-th host of a fetched batch is enqueued with the deep profile instead of the shallow one. */
public static final String AUTOCRAWL_RATIO = "autocrawl.ratio";
/** Value of the Solr "rows" parameter used by the autocrawl host query. */
public static final String AUTOCRAWL_ROWS = "autocrawl.rows";
/** Age in days: the host query filters on load_date_dt older than this; also used for the profiles' recrawl date. */
public static final String AUTOCRAWL_DAYS = "autocrawl.days";
/** Solr query used to select candidate hosts. */
public static final String AUTOCRAWL_QUERY = "autocrawl.query";
/** Integer passed to the deep autocrawl profile (presumably its crawl depth — confirm against CrawlProfile). */
public static final String AUTOCRAWL_DEEP_DEPTH = "autocrawl.deep.depth";
/** Integer passed to the shallow autocrawl profile (presumably its crawl depth — confirm against CrawlProfile). */
public static final String AUTOCRAWL_SHALLOW_DEPTH = "autocrawl.shallow.depth";
////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////
// Cluster settings // Cluster settings

Loading…
Cancel
Save