Add the Autocrawl thread

pull/40/head
Ryszard Goń 9 years ago
parent 1728cd30c6
commit a98c395023

@ -542,10 +542,12 @@ proxyURL.useforresults=false
# Autocrawl: when enabled, a background thread queries the local index for
# hosts and enqueues their root URLs for crawling again.
autocrawl=false
autocrawl.index.text=true
autocrawl.index.media=true
# every N-th selected host is crawled with the 'deep' profile, all others
# with the 'shallow' profile
autocrawl.ratio=50
# maximum number of hosts (one document per host) fetched per run
autocrawl.rows=100
# only documents loaded at least this many days ago are selected
autocrawl.days=1
# Solr query used to select candidate documents
autocrawl.query=*:*
# crawl depth and recrawl age for the deep and shallow profiles
# (recrawl values are presumably minutes: 43200 = 30 days, 1440 = 1 day)
autocrawl.deep.depth=3
autocrawl.deep.recrawl=43200
autocrawl.shallow.depth=1
autocrawl.shallow.recrawl=1440
# From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to
@ -636,6 +638,10 @@ collection=user
50_localcrawl_memprereq=25165824
50_localcrawl_loadprereq=6.0
50_localcrawl_isPaused=false
# 55_autocrawl: settings of the autocrawl thread; idlesleep and busysleep
# are the sleep times applied to the thread when autocrawl is initialised
# (presumably milliseconds, memprereq presumably bytes - as for the other threads)
55_autocrawl_idlesleep=10000
55_autocrawl_busysleep=10000
55_autocrawl_memprereq=25165824
55_autocrawl_loadprereq=6.0
60_remotecrawlloader_idlesleep=4000
60_remotecrawlloader_busysleep=800
60_remotecrawlloader_memprereq=12582912

@ -291,7 +291,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_RECRAWL, "43200"))),
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
-1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),
@ -324,7 +324,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_RECRAWL, "1440"))),
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
-1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true),

@ -29,6 +29,7 @@ package net.yacy.crawler.data;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
@ -39,12 +40,18 @@ import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ConnectionInfo;
@ -549,6 +556,78 @@ public class CrawlQueues {
return true;
}
/**
 * Selects hosts from the local Solr index and enqueues their root URLs
 * for crawling.
 * <p>
 * The index is queried (grouped by <code>host_s</code>, one document per host,
 * oldest <code>load_date_dt</code> first) for documents that were loaded at least
 * {@link SwitchboardConstants#AUTOCRAWL_DAYS} days ago. For every returned host
 * the URL <code>protocol://host</code> is stacked; every N-th host
 * (N = {@link SwitchboardConstants#AUTOCRAWL_RATIO}) uses the deep autocrawl
 * profile, all others the shallow one. A ratio of 0 disables deep crawls.
 *
 * @return <code>false</code> if autocrawl is disabled, the local crawl is paused
 *         or more than 200 entries wait in the core crawl queue;
 *         <code>true</code> otherwise, even if the index query failed
 */
public boolean autocrawlJob() {
    if (!this.sb.getConfigBool(SwitchboardConstants.AUTOCRAWL, false)) {
        return false;
    }
    if (isPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
        return false;
    }
    if (coreCrawlJobSize() > 200) {
        return false;
    }

    // a broken ratio setting must not make the job fail on every run
    int deepRatio;
    try {
        deepRatio = Integer.parseInt(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_RATIO, "50"));
    } catch (final NumberFormatException e) {
        CrawlQueues.log.warn("autocrawl: invalid value for '" + SwitchboardConstants.AUTOCRAWL_RATIO + "', using 50");
        deepRatio = 50;
    }

    final String rows = this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_ROWS, "100");
    final String dateQuery = String.format("load_date_dt:[* TO NOW-%sDAY]", this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"));
    final SolrQuery query = new SolrQuery();
    // group by host and flatten the groups, so each host appears once in the result list
    query.add("group", "true");
    query.add("group.field", "host_s");
    query.add("group.limit", "1");
    query.add("group.main", "true");
    query.add("rows", rows);
    query.setQuery(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_QUERY, "*:*"));
    query.setFields("host_s,url_protocol_s");
    query.addSort("load_date_dt", SolrQuery.ORDER.asc);
    query.addFilterQuery(dateQuery);
    try {
        final QueryResponse resp = this.sb.index.fulltext().getDefaultConnector().getResponseByParams(query);
        int i = 0;
        for (final SolrDocument doc : resp.getResults()) {
            i++;
            // guard against division by zero when the ratio is configured as 0
            final boolean deep = deepRatio != 0 && i % deepRatio == 0;

            final Object protocol = doc.getFieldValue("url_protocol_s");
            final Object host = doc.getFieldValue("host_s");
            if (protocol == null || host == null) {
                // document lacks a field needed to build the root URL
                continue;
            }
            final DigestURL url;
            try {
                url = new DigestURL(protocol.toString() + "://" + host.toString());
            } catch (final MalformedURLException e) {
                continue;
            }
            final String urlRejectReason = this.sb.crawlStacker.urlInAcceptedDomain(url);
            if (urlRejectReason == null) {
                this.sb.crawlStacker.enqueueEntry(new Request(
                        ASCII.getBytes(this.sb.peers.mySeed().hash),
                        url,
                        null,
                        "CRAWLING-ROOT",
                        new Date(),
                        deep ? this.sb.crawler.defaultAutocrawlDeepProfile.handle() : this.sb.crawler.defaultAutocrawlShallowProfile.handle(),
                        0,
                        deep ? this.sb.crawler.defaultAutocrawlDeepProfile.timezoneOffset() : this.sb.crawler.defaultAutocrawlShallowProfile.timezoneOffset()
                ));
            } else {
                CrawlQueues.log.warn("autocrawl: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
            }
        }
    } catch (final SolrException e) {
        CrawlQueues.log.warn("autocrawl: index query failed: " + e);
    } catch (final IOException e) {
        CrawlQueues.log.warn("autocrawl: index query failed: " + e);
    }
    return true;
}
/**
* @param url
* @return

@ -1069,6 +1069,7 @@ public final class Switchboard extends serverSwitch {
10000);
this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
this.initAutocrawl(this.getConfigBool(SwitchboardConstants.AUTOCRAWL, false));
deployThread(
SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL,
@ -1538,6 +1539,37 @@ public final class Switchboard extends serverSwitch {
}
}
/**
 * Stores the autocrawl switch in the configuration and, when activating,
 * makes sure the Autocrawl thread is deployed and uses the configured
 * sleep times. Passing <code>false</code> only updates the configuration
 * value; this method does not itself stop an already deployed thread.
 *
 * @param activate true=enable, false=disable
 */
public void initAutocrawl(final boolean activate) {
    this.setConfig(SwitchboardConstants.AUTOCRAWL, activate);
    if (!activate) {
        return;
    }

    final String threadName = SwitchboardConstants.CRAWLJOB_AUTOCRAWL;
    BusyThread autocrawlThread = getThread(threadName);
    if (autocrawlThread == null) {
        // not deployed yet: create the worker with default sleep times and register it
        final InstantBusyThread worker = new InstantBusyThread(
                this.crawlQueues,
                SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_START,
                SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_JOBCOUNT,
                SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_FREEMEM,
                10000,
                10000);
        deployThread(
                threadName,
                "Autocrawl",
                "Thread that selects and automatically adds crawling jobs to the local queue",
                null,
                worker,
                10000);
        autocrawlThread = getThread(threadName);
    }

    // apply the configured sleep times to the (new or existing) thread
    final long busySleep = getConfigLong(SwitchboardConstants.CRAWLJOB_AUTOCRAWL_BUSYSLEEP, 10000);
    autocrawlThread.setBusySleep(busySleep);
    final long idleSleep = getConfigLong(SwitchboardConstants.CRAWLJOB_AUTOCRAWL_IDLESLEEP, 10000);
    autocrawlThread.setIdleSleep(idleSleep);
}
public void initMessages() throws IOException {
this.log.config("Starting Message Board");
final File messageDbFile = new File(this.workPath, "message.heap");

@ -106,6 +106,17 @@ public final class SwitchboardConstants {
public static final String CRAWLJOB_LOCAL_CRAWL_IDLESLEEP = "50_localcrawl_idlesleep";
public static final String CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP = "50_localcrawl_busysleep";
public static final String CRAWLJOB_LOCAL_CRAWL_LOADPREREQ = "50_localcrawl_loadprereq";
// 55_autocrawl
/**
 * <p><code>public static final String <strong>CRAWLJOB_AUTOCRAWL</strong> = "55_autocrawl"</code></p>
 * <p>Name of the autocrawl thread</p>
 */
public static final String CRAWLJOB_AUTOCRAWL = "55_autocrawl";
/** Name of the method that is handed to the autocrawl thread as its job method */
public static final String CRAWLJOB_AUTOCRAWL_METHOD_START = "autocrawlJob";
/** Job-count method name handed to the autocrawl thread; null - presumably meaning none is used */
public static final String CRAWLJOB_AUTOCRAWL_METHOD_JOBCOUNT = null;
/** Free-memory method name handed to the autocrawl thread; null - presumably meaning none is used */
public static final String CRAWLJOB_AUTOCRAWL_METHOD_FREEMEM = null;
/** Config key of the idle sleep time applied to the autocrawl thread */
public static final String CRAWLJOB_AUTOCRAWL_IDLESLEEP = "55_autocrawl_idlesleep";
/** Config key of the busy sleep time applied to the autocrawl thread */
public static final String CRAWLJOB_AUTOCRAWL_BUSYSLEEP = "55_autocrawl_busysleep";
// 60_remotecrawlloader
/**
* <p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p>
@ -312,10 +323,12 @@ public final class SwitchboardConstants {
/** Config key: boolean switch that enables the autocrawl job */
public static final String AUTOCRAWL = "autocrawl";
/** Config key: boolean handed to the autocrawl crawl profiles (presumably: index text content) */
public static final String AUTOCRAWL_INDEX_TEXT = "autocrawl.index.text";
/** Config key: presumably the media counterpart of {@link #AUTOCRAWL_INDEX_TEXT} */
public static final String AUTOCRAWL_INDEX_MEDIA = "autocrawl.index.media";
/** Config key: every N-th selected host is crawled with the deep profile, the others with the shallow one */
public static final String AUTOCRAWL_RATIO = "autocrawl.ratio";
/** Config key: value of the Solr 'rows' parameter of the autocrawl selection query */
public static final String AUTOCRAWL_ROWS = "autocrawl.rows";
/** Config key: number of days used in the load date filter of the selection query */
public static final String AUTOCRAWL_DAYS = "autocrawl.days";
/** Config key: Solr query string of the autocrawl selection query */
public static final String AUTOCRAWL_QUERY = "autocrawl.query";
/** Config key: crawl depth of the deep autocrawl profile */
public static final String AUTOCRAWL_DEEP_DEPTH = "autocrawl.deep.depth";
/** Config key: recrawl age setting of the deep autocrawl profile */
public static final String AUTOCRAWL_DEEP_RECRAWL = "autocrawl.deep.recrawl";
/** Config key: crawl depth of the shallow autocrawl profile */
public static final String AUTOCRAWL_SHALLOW_DEPTH = "autocrawl.shallow.depth";
/** Config key: recrawl age setting of the shallow autocrawl profile */
public static final String AUTOCRAWL_SHALLOW_RECRAWL = "autocrawl.shallow.recrawl";
//////////////////////////////////////////////////////////////////////////////////////////////

Loading…
Cancel
Save