From a98c395023d63be45e1ae0abe7cd7f6342966206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryszard=20Go=C5=84?= Date: Thu, 14 Jan 2016 00:50:23 +0100 Subject: [PATCH] Add the Autocrawl thread --- defaults/yacy.init | 10 ++- source/net/yacy/crawler/CrawlSwitchboard.java | 4 +- source/net/yacy/crawler/data/CrawlQueues.java | 79 +++++++++++++++++++ source/net/yacy/search/Switchboard.java | 32 ++++++++ .../net/yacy/search/SwitchboardConstants.java | 17 +++- 5 files changed, 136 insertions(+), 6 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 4b9907589..e6ec5dc75 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -542,10 +542,12 @@ proxyURL.useforresults=false autocrawl=false autocrawl.index.text=true autocrawl.index.meia=true +autocrawl.ratio=50 +autocrawl.rows=100 +autocrawl.days=1 +autocrawl.query=*:* autocrawl.deep.depth=3 -autocrawl.deep.recrawl=43200 autocrawl.shallow.depth=1 -autocrawl.shallow.recrawl=1440 # From the 'IndexCreate' menu point you can also define a crawling start point. 
# The crawling works the same way as the prefetch, but it is possible to @@ -636,6 +638,10 @@ collection=user 50_localcrawl_memprereq=25165824 50_localcrawl_loadprereq=6.0 50_localcrawl_isPaused=false +55_autocrawl_idlesleep=10000 +55_autocrawl_busysleep=10000 +55_autocrawl_memprereq=25165824 +55_autocrawl_loadprereq=6.0 60_remotecrawlloader_idlesleep=4000 60_remotecrawlloader_busysleep=800 60_remotecrawlloader_memprereq=12582912 diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index d2101eee7..9e3b18081 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -291,7 +291,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")), true, - CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_RECRAWL, "43200"))), + CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), -1, true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true), @@ -324,7 +324,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")), true, - CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_RECRAWL, "1440"))), + CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), -1, true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true), diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java 
index e7bcd5e3b..4593c419d 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -29,6 +29,7 @@ package net.yacy.crawler.data;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
+import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
@@ -39,12 +40,18 @@ import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrException;
+
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.feed.Hit;
 import net.yacy.cora.document.feed.RSSFeed;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.solr.FailCategory;
+import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ConnectionInfo;
@@ -548,6 +555,106 @@ public class CrawlQueues {
         }
         return true;
     }
+
+    /**
+     * Autocrawl job: asks the local index for one document per host whose load
+     * date is at least <code>autocrawl.days</code> days old and hands the root
+     * URL of every such host to the crawl stacker. Every
+     * <code>autocrawl.ratio</code>-th host is queued with the deep autocrawl
+     * profile, all others with the shallow one.
+     *
+     * @return false if autocrawl is switched off, the local crawl is paused or
+     *         more than 200 local crawl jobs are waiting; true otherwise, also
+     *         when the index query failed
+     */
+    public boolean autocrawlJob() {
+        if (!this.sb.getConfigBool(SwitchboardConstants.AUTOCRAWL, false)
+                || isPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)
+                || coreCrawlJobSize() > 200) {
+            return false;
+        }
+
+        // these numbers end up in the Solr query and in a modulo, so read them defensively
+        final int rows = Math.max(0, autocrawlConfigInt(SwitchboardConstants.AUTOCRAWL_ROWS, 100));
+        final int days = Math.max(0, autocrawlConfigInt(SwitchboardConstants.AUTOCRAWL_DAYS, 1));
+        final int deepRatio = autocrawlConfigInt(SwitchboardConstants.AUTOCRAWL_RATIO, 50);
+
+        final SolrQuery query = new SolrQuery();
+        query.setQuery(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_QUERY, "*:*"));
+        query.addFilterQuery("load_date_dt:[* TO NOW-" + days + "DAY]");
+        query.setFields("host_s,url_protocol_s");
+        query.addSort("load_date_dt", SolrQuery.ORDER.asc);
+        // group by host (one document each) and ask for the groups as the main result list
+        query.add("group", "true");
+        query.add("group.field", "host_s");
+        query.add("group.limit", "1");
+        query.add("group.main", "true");
+        query.add("rows", Integer.toString(rows));
+
+        try {
+            final QueryResponse resp = this.sb.index.fulltext().getDefaultConnector().getResponseByParams(query);
+
+            int i = 0;
+            for (final SolrDocument doc : resp.getResults()) {
+                i++;
+                // a ratio below 1 switches deep crawls off instead of failing on "% 0"
+                final boolean deep = deepRatio > 0 && i % deepRatio == 0;
+
+                final Object protocol = doc.getFieldValue("url_protocol_s");
+                final Object host = doc.getFieldValue("host_s");
+                if (protocol == null || host == null) {
+                    continue; // incomplete index entry, no start URL can be built
+                }
+
+                DigestURL url;
+                try {
+                    url = new DigestURL(protocol + "://" + host);
+                } catch (final MalformedURLException e) {
+                    continue;
+                }
+
+                final String urlRejectReason = this.sb.crawlStacker.urlInAcceptedDomain(url);
+                if (urlRejectReason != null) {
+                    CrawlQueues.log.warn("autocrawl: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
+                    continue;
+                }
+
+                this.sb.crawlStacker.enqueueEntry(new Request(
+                        ASCII.getBytes(this.sb.peers.mySeed().hash),
+                        url,
+                        null,
+                        "CRAWLING-ROOT",
+                        new Date(),
+                        deep ? this.sb.crawler.defaultAutocrawlDeepProfile.handle() : this.sb.crawler.defaultAutocrawlShallowProfile.handle(),
+                        0,
+                        deep ? this.sb.crawler.defaultAutocrawlDeepProfile.timezoneOffset() : this.sb.crawler.defaultAutocrawlShallowProfile.timezoneOffset()
+                ));
+            }
+        } catch (final SolrException e) {
+            CrawlQueues.log.warn("autocrawl: index query failed: " + e);
+        } catch (final IOException e) {
+            CrawlQueues.log.warn("autocrawl: index query failed: " + e);
+        }
+
+        return true;
+    }
+
+    /**
+     * Reads an integer setting and falls back to the given default when the
+     * configured text is not a number.
+     *
+     * @param key name of the configuration entry
+     * @param dflt value handed to getConfig as default and used for malformed entries
+     * @return the parsed setting or <code>dflt</code>
+     */
+    private int autocrawlConfigInt(final String key, final int dflt) {
+        try {
+            return Integer.parseInt(this.sb.getConfig(key, Integer.toString(dflt)).trim());
+        } catch (final NumberFormatException e) {
+            CrawlQueues.log.warn("autocrawl: invalid value for '" + key + "', using default " + dflt);
+            return dflt;
+        }
+    }
 
     /**
      * @param url
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 7596892b8..429526988 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1069,6 +1069,7 @@ public final class Switchboard extends serverSwitch {
                 10000);
 
         this.initRemoteCrawler(this.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
+        this.initAutocrawl(this.getConfigBool(SwitchboardConstants.AUTOCRAWL, false));
 
         deployThread(
                 SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL,
@@ -1537,6 +1538,43 @@ public final class Switchboard extends serverSwitch {
             rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
         }
     }
+
+    /**
+     * Initialise the Autocrawl thread. The switch is always written to the
+     * configuration; only when activating, the thread is deployed (unless it is
+     * already registered) and the configured busy/idle sleep times are applied.
+     *
+     * @param activate true=enable, false=disable
+     */
+    public void initAutocrawl(final boolean activate) {
+        this.setConfig(SwitchboardConstants.AUTOCRAWL, activate);
+        if (!activate) {
+            return; // no thread is started or stopped when switching off
+        }
+
+        BusyThread acr = getThread(SwitchboardConstants.CRAWLJOB_AUTOCRAWL);
+        if (acr == null) {
+            deployThread(
+                    SwitchboardConstants.CRAWLJOB_AUTOCRAWL,
+                    "Autocrawl",
+                    "Thread that selects and automatically adds crawling jobs to the local queue",
+                    null,
+                    new InstantBusyThread(
+                            this.crawlQueues,
+                            SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_START,
+                            SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_JOBCOUNT,
+                            SwitchboardConstants.CRAWLJOB_AUTOCRAWL_METHOD_FREEMEM,
+                            10000,
+                            10000),
+                    10000);
+            acr = getThread(SwitchboardConstants.CRAWLJOB_AUTOCRAWL);
+        }
+
+        if (acr != null) { // the second lookup may still come back empty; do not fail on it
+            acr.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_AUTOCRAWL_BUSYSLEEP, 10000));
+            acr.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_AUTOCRAWL_IDLESLEEP, 10000));
+        }
+    }
 
     public void initMessages() throws IOException {
         this.log.config("Starting Message Board");
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index d11fa1776..eacb46d7a 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -106,6 +106,17 @@ public final class SwitchboardConstants {
     public static final String CRAWLJOB_LOCAL_CRAWL_IDLESLEEP = "50_localcrawl_idlesleep";
     public static final String CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP = "50_localcrawl_busysleep";
     public static final String CRAWLJOB_LOCAL_CRAWL_LOADPREREQ = "50_localcrawl_loadprereq";
+    // 55_autocrawl
+    /**
+     *

<p><code>public static final String <strong>CRAWLJOB_AUTOCRAWL</strong> = "55_autocrawl"</code></p>
+ * <p>Name of the autocrawl thread</p>

+ */ + public static final String CRAWLJOB_AUTOCRAWL = "55_autocrawl"; + public static final String CRAWLJOB_AUTOCRAWL_METHOD_START = "autocrawlJob"; + public static final String CRAWLJOB_AUTOCRAWL_METHOD_JOBCOUNT = null; + public static final String CRAWLJOB_AUTOCRAWL_METHOD_FREEMEM = null; + public static final String CRAWLJOB_AUTOCRAWL_IDLESLEEP = "55_autocrawl_idlesleep"; + public static final String CRAWLJOB_AUTOCRAWL_BUSYSLEEP = "55_autocrawl_busysleep"; // 60_remotecrawlloader /** *

<p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p>

@@ -312,10 +323,12 @@ public final class SwitchboardConstants { public static final String AUTOCRAWL = "autocrawl"; public static final String AUTOCRAWL_INDEX_TEXT = "autocrawl.index.text"; public static final String AUTOCRAWL_INDEX_MEDIA = "autocrawl.index.media"; + public static final String AUTOCRAWL_RATIO = "autocrawl.ratio"; + public static final String AUTOCRAWL_ROWS = "autocrawl.rows"; + public static final String AUTOCRAWL_DAYS = "autocrawl.days"; + public static final String AUTOCRAWL_QUERY = "autocrawl.query"; public static final String AUTOCRAWL_DEEP_DEPTH = "autocrawl.deep.depth"; - public static final String AUTOCRAWL_DEEP_RECRAWL = "autocrawl.deep.recrawl"; public static final String AUTOCRAWL_SHALLOW_DEPTH = "autocrawl.shallow.depth"; - public static final String AUTOCRAWL_SHALLOW_RECRAWL = "autocrawl.shallow.recrawl"; //////////////////////////////////////////////////////////////////////////////////////////////