init recrawl job chunk size to the max crawl loader setting at job start, so that it follows the system preferences

and allow injection of recrawl urls before the queue is empty. During recrawl the balancer often hangs on the very last urls, on hosts with a huge delay time; by allowing injection earlier, progress is more balanced. The max number of crawl urls injected by the recrawl job is 2 * max loader.
9 years ago · 7a64bebb86
parent 9244694e64
commit 7a64bebb86
1 changed files with 5 additions and 3 deletions
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@ -35,6 +35,7 @@ import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.workflow.AbstractBusyThread;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.schema.CollectionSchema;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
@ -53,7 +54,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
    private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
    private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
    private int chunkstart = 0;
-    private int chunksize = 200;
+    private final int chunksize;
    final Switchboard sb;
    private final Set<DigestURL> urlstack; // buffer of urls to recrawl
    public long urlsfound = 0;
@ -70,6 +71,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
        // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
        // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
        solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
        this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200);
    }
    /**
@ -146,8 +148,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
     */
    @Override
    public boolean job() {
-        // other crawls are running, do nothing
+        // more than chunksize crawls are running, do nothing
-        if (sb.crawlQueues.coreCrawlJobSize() > 0) {
+        if (sb.crawlQueues.coreCrawlJobSize() > this.chunksize) {
            return false;
        }