Initialize the recrawl job's chunk size to the max crawl loader value at job start, to make use of the system preferences

and allow injection of recrawl URLs before the queue is empty.
During a recrawl the balancer hangs on the very last URLs, often on hosts with a huge delay time.
Allowing injection earlier makes progress more balanced. The maximum number of crawl URLs injected by the recrawl job is 2 * max loader.
pull/23/head
reger 9 years ago
parent 9244694e64
commit 7a64bebb86

@ -35,6 +35,7 @@ import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.workflow.AbstractBusyThread; import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
@ -53,7 +54,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
private int chunkstart = 0; private int chunkstart = 0;
private int chunksize = 200; private final int chunksize;
final Switchboard sb; final Switchboard sb;
private final Set<DigestURL> urlstack; // buffer of urls to recrawl private final Set<DigestURL> urlstack; // buffer of urls to recrawl
public long urlsfound = 0; public long urlsfound = 0;
@ -70,6 +71,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues. // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc"; solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200);
} }
/** /**
@ -146,8 +148,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
*/ */
@Override @Override
public boolean job() { public boolean job() {
// other crawls are running, do nothing // more than chunksize crawls are running, do nothing
if (sb.crawlQueues.coreCrawlJobSize() > 0) { if (sb.crawlQueues.coreCrawlJobSize() > this.chunksize) {
return false; return false;
} }

Loading…
Cancel
Save