Added a stop condition to the Recrawl busy thread

pull/154/head
luccioman 7 years ago
parent 421728d25a
commit b2af25b14f

@ -122,7 +122,7 @@ public class IndexReIndexMonitor_p {
inclerrdoc = post.getBoolean("includefailedurls"); inclerrdoc = post.getBoolean("includefailedurls");
} }
if (recrawlbt == null) { if (recrawlbt == null || recrawlbt.shutdownInProgress()) {
prop.put("recrawljobrunning_simulationResult", 0); prop.put("recrawljobrunning_simulationResult", 0);
if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) { if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null, sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
@ -165,7 +165,7 @@ public class IndexReIndexMonitor_p {
// just post status of recrawlThread // just post status of recrawlThread
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { // provide status if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { // provide status
prop.put("recrawljobrunning", 1); prop.put("recrawljobrunning", 1);
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound); prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).getUrlsToRecrawl());
prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery()); prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed()); prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
} else { } else {

@ -45,7 +45,7 @@ import org.apache.solr.common.SolrDocumentList;
* and feeds the found urls to the crawler to recrawl the documents. * and feeds the found urls to the crawler to recrawl the documents.
* This is intended to keep the index up-to-date * This is intended to keep the index up-to-date
* Currently the doucments are selected by expired fresh_date_dt field * Currently the doucments are selected by expired fresh_date_dt field
* an added to the crawler in smaller chunks (see chunksize) as long as no other crawl is runnin. * an added to the crawler in smaller chunks (see chunksize) as long as no other crawl is running.
*/ */
public class RecrawlBusyThread extends AbstractBusyThread { public class RecrawlBusyThread extends AbstractBusyThread {
@ -66,12 +66,18 @@ public class RecrawlBusyThread extends AbstractBusyThread {
private int chunkstart = 0; private int chunkstart = 0;
private final int chunksize; private final int chunksize;
final Switchboard sb; private final Switchboard sb;
/** buffer of urls to recrawl */ /** buffer of urls to recrawl */
private final Set<DigestURL> urlstack; private final Set<DigestURL> urlstack;
public long urlsfound = 0;
/** The total number of candidate URLs found for recrawl */
private long urlsToRecrawl = 0;
private String solrSortBy; private String solrSortBy;
/** Set to true when more URLs are still to be processed */
private boolean moreToRecrawl = true;
/** /**
* @param xsb * @param xsb
@ -92,10 +98,10 @@ public class RecrawlBusyThread extends AbstractBusyThread {
this.sb = xsb; this.sb = xsb;
this.currentQuery = query; this.currentQuery = query;
this.includefailed = includeFailed; this.includefailed = includeFailed;
urlstack = new HashSet<DigestURL>(); this.urlstack = new HashSet<DigestURL>();
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues. // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc"; this.solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200); this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200);
} }
@ -191,10 +197,21 @@ public class RecrawlBusyThread extends AbstractBusyThread {
return false; return false;
} }
boolean didSomething = false;
if (this.urlstack.isEmpty()) { if (this.urlstack.isEmpty()) {
return processSingleQuery(); if(!this.moreToRecrawl) {
/* We do not remove the thread from the Switchboard worker threads using serverSwitch.terminateThread(String,boolean),
* because we want to be able to provide a report after its termination */
terminate(false);
} else {
this.moreToRecrawl = processSingleQuery();
/* Even if no more URLs are to recrawl, the job has done something by searching the Solr index */
didSomething = true;
}
} else {
didSomething = feedToCrawler();
} }
return feedToCrawler(); return didSomething;
} }
@ -208,21 +225,22 @@ public class RecrawlBusyThread extends AbstractBusyThread {
} }
SolrDocumentList docList = null; SolrDocumentList docList = null;
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) { if (solrConnector.isClosed()) {
try { this.urlsToRecrawl = 0;
// query all or only httpstatus=200 depending on includefailed flag return false;
docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed), }
this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
this.urlsfound = docList.getNumFound(); try {
} catch (Throwable e) { // query all or only httpstatus=200 depending on includefailed flag
this.urlsfound = 0; docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
} this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
} else { this.urlsToRecrawl = docList.getNumFound();
this.urlsfound =0; } catch (final Throwable e) {
this.urlsToRecrawl = 0;
} }
if (docList != null) { if (docList != null) {
for (SolrDocument doc : docList) { for (final SolrDocument doc : docList) {
try { try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()))); this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
} catch (MalformedURLException ex) { } catch (MalformedURLException ex) {
@ -237,10 +255,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
this.chunkstart = this.chunkstart + this.chunksize; this.chunkstart = this.chunkstart + this.chunksize;
} }
if (this.urlsfound <= this.chunkstart) { if (docList == null || docList.size() < this.chunksize) {
this.chunkstart = 0;
return false; return false;
// TODO: add a stop condition
} }
return true; return true;
} }
@ -249,6 +265,13 @@ public class RecrawlBusyThread extends AbstractBusyThread {
public int getJobCount() { public int getJobCount() {
return this.urlstack.size(); return this.urlstack.size();
} }
/**
* @return The total number of candidate URLs found for recrawl
*/
public long getUrlsToRecrawl() {
return this.urlsToRecrawl;
}
@Override @Override
public void freemem() { public void freemem() {

Loading…
Cancel
Save