Added a stop condition to the Recrawl busy thread

7 years ago · b2af25b14f
parent 421728d25a
commit b2af25b14f
2 changed files with 47 additions and 24 deletions
--- a/htroot/IndexReIndexMonitor_p.java
+++ b/htroot/IndexReIndexMonitor_p.java
@ -122,7 +122,7 @@ public class IndexReIndexMonitor_p {
                inclerrdoc = post.getBoolean("includefailedurls");
            }

-            if (recrawlbt == null) {
+            if (recrawlbt == null || recrawlbt.shutdownInProgress()) {
                prop.put("recrawljobrunning_simulationResult", 0);
                if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
 					sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
@ -165,7 +165,7 @@ public class IndexReIndexMonitor_p {
        // just post status of recrawlThread
        if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { // provide status
            prop.put("recrawljobrunning", 1);
-            prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
+            prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).getUrlsToRecrawl());
            prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
            prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
        } else {
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@ -45,7 +45,7 @@ import org.apache.solr.common.SolrDocumentList;
 * and feeds the found urls to the crawler to recrawl the documents.
 * This is intended to keep the index up-to-date
 * Currently the doucments are selected by expired fresh_date_dt field
- * an added to the crawler in smaller chunks (see chunksize) as long as no other crawl is runnin.
+ * an added to the crawler in smaller chunks (see chunksize) as long as no other crawl is running.
 */
 public class RecrawlBusyThread extends AbstractBusyThread {

@ -66,12 +66,18 @@ public class RecrawlBusyThread extends AbstractBusyThread {
    
    private int chunkstart = 0;
    private final int chunksize;
-    final Switchboard sb;
+    private final Switchboard sb;
    
    /** buffer of urls to recrawl */
    private final Set<DigestURL> urlstack;
-    public long urlsfound = 0;
+    
+    /** The total number of candidate URLs found for recrawl */
+    private long urlsToRecrawl = 0;
+    
    private String solrSortBy;
+    
+    /** Set to true when more URLs are still to be processed */
+    private boolean moreToRecrawl = true;

 	/**
 	 * @param xsb
@ -92,10 +98,10 @@ public class RecrawlBusyThread extends AbstractBusyThread {
        this.sb = xsb;
        this.currentQuery = query;
        this.includefailed = includeFailed;
-        urlstack = new HashSet<DigestURL>();
+        this.urlstack = new HashSet<DigestURL>();
        // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
        // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
-        solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
+        this.solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
        this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200);
    }

@ -191,10 +197,21 @@ public class RecrawlBusyThread extends AbstractBusyThread {
            return false;
        }

+        boolean didSomething = false;
        if (this.urlstack.isEmpty()) {
-            return processSingleQuery();
+        	if(!this.moreToRecrawl) {
+        		/* We do not remove the thread from the Switchboard worker threads using serverSwitch.terminateThread(String,boolean),
+        		 * because we want to be able to provide a report after its termination */
+        		terminate(false);
+        	} else {
+        		this.moreToRecrawl = processSingleQuery();
+        		/* Even if no more URLs are to recrawl, the job has done something by searching the Solr index */
+        		didSomething = true;
+        	}
+        } else {
+        	didSomething = feedToCrawler();
        }
-        return feedToCrawler();
+        return didSomething;

    }

@ -208,21 +225,22 @@ public class RecrawlBusyThread extends AbstractBusyThread {
        }
        SolrDocumentList docList = null;
        SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
-        if (!solrConnector.isClosed()) {
-            try {
-                // query all or only httpstatus=200 depending on includefailed flag
-                docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
-                        this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
-                this.urlsfound = docList.getNumFound();
-            } catch (Throwable e) {
-                this.urlsfound = 0;
-            }
-        } else {
-            this.urlsfound =0;
+        if (solrConnector.isClosed()) {
+        	this.urlsToRecrawl = 0;
+        	return false;
+        }
+        
+        try {
+            // query all or only httpstatus=200 depending on includefailed flag
+            docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
+                this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
+            this.urlsToRecrawl = docList.getNumFound();
+        } catch (final Throwable e) {
+        	this.urlsToRecrawl = 0;
        }

        if (docList != null) {
-            for (SolrDocument doc : docList) {
+            for (final SolrDocument doc : docList) {
                try {
                    this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
                } catch (MalformedURLException ex) {
@ -237,10 +255,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
            this.chunkstart = this.chunkstart + this.chunksize;
        }
        
-        if (this.urlsfound <= this.chunkstart) {
-            this.chunkstart = 0;
+        if (docList == null || docList.size() < this.chunksize) {
            return false;
-            // TODO: add a stop condition
        }
        return true;
    }
@ -249,6 +265,13 @@ public class RecrawlBusyThread extends AbstractBusyThread {
    public int getJobCount() {
        return this.urlstack.size();
    }
+    
+    /**
+     * @return The total number of candidate URLs found for recrawl
+     */
+    public long getUrlsToRecrawl() {
+		return this.urlsToRecrawl;
+	}

    @Override
    public void freemem() {