Made it possible to customize the selection query before launching a recrawl

pull/154/head
luccioman 7 years ago
parent fab6e54fec
commit 421728d25a

@ -68,10 +68,24 @@
<fieldset>
#(recrawljobrunning)#
<div class="form-group">
<label>Solr query <input type="text" name="recrawlquerytext" size="40" value="#[recrawlquerytext]#" /></label>
<input type="submit" name="simulateRecrawl" value="Simulate" class="btn btn-default" title="Check only how many documents would be selected for recrawl"/>
</div>
#(simulationResult)#
::
<div class="alert alert-success" role="alert">#[docCount]# documents selected for recrawl.</div>
::
<div class="alert alert-danger" role="alert">An error occurred when trying to run the selection query.</div>
::
<div class="alert alert-danger" role="alert">The Solr index is not connected. Please restart your peer.</div>
#(/simulationResult)#
<div class="form-group">
<label>Include failed URLs <input type="checkbox" name="includefailedurls" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></label>
</div>
<input type="submit" name="recrawlDefaults" value="Set defaults" class="btn btn-default" title="Reset to default values"/>
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents with fresh_date_dt before today.
<p></p>
<p><small>after starting the recrawl job you can apply a custom Solr query to select documents to be processed</small></p>
to re-crawl documents selected with the given query.
::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
#(/recrawljobrunning)#

@ -17,9 +17,14 @@
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.workflow.BusyThread;
import java.io.IOException;
import net.yacy.migration;
import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.data.TransactionManager;
@ -101,21 +106,46 @@ public class IndexReIndexMonitor_p {
// recrawl job handling
BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
String recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
boolean inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
// to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only
if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
if(post.containsKey("recrawlquerytext")) {
recrawlQuery = post.get("recrawlquerytext");
}
if (post.containsKey("includefailedurls")) {
inclerrdoc = post.getBoolean("includefailedurls");
}
if (recrawlbt == null) {
prop.put("recrawljobrunning_simulationResult", 0);
if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
null,
new RecrawlBusyThread(Switchboard.getSwitchboard()),
1000);
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
} else if(post.containsKey("simulateRecrawl") && sb.index.fulltext().connectedLocalSolr()) {
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) {
try {
// query all or only httpstatus=200 depending on includefailed flag
final long count = solrConnector.getCountByQuery(RecrawlBusyThread.buildSelectionQuery(recrawlQuery, inclerrdoc));
prop.put("recrawljobrunning_simulationResult", 1);
prop.put("recrawljobrunning_simulationResult_docCount", count);
} catch (final IOException e) {
prop.put("recrawljobrunning_simulationResult", 2);
ConcurrentLog.logException(e);
}
} else {
prop.put("recrawljobrunning_simulationResult", 3);
}
} else if(post.containsKey("recrawlDefaults")) {
recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
}
} else {
if (post.containsKey("stoprecrawl")) {
@ -124,14 +154,9 @@ public class IndexReIndexMonitor_p {
}
}
boolean inclerrdoc = false;
if (post.containsKey("includefailedurls")) {
inclerrdoc = post.getBoolean("includefailedurls");
}
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
((RecrawlBusyThread) recrawlbt).setQuery(post.get("recrawlquerytext"),inclerrdoc);
((RecrawlBusyThread) recrawlbt).setQuery(recrawlQuery, inclerrdoc);
} else {
((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
}
@ -145,6 +170,8 @@ public class IndexReIndexMonitor_p {
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
} else {
prop.put("recrawljobrunning", 0);
prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery);
prop.put("recrawljobrunning_includefailedurls", inclerrdoc);
}
// return rewrite properties

@ -2094,8 +2094,6 @@ Documents are added to the crawler only if no other crawls are active==Dokumente
and are added in small chunks.==und wird in kleinen Blöcken verarbeitet.
"start recrawl job now"=="Starte Re-Crawl-Job jetzt"
"stop recrawl job"=="Beende Re-Crawl-Job"
to re-crawl documents with fresh_date_dt before today.==um Dokumente mit fresh_date_dt vor Heute erneut zu crawlen.
after starting the recrawl job you can apply a custom Solr query to select documents to be processed==nach dem Start des Re-Crawl-Jobs kann die Solr Abfrage bearbeitet werden um gewünschte Dokumente zu verarbeiten
Re-Crawl Query Details==Re-Crawl Abfrage Details
Documents to process==Dokumente in Warteschlange
Current Query==Aktuelle Abfrage

@ -5262,10 +5262,7 @@
<source>"stop recrawl job"</source>
</trans-unit>
<trans-unit id="c8a1de3a" xml:space="preserve" approved="no" translate="yes">
<source>to re-crawl documents with fresh_date_dt before today.</source>
</trans-unit>
<trans-unit id="35a2988c" xml:space="preserve" approved="no" translate="yes">
<source>after starting the recrawl job you can apply a custom Solr query to select documents to be processed</source>
<source>to re-crawl documents selected with the given query.</source>
</trans-unit>
<trans-unit id="5c907d7" xml:space="preserve" approved="no" translate="yes">
<source>Re-Crawl Query Details</source>

@ -49,18 +49,40 @@ import org.apache.solr.common.SolrDocumentList;
*/
public class RecrawlBusyThread extends AbstractBusyThread {
/** The thread name */
public final static String THREAD_NAME = "recrawlindex";
private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
/** The default selection query */
public static final String DEFAULT_QUERY = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]";
/** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */
public static final boolean DEFAULT_INCLUDE_FAILED = false;
/** The current query selecting documents to recrawl */
private String currentQuery;
/** flag if docs with httpstatus_i <> 200 shall be recrawled */
private boolean includefailed;
private int chunkstart = 0;
private final int chunksize;
final Switchboard sb;
private final Set<DigestURL> urlstack; // buffer of urls to recrawl
/** buffer of urls to recrawl */
private final Set<DigestURL> urlstack;
public long urlsfound = 0;
private String solrSortBy;
public RecrawlBusyThread(Switchboard xsb) {
/**
* @param xsb
* the Switchboard instance holding server environment
* @param query
* the Solr selection query
* @param includeFailed
* set to true when documents with an HTTP status different from 200
* (success) must be included
*/
public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed) {
super(3000, 1000); // set lower limits of cycle delay
setName(THREAD_NAME);
this.setIdleSleep(10*60000); // set actual cycle delays
@ -68,6 +90,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
this.setPriority(Thread.MIN_PRIORITY);
this.sb = xsb;
this.currentQuery = query;
this.includefailed = includeFailed;
urlstack = new HashSet<DigestURL>();
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
@ -91,6 +115,19 @@ public class RecrawlBusyThread extends AbstractBusyThread {
return this.currentQuery;
}
/**
 * Builds the effective Solr selection query, optionally restricting the
 * base query to documents that were fetched with a successful HTTP status.
 *
 * @param queryBase
 *            the base query
 * @param includeFailed
 *            set to true when documents with an HTTP status different from
 *            200 (success) must be included
 * @return the Solr selection query for candidate URLs to recrawl
 */
public static final String buildSelectionQuery(final String queryBase, final boolean includeFailed) {
    if (includeFailed) {
        // no additional filtering : failed documents are candidates too
        return queryBase;
    }
    // restrict to documents loaded with HTTP 200 (success)
    return queryBase + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
}
/**
* Flag to include failed urls (httpstatus_i <> 200)
* if true -> currentQuery is used as is,
@ -174,7 +211,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
if (!solrConnector.isClosed()) {
try {
// query all or only httpstatus=200 depending on includefailed flag
docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
this.urlsfound = docList.getNumFound();
} catch (Throwable e) {

Loading…
Cancel
Save