From 72f6a0b0b2505bbb9f5b87aef6e20f5a27646032 Mon Sep 17 00:00:00 2001
From: reger
Date: Sat, 6 Jun 2015 18:45:39 +0200
Subject: [PATCH] enhance recrawl job

- allow modifying the query that selects documents to process (after the job has started)
- allow including failed urls (httpstatus <> 200)
---
 htroot/IndexReIndexMonitor_p.html             | 44 ++++++++++++---
 htroot/IndexReIndexMonitor_p.java             | 53 +++++++++++++------
 .../net/yacy/crawler/RecrawlBusyThread.java   | 41 ++++++++++++--
 3 files changed, 109 insertions(+), 29 deletions(-)

diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html
index 455f75053..2fecd83bc 100644
--- a/htroot/IndexReIndexMonitor_p.html
+++ b/htroot/IndexReIndexMonitor_p.html
@@ -61,19 +61,47 @@

 Searches the local index and selects documents to add to the crawler (re-crawl the documents).
 This runs transparently as a background job. Documents are added to the crawler only if no other
 crawls are active, and they are added in small chunks.
 
 <form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
+  <input type="hidden" name="setup" value="recrawljob" />
   #(recrawljobrunning)#
   <input type="submit" name="recrawlnow" value="start recrawl job now" /> to re-crawl documents with fresh_date_dt before today.
+  <p class="info">after starting the recrawl job you can apply a custom Solr query to select documents to be processed</p>
   ::
   <input type="submit" name="stoprecrawl" value="stop recrawl job" />
-  Documents to process #[docCount]# with fresh_date_dt before today
   #(/recrawljobrunning)#
 </form>
+
+#(recrawljobrunning)#::
+<fieldset>
+  <legend>Re-Crawl Query Details</legend>
+  <form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
+    <input type="hidden" name="setup" value="recrawljob" />
+    <table>
+      <tr><td>Documents to process</td><td>#[docCount]#</td></tr>
+      <tr><td>Current Query</td><td>#[recrawlquerytext]#</td></tr>
+      <tr><td>&nbsp;</td><td>&nbsp;</td></tr>
+      <tr><td>Edit Solr Query</td><td><input type="text" name="recrawlquerytext" value="#[recrawlquerytext]#" size="80" /></td></tr>
+      <tr><td>include failed urls</td><td><input type="checkbox" name="includefailedurls" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></td></tr>
+      <tr><td>&nbsp;</td><td><input type="submit" name="updquery" value="update query" /></td></tr>
+    </table>
+  </form>
+</fieldset>
+#(/recrawljobrunning)#
 
 #%env/templates/footer.template%#
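Note: the form posts the fixed parameter setup=recrawljob together with recrawlquerytext, includefailedurls and one of the submit buttons (recrawlnow, stoprecrawl, updquery); the servlet change below reads exactly these parameters. As a sketch only, a few hypothetical values one might enter in the new Edit Solr Query field; the field names fresh_date_dt, load_date_dt and httpstatus_i are taken from the patch, the concrete ranges and values are illustrative:

    // hypothetical example queries for the "Edit Solr Query" input (illustrative only)
    String defaultQuery  = "fresh_date_dt:[* TO NOW/DAY-1DAY]"; // the built-in default: last refresh before today
    String olderThanWeek = "load_date_dt:[* TO NOW/DAY-7DAY]";  // documents loaded more than a week ago
    String notFoundOnly  = "httpstatus_i:404";                  // failed documents only; requires "include failed urls" to be checked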
diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java
index beaaf32c1..1022435af 100644
--- a/htroot/IndexReIndexMonitor_p.java
+++ b/htroot/IndexReIndexMonitor_p.java
@@ -90,27 +90,46 @@ public class IndexReIndexMonitor_p {
 
         // recrawl job handling
         BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
-        if (recrawlbt == null) {
-            if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
-                sb.deployThread(RecrawlBusyThread.THREAD_NAME,
-                        "ReCrawl",
-                        "recrawl existing documents",
-                        null,
-                        new RecrawlBusyThread(Switchboard.getSwitchboard()),
-                        1000);
-                recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
-            }
-        }
-
-        if (recrawlbt != null) {
-            if (post != null && post.containsKey("stoprecrawl")) {
-                sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
-                prop.put("recrawljobrunning",0);
+        // to signal a setting change, the form provides the fixed parameter setup=recrawljob; if it is absent, only the status is reported
+        if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to the recrawl thread
+            if (recrawlbt == null) {
+                if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
+                    sb.deployThread(RecrawlBusyThread.THREAD_NAME,
+                            "ReCrawl",
+                            "recrawl existing documents",
+                            null,
+                            new RecrawlBusyThread(Switchboard.getSwitchboard()),
+                            1000);
+                    recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
+                }
             } else {
-                prop.put("recrawljobrunning", 1);
-                prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
+                if (post.containsKey("stoprecrawl")) {
+                    sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
+                    prop.put("recrawljobrunning", 0);
+                }
             }
+
+            boolean inclerrdoc = false;
+            if (post.containsKey("includefailedurls")) {
+                inclerrdoc = post.getBoolean("includefailedurls");
+            }
+
+            if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
+                if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
+                    ((RecrawlBusyThread) recrawlbt).setQuery(post.get("recrawlquerytext"), inclerrdoc);
+                } else {
+                    ((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
+                }
+            }
+        }
+
+        // just report the status of the recrawl thread
+        if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { // provide status
+            prop.put("recrawljobrunning", 1);
+            prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
+            prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
+            prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
         } else {
             prop.put("recrawljobrunning", 0);
         }
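Note: a hypothetical usage sketch (not part of the patch) of the new RecrawlBusyThread setters introduced below, mirroring the servlet code above; "sb" is assumed to be the Switchboard instance, as in IndexReIndexMonitor_p:

    BusyThread bt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
    if (bt != null && !bt.shutdownInProgress()) {
        final RecrawlBusyThread recrawl = (RecrawlBusyThread) bt;
        // replace the selection query; setQuery() also resets chunkstart, so paging restarts at the first result
        recrawl.setQuery(CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO NOW/DAY-7DAY]", false);
        // or keep the current query and only toggle whether failed documents are included
        recrawl.setIncludeFailed(true);
    }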
diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java
index dc0d2e95e..d896abf7f 100644
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@@ -49,7 +49,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
 
     public final static String THREAD_NAME = "recrawlindex";
 
-    public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
+    private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
+    private boolean includefailed = false; // flag whether docs with httpstatus_i <> 200 shall be recrawled too
     private int chunkstart = 0;
     private int chunksize = 200;
     final Switchboard sb;
@@ -66,8 +67,39 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         urlstack = new HashSet<DigestURL>();
     }
 
+    /**
+     * Sets the query used to select documents for recrawling
+     * and resets the chunk counter to start a fresh query loop.
+     *
+     * @param q select query
+     * @param includefailedurls true = documents with any http status are recrawled, false = only documents with httpstatus 200 are recrawled
+     */
+    public void setQuery(String q, boolean includefailedurls) {
+        this.currentQuery = q;
+        this.includefailed = includefailedurls;
+        this.chunkstart = 0;
+    }
+
+    public String getQuery() {
+        return this.currentQuery;
+    }
+
+    /**
+     * Flag to include failed urls (httpstatus_i <> 200).
+     * If true, currentQuery is used as is;
+     * if false, the term " AND (httpstatus_i:200)" is appended to currentQuery.
+     *
+     * @param includefailedurls
+     */
+    public void setIncludeFailed(boolean includefailedurls) {
+        this.includefailed = includefailedurls;
+    }
+
+    public boolean getIncludeFailed() {
+        return this.includefailed;
+    }
+
     /**
      * feed urls to the local crawler
+     * (Switchboard.addToCrawler() is not used here, because existing urls would always be skipped there)
      *
      * @return true if urls were added/accepted to the crawler
      */
@@ -81,7 +113,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         for (DigestURL url : this.urlstack) {
             final Request request = sb.loader.request(url, true, true);
             String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
-            if (acceptedError == null) {
+            if (!includefailed && acceptedError == null) { // skip the initial check if failed docs shall be included
                 acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
             }
             if (acceptedError != null) {
@@ -134,8 +166,9 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
         if (!solrConnector.isClosed()) {
             try {
-                docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
-                        CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
+                // query all docs, or only docs with httpstatus=200, depending on the includefailed flag
+                docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
+                        CollectionSchema.load_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
                 this.urlsfound = docList.getNumFound();
             } catch (Throwable e) {
                 this.urlsfound = 0;
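Note: a condensed sketch (not part of the patch) of the document selection after this change. The query is only narrowed to successfully loaded documents when includefailed is false, and results are paged in chunks sorted by load date; all names below come from the hunks above:

    // effective query as assembled in the last hunk above
    final String effectiveQuery = this.includefailed
            ? this.currentQuery
            : this.currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
    docList = solrConnector.getDocumentListByQuery(
            effectiveQuery,
            CollectionSchema.load_date_dt.getSolrFieldName() + " asc", // oldest loads first
            this.chunkstart,  // presumably advanced between rounds elsewhere in the class; reset to 0 by setQuery()
            this.chunksize,   // 200 documents per chunk
            CollectionSchema.sku.getSolrFieldName()); // fetch only the url (sku) field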