enhance recrawl job

- allow modifying the query that selects documents to process (after the job has started)
- allow including failed urls (httpstatus <> 200)
pull/8/head
reger 10 years ago
parent e0a23c56c7
commit 72f6a0b0b2

@ -61,19 +61,47 @@
<p>Searches the local index and selects documents to add to the crawler (recrawl the document).
This runs transparently as a background job. Documents are added to the crawler only if no other crawls are active
and are added in small chunks.</p>
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<form action="IndexReIndexMonitor_p.html?setup=recrawljob" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<table><tr valign="top"><td>
<fieldset>
#(recrawljobrunning)#
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents with fresh_date_dt before today.
<p></p>
<p><small>after starting the recrawl job you can apply a custom Solr query to select documents to be processed</small></p>
::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
#(/recrawljobrunning)#
</fieldset>
</td>
<td>
#(recrawljobrunning)#::
<fieldset><legend>Re-Crawl Query Details</legend>
<table>
<tr>
<td>Documents to process</td> <td>#[docCount]#</td> <td> with fresh_date_dt before today</td>
<td>Documents to process</td><td>#[docCount]#</td>
</tr>
<tr>
<td>Current Query</td><td>#[recrawlquerytext]#</td>
</tr>
<tr>
<td>&nbsp;</td><td> </td>
</tr>
<tr>
<td>&nbsp;</td><td> </td>
</tr>
<tr>
<td>Edit Solr Query</td><td><input type="text" name="recrawlquerytext" size="40" value="#[recrawlquerytext]#" /><input type="submit" name="updquery" value="update" class="btn btn-sm btn-default"/></td>
</tr>
<tr>
<td>include failed urls</td><td><input type="checkbox" name="includefailedurls" onchange="this.form.submit()" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></td>
</tr>
</table>
#(/recrawljobrunning)#
</fieldset>
#(/recrawljobrunning)#
</td>
</tr></table>
</form>
#%env/templates/footer.template%#
</body>

@ -90,8 +90,12 @@ public class IndexReIndexMonitor_p {
// recrawl job handling
BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
// to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only
if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread
if (recrawlbt == null) {
if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
@ -100,17 +104,32 @@ public class IndexReIndexMonitor_p {
1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
}
}
if (recrawlbt != null) {
if (post != null && post.containsKey("stoprecrawl")) {
} else {
if (post.containsKey("stoprecrawl")) {
sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
prop.put("recrawljobrunning", 0);
}
}
boolean inclerrdoc = false;
if (post.containsKey("includefailedurls")) {
inclerrdoc = post.getBoolean("includefailedurls");
}
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
((RecrawlBusyThread) recrawlbt).setQuery(post.get("recrawlquerytext"),inclerrdoc);
} else {
((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
}
}
}
// just post status of recrawlThread
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { // provide status
prop.put("recrawljobrunning", 1);
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
}
prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
} else {
prop.put("recrawljobrunning", 0);
}

@ -49,7 +49,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
public final static String THREAD_NAME = "recrawlindex";
public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
private int chunkstart = 0;
private int chunksize = 200;
final Switchboard sb;
@ -66,8 +67,39 @@ public class RecrawlBusyThread extends AbstractBusyThread {
urlstack = new HashSet<DigestURL>();
}
/**
 * Set the Solr query used to select documents for recrawling and
 * restart the query loop from the first result chunk.
 *
 * @param q Solr select query
 * @param includefailedurls true = documents with any http status are recrawled,
 *                          false = only documents with httpstatus 200 are recrawled
 */
public void setQuery(String q, boolean includefailedurls) {
    this.includefailed = includefailedurls;
    this.currentQuery = q;
    this.chunkstart = 0;
}
/**
 * @return the Solr query currently used to select documents for recrawling
 */
public String getQuery() {
    return currentQuery;
}
/**
 * Toggle inclusion of failed urls (httpstatus_i <> 200).
 * When true the currentQuery is used as is;
 * when false the term " AND (httpstatus_i:200)" is appended to currentQuery.
 *
 * @param includefailedurls whether failed documents shall be recrawled too
 */
public void setIncludeFailed(final boolean includefailedurls) {
    includefailed = includefailedurls;
}
/**
 * @return true if documents with a failure http status are recrawled as well
 */
public boolean getIncludeFailed() {
    return includefailed;
}
/**
* feed urls to the local crawler
* (Switchboard.addToCrawler() is not used here, as it always skips existing urls)
*
* @return true if urls were added/accepted to the crawler
*/
@ -81,7 +113,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
for (DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true);
String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (acceptedError == null) {
if (!includefailed && acceptedError == null) { // skip check if failed docs to be included
acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
}
if (acceptedError != null) {
@ -134,8 +166,9 @@ public class RecrawlBusyThread extends AbstractBusyThread {
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) {
try {
docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
// query all or only httpstatus=200 depending on includefailed flag
docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
CollectionSchema.load_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
this.urlsfound = docList.getNumFound();
} catch (Throwable e) {
this.urlsfound = 0;

Loading…
Cancel
Save