diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html index 8ad897c97..dc5c769bf 100644 --- a/htroot/IndexReIndexMonitor_p.html +++ b/htroot/IndexReIndexMonitor_p.html @@ -67,11 +67,25 @@
- #(recrawljobrunning)# + #(recrawljobrunning)# +
+ + +
+ #(simulationResult)# + :: + + :: + + :: + + #(/simulationResult)# +
+ +
+ - to re-crawl documents with fresh_date_dt before today. -

-

after starting the recrawl job you can apply a custom Solr query to select documents to be processed

+ to re-crawl documents selected with the given query. :: #(/recrawljobrunning)# diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java index 2fb5b3057..88d489c5d 100644 --- a/htroot/IndexReIndexMonitor_p.java +++ b/htroot/IndexReIndexMonitor_p.java @@ -17,9 +17,14 @@ * along with this program in the file lgpl21.txt If not, see * . */ +import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.OrderedScoreMap; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.workflow.BusyThread; + +import java.io.IOException; + import net.yacy.migration; import net.yacy.crawler.RecrawlBusyThread; import net.yacy.data.TransactionManager; @@ -101,21 +106,46 @@ public class IndexReIndexMonitor_p { // recrawl job handling BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + String recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY; + boolean inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED; // to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread /* Check the transaction is valid */ TransactionManager.checkPostTransaction(header, post); + + if(post.containsKey("recrawlquerytext")) { + recrawlQuery = post.get("recrawlquerytext"); + } + + if (post.containsKey("includefailedurls")) { + inclerrdoc = post.getBoolean("includefailedurls"); + } if (recrawlbt == null) { + prop.put("recrawljobrunning_simulationResult", 0); if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) { - sb.deployThread(RecrawlBusyThread.THREAD_NAME, - "ReCrawl", - "recrawl existing documents", - null, - new RecrawlBusyThread(Switchboard.getSwitchboard()), - 1000); + sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null, + new 
RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000); recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + } else if(post.containsKey("simulateRecrawl") && sb.index.fulltext().connectedLocalSolr()) { + SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); + if (!solrConnector.isClosed()) { + try { + // query all or only httpstatus=200 depending on includefailed flag + final long count = solrConnector.getCountByQuery(RecrawlBusyThread.buildSelectionQuery(recrawlQuery, inclerrdoc)); + prop.put("recrawljobrunning_simulationResult", 1); + prop.put("recrawljobrunning_simulationResult_docCount", count); + } catch (final IOException e) { + prop.put("recrawljobrunning_simulationResult", 2); + ConcurrentLog.logException(e); + } + } else { + prop.put("recrawljobrunning_simulationResult", 3); + } + } else if(post.containsKey("recrawlDefaults")) { + recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY; + inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED; } } else { if (post.containsKey("stoprecrawl")) { @@ -124,14 +154,9 @@ public class IndexReIndexMonitor_p { } } - boolean inclerrdoc = false; - if (post.containsKey("includefailedurls")) { - inclerrdoc = post.getBoolean("includefailedurls"); - } - if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) { - ((RecrawlBusyThread) recrawlbt).setQuery(post.get("recrawlquerytext"),inclerrdoc); + ((RecrawlBusyThread) recrawlbt).setQuery(recrawlQuery, inclerrdoc); } else { ((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc); } @@ -145,6 +170,8 @@ public class IndexReIndexMonitor_p { prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed()); } else { prop.put("recrawljobrunning", 0); + prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery); + prop.put("recrawljobrunning_includefailedurls", inclerrdoc); } // return rewrite properties diff --git 
a/locales/de.lng b/locales/de.lng index 5eaaa566c..e588479b2 100644 --- a/locales/de.lng +++ b/locales/de.lng @@ -2094,8 +2094,6 @@ Documents are added to the crawler only if no other crawls are active==Dokumente and are added in small chunks.==und wird in kleinen Blöcken verarbeitet. "start recrawl job now"=="Starte Re-Crawl-Job jetzt" "stop recrawl job"=="Beende Re-Crawl-Job" -to re-crawl documents with fresh_date_dt before today.==um Dokumente mit fresh_date_dt vor Heute erneut zu crawlen. -after starting the recrawl job you can apply a custom Solr query to select documents to be processed==nach dem Start des Re-Crawl-Jobs kann die Solr Abfrage bearbeitet werden um gewünschte Dokumente zu verarbeiten Re-Crawl Query Details==Re-Crawl Abfrage Details Documents to process==Dokumente in Warteschlange Current Query==Aktuelle Abfrage diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 8ef63ee46..622178c2d 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -5262,10 +5262,7 @@ "stop recrawl job" - to re-crawl documents with fresh_date_dt before today. - - - after starting the recrawl job you can apply a custom Solr query to select documents to be processed + to re-crawl documents selected with the given query. 
Re-Crawl Query Details
diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java
index 87c317909..465316b76 100644
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@@ -49,18 +49,40 @@ import org.apache.solr.common.SolrDocumentList;
  */
 public class RecrawlBusyThread extends AbstractBusyThread {
 
+    /** The thread name */
     public final static String THREAD_NAME = "recrawlindex";
+
+    /** The default selection query */
+    public static final String DEFAULT_QUERY = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]";
+
+    /** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */
+    public static final boolean DEFAULT_INCLUDE_FAILED = false;
 
-    private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
-    private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
+    /** The current query selecting documents to recrawl */
+    private String currentQuery;
+
+    /** flag if docs with httpstatus_i <> 200 shall be recrawled */
+    private boolean includefailed;
+
     private int chunkstart = 0;
     private final int chunksize;
     final Switchboard sb;
-    private final Set urlstack; // buffer of urls to recrawl
+
+    /** buffer of urls to recrawl */
+    private final Set urlstack;
     public long urlsfound = 0;
     private String solrSortBy;
 
-    public RecrawlBusyThread(Switchboard xsb) {
+    /**
+     * @param xsb
+     *            the Switchboard instance holding server environment
+     * @param query
+     *            the Solr selection query
+     * @param includeFailed
+     *            set to true when documents with an HTTP status different from 200
+     *            (success) must be included
+     */
+    public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed) {
         super(3000, 1000); // set lower limits of cycle delay
         setName(THREAD_NAME);
         this.setIdleSleep(10*60000); // set actual cycle delays
@@ -68,6 +90,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         this.setPriority(Thread.MIN_PRIORITY);
 
         this.sb = xsb;
+        this.currentQuery = query;
+        this.includefailed = includeFailed;
         urlstack = new HashSet();
         // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
         // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
@@ -90,6 +114,26 @@ public class RecrawlBusyThread extends AbstractBusyThread {
     public String getQuery () {
         return this.currentQuery;
     }
+
+    /**
+     * Builds the Solr query selecting the candidate URLs to recrawl.
+     * The base query is wrapped in parentheses before the status filter is
+     * appended, so the filter applies to the whole base query even when it
+     * contains top-level boolean operators such as OR.
+     *
+     * @param queryBase
+     *            the base query
+     * @param includeFailed
+     *            set to true when documents with an HTTP status different from 200
+     *            (success) must be included
+     * @return the Solr selection query for candidate URLs to recrawl
+     */
+    public static final String buildSelectionQuery(final String queryBase, final boolean includeFailed) {
+        if (includeFailed) {
+            return queryBase;
+        }
+        return "(" + queryBase + ") AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
+    }
 
     /**
      * Flag to include failed urls (httpstatus_i <> 200)
@@ -174,7 +218,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         if (!solrConnector.isClosed()) {
             try {
                 // query all or only httpstatus=200 depending on includefailed flag
-                docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
+                docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
                         this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
                 this.urlsfound = docList.getNumFound();
             } catch (Throwable e) {