Made it possible to customize the selection query before launching a recrawl

pull/154/head
luccioman 7 years ago
parent fab6e54fec
commit 421728d25a

@ -68,10 +68,24 @@
<fieldset>
#(recrawljobrunning)#
<div class="form-group">
<label>Solr query <input type="text" name="recrawlquerytext" size="40" value="#[recrawlquerytext]#" /></label>
<input type="submit" name="simulateRecrawl" value="Simulate" class="btn btn-default" title="Check only how many documents would be selected for recrawl"/>
</div>
#(simulationResult)#
::
<div class="alert alert-success" role="alert">#[docCount]# documents selected for recrawl.</div>
::
<div class="alert alert-danger" role="alert">An error occurred when trying to run the selection query.</div>
::
<div class="alert alert-danger" role="alert">The Solr index is not connected. Please restart your peer.</div>
#(/simulationResult)#
<div class="form-group">
<label>Include failed URLs <input type="checkbox" name="includefailedurls" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></label>
</div>
<input type="submit" name="recrawlDefaults" value="Set defaults" class="btn btn-default" title="Reset to default values"/>
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents with fresh_date_dt before today.
<p></p>
<p><small>after starting the recrawl job you can apply a custom Solr query to select documents to be processed</small></p>
to re-crawl documents selected with the given query.
::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
#(/recrawljobrunning)#

@ -17,9 +17,14 @@
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.workflow.BusyThread;
import java.io.IOException;
import net.yacy.migration;
import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.data.TransactionManager;
@ -101,21 +106,46 @@ public class IndexReIndexMonitor_p {
// recrawl job handling
BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
String recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
boolean inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
// to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only
if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread
/* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post);
if(post.containsKey("recrawlquerytext")) {
recrawlQuery = post.get("recrawlquerytext");
}
if (post.containsKey("includefailedurls")) {
inclerrdoc = post.getBoolean("includefailedurls");
}
if (recrawlbt == null) {
prop.put("recrawljobrunning_simulationResult", 0);
if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
null,
new RecrawlBusyThread(Switchboard.getSwitchboard()),
1000);
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
} else if(post.containsKey("simulateRecrawl") && sb.index.fulltext().connectedLocalSolr()) {
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) {
try {
// query all or only httpstatus=200 depending on includefailed flag
final long count = solrConnector.getCountByQuery(RecrawlBusyThread.buildSelectionQuery(recrawlQuery, inclerrdoc));
prop.put("recrawljobrunning_simulationResult", 1);
prop.put("recrawljobrunning_simulationResult_docCount", count);
} catch (final IOException e) {
prop.put("recrawljobrunning_simulationResult", 2);
ConcurrentLog.logException(e);
}
} else {
prop.put("recrawljobrunning_simulationResult", 3);
}
} else if(post.containsKey("recrawlDefaults")) {
recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
}
} else {
if (post.containsKey("stoprecrawl")) {
@ -124,14 +154,9 @@ public class IndexReIndexMonitor_p {
}
}
boolean inclerrdoc = false;
if (post.containsKey("includefailedurls")) {
inclerrdoc = post.getBoolean("includefailedurls");
}
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
((RecrawlBusyThread) recrawlbt).setQuery(post.get("recrawlquerytext"),inclerrdoc);
((RecrawlBusyThread) recrawlbt).setQuery(recrawlQuery, inclerrdoc);
} else {
((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
}
@ -145,6 +170,8 @@ public class IndexReIndexMonitor_p {
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
} else {
prop.put("recrawljobrunning", 0);
prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery);
prop.put("recrawljobrunning_includefailedurls", inclerrdoc);
}
// return rewrite properties

@ -2094,8 +2094,6 @@ Documents are added to the crawler only if no other crawls are active==Dokumente
and are added in small chunks.==und wird in kleinen Blöcken verarbeitet.
"start recrawl job now"=="Starte Re-Crawl-Job jetzt"
"stop recrawl job"=="Beende Re-Crawl-Job"
to re-crawl documents with fresh_date_dt before today.==um Dokumente mit fresh_date_dt vor Heute erneut zu crawlen.
after starting the recrawl job you can apply a custom Solr query to select documents to be processed==nach dem Start des Re-Crawl-Jobs kann die Solr Abfrage bearbeitet werden um gewünschte Dokumente zu verarbeiten
Re-Crawl Query Details==Re-Crawl Abfrage Details
Documents to process==Dokumente in Warteschlange
Current Query==Aktuelle Abfrage

@ -5262,10 +5262,7 @@
<source>"stop recrawl job"</source>
</trans-unit>
<trans-unit id="c8a1de3a" xml:space="preserve" approved="no" translate="yes">
<source>to re-crawl documents with fresh_date_dt before today.</source>
</trans-unit>
<trans-unit id="35a2988c" xml:space="preserve" approved="no" translate="yes">
<source>after starting the recrawl job you can apply a custom Solr query to select documents to be processed</source>
<source>to re-crawl documents selected with the given query.</source>
</trans-unit>
<trans-unit id="5c907d7" xml:space="preserve" approved="no" translate="yes">
<source>Re-Crawl Query Details</source>

@ -49,18 +49,40 @@ import org.apache.solr.common.SolrDocumentList;
*/
public class RecrawlBusyThread extends AbstractBusyThread {
/** The thread name */
public final static String THREAD_NAME = "recrawlindex";
private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
/** The default selection query */
public static final String DEFAULT_QUERY = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]";
/** Default value for inclusion or not of documents with an HTTP status different from 200 (success) */
public static final boolean DEFAULT_INCLUDE_FAILED = false;
/** The current query selecting documents to recrawl */
private String currentQuery;
/** flag if docs with httpstatus_i <> 200 shall be recrawled */
private boolean includefailed;
private int chunkstart = 0;
private final int chunksize;
final Switchboard sb;
private final Set<DigestURL> urlstack; // buffer of urls to recrawl
/** buffer of urls to recrawl */
private final Set<DigestURL> urlstack;
public long urlsfound = 0;
private String solrSortBy;
public RecrawlBusyThread(Switchboard xsb) {
/**
* @param xsb
* the Switchboard instance holding server environment
* @param query
* the Solr selection query
* @param includeFailed
* set to true when documents with an HTTP status different from 200
* (success) must be included
*/
public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed) {
super(3000, 1000); // set lower limits of cycle delay
setName(THREAD_NAME);
this.setIdleSleep(10*60000); // set actual cycle delays
@ -68,6 +90,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
this.setPriority(Thread.MIN_PRIORITY);
this.sb = xsb;
this.currentQuery = query;
this.includefailed = includeFailed;
urlstack = new HashSet<DigestURL>();
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
@ -91,6 +115,19 @@ public class RecrawlBusyThread extends AbstractBusyThread {
return this.currentQuery;
}
/**
 * Builds the effective Solr selection query, optionally restricting the
 * base query to documents that were fetched with a successful HTTP status.
 *
 * @param queryBase
 *            the base query
 * @param includeFailed
 *            set to true when documents with an HTTP status different from
 *            200 (success) must be included
 * @return the Solr selection query for candidate URLs to recrawl
 */
public static final String buildSelectionQuery(final String queryBase, final boolean includeFailed) {
    if (includeFailed) {
        // no additional filtering : failed documents are candidates too
        return queryBase;
    }
    // restrict to documents loaded with HTTP 200 (success)
    return queryBase + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
}
/**
* Flag to include failed urls (httpstatus_i <> 200)
* if true -> currentQuery is used as is,
@ -174,7 +211,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
if (!solrConnector.isClosed()) {
try {
// query all or only httpstatus=200 depending on includefailed flag
docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
docList = solrConnector.getDocumentListByQuery(RecrawlBusyThread.buildSelectionQuery(this.currentQuery, this.includefailed),
this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
this.urlsfound = docList.getNumFound();
} catch (Throwable e) {

Loading…
Cancel
Save