enhance recrawl job

- allow modifying the query that selects the documents to process (after the job has started)
- allow including failed urls (httpstatus <> 200)
pull/8/head
reger 10 years ago
parent e0a23c56c7
commit 72f6a0b0b2

@ -61,19 +61,47 @@
<p>Searches the local index and selects documents to add to the crawler (recrawl the document). <p>Searches the local index and selects documents to add to the crawler (recrawl the document).
This runs transparent as background job. Documents are added to the crawler only if no other crawls are active This runs transparent as background job. Documents are added to the crawler only if no other crawls are active
and are added in small chunks.</p> and are added in small chunks.</p>
<form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8"> <form action="IndexReIndexMonitor_p.html?setup=recrawljob" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<table><tr valign="top"><td>
<fieldset> <fieldset>
#(recrawljobrunning)# #(recrawljobrunning)#
<input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/> <input type="submit" name="recrawlnow" value="start recrawl job now" class="btn btn-primary"/>
to re-crawl documents with fresh_date_dt before today. to re-crawl documents with fresh_date_dt before today.
<p></p>
<p><small>after starting the recrawl job you can apply a custom Solr query to select documents to be processed</small></p>
::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/> ::<input type="submit" name="stoprecrawl" value="stop recrawl job" class="btn btn-danger"/>
<table>
<tr>
<td>Documents to process</td> <td>#[docCount]#</td> <td> with fresh_date_dt before today</td>
</tr>
</table>
#(/recrawljobrunning)# #(/recrawljobrunning)#
</fieldset> </fieldset>
</td>
<td>
#(recrawljobrunning)#::
<fieldset><legend>Re-Crawl Query Details</legend>
<table>
<tr>
<td>Documents to process</td><td>#[docCount]#</td>
</tr>
<tr>
<td>Current Query</td><td>#[recrawlquerytext]#</td>
</tr>
<tr>
<td>&nbsp;</td><td> </td>
</tr>
<tr>
<td>&nbsp;</td><td> </td>
</tr>
<tr>
<td>Edit Solr Query</td><td><input type="text" name="recrawlquerytext" size="40" value="#[recrawlquerytext]#" /><input type="submit" name="updquery" value="update" class="btn btn-sm btn-default"/></td>
</tr>
<tr>
<td>include failed urls</td><td><input type="checkbox" name="includefailedurls" onchange="this.form.submit()" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></td>
</tr>
</table>
</fieldset>
#(/recrawljobrunning)#
</td>
</tr></table>
</form> </form>
#%env/templates/footer.template%# #%env/templates/footer.template%#
</body> </body>

@ -90,27 +90,46 @@ public class IndexReIndexMonitor_p {
// recrawl job handling // recrawl job handling
BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
if (recrawlbt == null) {
if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
null,
new RecrawlBusyThread(Switchboard.getSwitchboard()),
1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
}
}
if (recrawlbt != null) { // to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only
if (post != null && post.containsKey("stoprecrawl")) { if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread
sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
prop.put("recrawljobrunning",0);
if (recrawlbt == null) {
if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME,
"ReCrawl",
"recrawl existing documents",
null,
new RecrawlBusyThread(Switchboard.getSwitchboard()),
1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
}
} else { } else {
prop.put("recrawljobrunning", 1); if (post.containsKey("stoprecrawl")) {
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound); sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
prop.put("recrawljobrunning", 0);
}
}
boolean inclerrdoc = false;
if (post.containsKey("includefailedurls")) {
inclerrdoc = post.getBoolean("includefailedurls");
} }
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
((RecrawlBusyThread) recrawlbt).setQuery(post.get("recrawlquerytext"),inclerrdoc);
} else {
((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
}
}
}
// just post status of recrawlThread
if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { // provide status
prop.put("recrawljobrunning", 1);
prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
} else { } else {
prop.put("recrawljobrunning", 0); prop.put("recrawljobrunning", 0);
} }

@ -49,7 +49,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
public final static String THREAD_NAME = "recrawlindex"; public final static String THREAD_NAME = "recrawlindex";
public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
private int chunkstart = 0; private int chunkstart = 0;
private int chunksize = 200; private int chunksize = 200;
final Switchboard sb; final Switchboard sb;
@ -66,8 +67,39 @@ public class RecrawlBusyThread extends AbstractBusyThread {
urlstack = new HashSet<DigestURL>(); urlstack = new HashSet<DigestURL>();
} }
/**
 * Sets the query that selects the documents to recrawl and resets
 * {@code chunkstart} to 0, so the next query loop starts with the first chunk.
 * <p>
 * A {@code null} or blank query is ignored and the current query is kept:
 * storing it would produce a malformed Solr query once the http status
 * restriction is appended. The include-failed flag is applied in either case.
 *
 * @param q select query; if {@code null} or blank the current query stays in effect
 * @param includefailedurls true = documents are recrawled regardless of their http status,
 *                          false = only documents with httpstatus=200 are recrawled
 */
public void setQuery(String q, boolean includefailedurls) {
    this.includefailed = includefailedurls;
    if (q == null || q.trim().isEmpty()) {
        // nothing usable to select documents with, keep the current query and position
        return;
    }
    this.currentQuery = q;
    this.chunkstart = 0;
}
/**
 * @return the query currently used to select the documents to recrawl
 */
public String getQuery() {
    return currentQuery;
}
/**
 * Switches the recrawl of failed urls (documents whose httpstatus_i is not 200) on or off.
 * <ul>
 * <li>true: currentQuery is used as is</li>
 * <li>false: the term " AND (httpstatus_i:200)" is appended to currentQuery</li>
 * </ul>
 *
 * @param includefailedurls true to include documents with any http status,
 *                          false to restrict the selection to httpstatus_i:200
 */
public void setIncludeFailed(boolean includefailedurls) {
    includefailed = includefailedurls;
}
/**
 * @return true if documents with a http status other than 200 are included in the recrawl
 */
public boolean getIncludeFailed() {
    return includefailed;
}
/** /**
* feed urls to the local crawler * feed urls to the local crawler
(Switchboard.addToCrawler() is not used here, as existing urls are always skipped there)
* *
* @return true if urls were added/accepted to the crawler * @return true if urls were added/accepted to the crawler
*/ */
@ -81,7 +113,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
for (DigestURL url : this.urlstack) { for (DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true); final Request request = sb.loader.request(url, true, true);
String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0); String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (acceptedError == null) { if (!includefailed && acceptedError == null) { // skip check if failed docs to be included
acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile); acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
} }
if (acceptedError != null) { if (acceptedError != null) {
@ -134,8 +166,9 @@ public class RecrawlBusyThread extends AbstractBusyThread {
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (!solrConnector.isClosed()) { if (!solrConnector.isClosed()) {
try { try {
docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)", // query all or only httpstatus=200 depending on includefailed flag
CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName()); docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
CollectionSchema.load_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
this.urlsfound = docList.getNumFound(); this.urlsfound = docList.getNumFound();
} catch (Throwable e) { } catch (Throwable e) {
this.urlsfound = 0; this.urlsfound = 0;

Loading…
Cancel
Save