Added a report info box about the last terminated recrawl job, if any

For easier monitoring of recrawls.
pull/154/head
luccioman 7 years ago
parent b2af25b14f
commit 433e241e4f

@ -62,7 +62,38 @@
<p>Searches the local index and selects documents to add to the crawler (recrawl the document). <p>Searches the local index and selects documents to add to the crawler (recrawl the document).
This runs transparent as background job. Documents are added to the crawler only if no other crawls are active This runs transparent as background job. Documents are added to the crawler only if no other crawls are active
and are added in small chunks.</p> and are added in small chunks.</p>
<form action="IndexReIndexMonitor_p.html?setup=recrawljob" method="post" enctype="multipart/form-data" accept-charset="UTF-8"> <div class="container-fluid">
<div class="row">
#(recrawljobrunning)#
#(recrawlReport)#::
<div class="col-md-10 col-lg-6">
<div class="panel panel-info">
<div class="panel-heading">
<h3 class="panel-title">Last Re-Crawl job report</h3>
</div>
<div class="panel-body">
<table class="table">
<tbody>
<tr>
<th scope="row">Start time</th>
<td>#[startTime]#</td>
</tr>
<tr>
<th scope="row">End time</th>
<td>#[endTime]#</td>
</tr>
<tr>
<th scope="row">Count</th>
<td>#[recrawledUrlsCount]# URLs added to the crawler queue for recrawl</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
#(/recrawlReport)#
#(/recrawljobrunning)#
<form action="IndexReIndexMonitor_p.html?setup=recrawljob" method="post" enctype="multipart/form-data" accept-charset="UTF-8" class="col-md-10 col-lg-6">
<input type="hidden" name="transactionToken" value="#[transactionToken]#" /> <input type="hidden" name="transactionToken" value="#[transactionToken]#" />
<table><tr valign="top"><td> <table><tr valign="top"><td>
<fieldset> <fieldset>
@ -119,6 +150,8 @@
</td> </td>
</tr></table> </tr></table>
</form> </form>
</div>
</div>
#%env/templates/footer.template%# #%env/templates/footer.template%#
</body> </body>
</html> </html>

@ -17,17 +17,21 @@
* along with this program in the file lgpl21.txt If not, see * along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>. * <http://www.gnu.org/licenses/>.
*/ */
import java.io.IOException;
import java.time.DateTimeException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.FormatStyle;
import java.util.Locale;
import net.yacy.migration;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.workflow.BusyThread;
import java.io.IOException;
import net.yacy.migration;
import net.yacy.crawler.RecrawlBusyThread; import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.data.TransactionManager; import net.yacy.data.TransactionManager;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread; import net.yacy.search.index.ReindexSolrBusyThread;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
@ -149,7 +153,9 @@ public class IndexReIndexMonitor_p {
} }
} else { } else {
if (post.containsKey("stoprecrawl")) { if (post.containsKey("stoprecrawl")) {
sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false); /* We do not remove the thread from the Switchboard worker threads using serverSwitch.terminateThread(String,boolean),
* because we want to be able to provide a report after its termination */
recrawlbt.terminate(false);
prop.put("recrawljobrunning", 0); prop.put("recrawljobrunning", 0);
} }
} }
@ -169,7 +175,8 @@ public class IndexReIndexMonitor_p {
prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery()); prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed()); prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
} else { } else {
prop.put("recrawljobrunning", 0); prop.put("recrawljobrunning", 0);
processRecrawlReport(header, sb, prop, (RecrawlBusyThread)recrawlbt);
prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery); prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery);
prop.put("recrawljobrunning_includefailedurls", inclerrdoc); prop.put("recrawljobrunning_includefailedurls", inclerrdoc);
} }
@ -177,4 +184,53 @@ public class IndexReIndexMonitor_p {
// return rewrite properties // return rewrite properties
return prop; return prop;
} }
/**
 * Fills the template properties of the report about the last terminated recrawl job, if there is one.
 * Dates are rendered with a medium style localized formatter. The format locale is the one of the
 * client request when the "locale.language" setting equals "browser", otherwise the locale
 * built from the setting value itself.
 *
 * @param header current request header. Must not be null.
 * @param sb Switchboard instance holding server environment
 * @param prop this template result
 * @param recrawlbt the terminated recrawl thread. When null, the report is flagged as absent.
 */
private static void processRecrawlReport(final RequestHeader header, final Switchboard sb,
        final serverObjects prop, final RecrawlBusyThread recrawlbt) {
    if (recrawlbt == null) {
        /* No recrawl thread to report about */
        prop.put("recrawljobrunning_recrawlReport", 0);
        return;
    }

    prop.put("recrawljobrunning_recrawlReport", 1);

    final String configuredLanguage = sb.getConfig("locale.language", Locale.ENGLISH.getLanguage());
    /* Only use the client locale when locale.language is set to browser */
    final Locale reportLocale = "browser".equals(configuredLanguage) ? header.getLocale()
            : Locale.forLanguageTag(configuredLanguage);
    final DateTimeFormatter reportFormatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.MEDIUM)
            .withLocale(reportLocale);

    final String startLabel = formatDateTime(reportFormatter, recrawlbt.getStartTime());
    prop.put("recrawljobrunning_recrawlReport_startTime", startLabel);
    final String endLabel = formatDateTime(reportFormatter, recrawlbt.getEndTime());
    prop.put("recrawljobrunning_recrawlReport_endTime", endLabel);
    prop.put("recrawljobrunning_recrawlReport_recrawledUrlsCount", recrawlbt.getRecrawledUrlsCount());
}
/**
 * Renders a date/time value as text with the given formatter.
 *
 * @param formatter the formatter to use. Must not be null.
 * @param time the date/time value to format. Can be null.
 * @return the formatted date/time, its ISO-8601 representation when the formatter
 *         can not handle the value, or an empty string when time is null.
 */
protected static String formatDateTime(final DateTimeFormatter formatter, final LocalDateTime time) {
    if (time == null) {
        return "";
    }
    try {
        return time.format(formatter);
    } catch (final DateTimeException e) {
        /* Fallback to ISO-8601 on any formatting failure */
        return time.toString();
    }
}
} }

@ -25,8 +25,13 @@ package net.yacy.crawler;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.time.LocalDateTime;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
@ -37,8 +42,6 @@ import net.yacy.kelondro.workflow.AbstractBusyThread;
import net.yacy.search.Switchboard; import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
/** /**
* Selects documents by a query from the local index * Selects documents by a query from the local index
@ -74,10 +77,19 @@ public class RecrawlBusyThread extends AbstractBusyThread {
/** The total number of candidate URLs found for recrawl */ /** The total number of candidate URLs found for recrawl */
private long urlsToRecrawl = 0; private long urlsToRecrawl = 0;
/** Total number of URLs added to the crawler queue for recrawl */
private long recrawledUrlsCount = 0;
private String solrSortBy; private String solrSortBy;
/** Set to true when more URLs are still to be processed */ /** Set to true when more URLs are still to be processed */
private boolean moreToRecrawl = true; private boolean moreToRecrawl = true;
/** The recrawl job start time */
private LocalDateTime startTime;
/** The recrawl job end time */
private LocalDateTime endTime;
/** /**
* @param xsb * @param xsb
@ -117,7 +129,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
this.chunkstart = 0; this.chunkstart = 0;
} }
public String getQuery () { public String getQuery() {
return this.currentQuery; return this.currentQuery;
} }
@ -178,6 +190,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s); ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
} else { } else {
added++; added++;
this.recrawledUrlsCount++;
} }
} }
this.urlstack.clear(); this.urlstack.clear();
@ -212,7 +225,18 @@ public class RecrawlBusyThread extends AbstractBusyThread {
didSomething = feedToCrawler(); didSomething = feedToCrawler();
} }
return didSomething; return didSomething;
}
/**
 * Records the current local date/time as the recrawl job start time,
 * then delegates to the superclass start().
 */
@Override
public synchronized void start() {
/* The start time is stamped before delegating to the superclass */
this.startTime = LocalDateTime.now();
super.start();
}
/**
 * Delegates to the superclass terminate(), then records the current local
 * date/time as the recrawl job end time.
 * NOTE(review): every call overwrites the end time - confirm whether terminate()
 * can be called more than once on the same instance.
 *
 * @param waitFor passed unchanged to the superclass terminate()
 */
@Override
public void terminate(boolean waitFor) {
super.terminate(waitFor);
/* The end time is stamped once the superclass terminate() call has returned */
this.endTime = LocalDateTime.now();
} }
/** /**
@ -273,6 +297,23 @@ public class RecrawlBusyThread extends AbstractBusyThread {
return this.urlsToRecrawl; return this.urlsToRecrawl;
} }
/**
 * @return The total number of URLs added to the crawler queue for recrawl
 *         since this thread instance was created (the counter starts at zero)
 */
public long getRecrawledUrlsCount() {
return this.recrawledUrlsCount;
}
/**
 * @return The recrawl job start time, as recorded by start(). From the code
 *         visible here it is null as long as start() has not been called.
 */
public LocalDateTime getStartTime() {
return this.startTime;
}
/**
 * @return The recrawl job end time, as recorded by terminate(boolean). From the code
 *         visible here it is null as long as terminate(boolean) has not been called.
 */
public LocalDateTime getEndTime() {
return this.endTime;
}
@Override @Override
public void freemem() { public void freemem() {
this.urlstack.clear(); this.urlstack.clear();

Loading…
Cancel
Save