Added more details to the recrawl job report

pull/154/head
luccioman 7 years ago
parent d95d393a0d
commit 4e03335625

@ -132,12 +132,17 @@
<h3 class="panel-title">#(jobStatus)#::::Last #(/jobStatus)#Re-Crawl job report</h3> <h3 class="panel-title">#(jobStatus)#::::Last #(/jobStatus)#Re-Crawl job report</h3>
</div> </div>
<div class="panel-body"> <div class="panel-body">
#(error)#::<div class="alert alert-danger" role="alert">The job terminated early due to an error when requesting the Solr index.</div>#(/error)#
<table class="table"> <table class="table">
<tbody> <tbody>
<tr> <tr>
<th scope="row">Status</th> <th scope="row">Status</th>
<td>#(jobStatus)#Running::Shutdown in progress::Terminated#(/jobStatus)#</td> <td>#(jobStatus)#Running::Shutdown in progress::Terminated#(/jobStatus)#</td>
</tr> </tr>
<tr>
<th scope="row">Query</th>
<td>#[recrawlquerytext]#</td>
</tr>
<tr> <tr>
<th scope="row">Start time</th> <th scope="row">Start time</th>
<td>#[startTime]#</td> <td>#[startTime]#</td>
@ -147,8 +152,16 @@
<td>#[endTime]#</td> <td>#[endTime]#</td>
</tr> </tr>
<tr> <tr>
<th scope="row">Count</th> <th scope="row" title="URLs added to the crawler queue for recrawl">Recrawled URLs</th>
<td>#[recrawledUrlsCount]# URLs added to the crawler queue for recrawl</td> <td>#[recrawledUrlsCount]#</td>
</tr>
<tr>
<th scope="row" title="URLs rejected for some reason by the crawl stacker or the crawler queue. Please check the logs for more details.">Rejected URLs</th>
<td>#[rejectedUrlsCount]#</td>
</tr>
<tr>
<th scope="row">Malformed URLs</th>
<td title="#[malformedUrlsDeletedCount]# deleted from the index">#[malformedUrlsCount]#</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>

@ -257,6 +257,24 @@ public class IndexReIndexMonitor_p {
final serverObjects prop, final RecrawlBusyThread recrawlbt) { final serverObjects prop, final RecrawlBusyThread recrawlbt) {
if (recrawlbt != null) { if (recrawlbt != null) {
prop.put("recrawlReport", 1); prop.put("recrawlReport", 1);
prop.put("recrawlReport_error", recrawlbt.isTerminatedBySolrFailure());
int jobStatus;
if(recrawlbt.isAlive()) {
if(recrawlbt.shutdownInProgress()) {
jobStatus = 1; // Shutdown in progress
} else {
jobStatus = 0; // Running
}
} else {
jobStatus = 2; // Terminated
}
prop.put("recrawlReport_jobStatus", jobStatus);
prop.put("recrawlReport_recrawlquerytext", recrawlbt.getQuery());
Locale formatLocale; Locale formatLocale;
if (sb != null) { if (sb != null) {
String lng = sb.getConfig("locale.language", Locale.ENGLISH.getLanguage()); String lng = sb.getConfig("locale.language", Locale.ENGLISH.getLanguage());
@ -272,20 +290,12 @@ public class IndexReIndexMonitor_p {
} }
final DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.MEDIUM) final DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.MEDIUM)
.withLocale(formatLocale); .withLocale(formatLocale);
int jobStatus;
if(recrawlbt.isAlive()) {
if(recrawlbt.shutdownInProgress()) {
jobStatus = 1; // Shutdown in progress
} else {
jobStatus = 0; // Running
}
} else {
jobStatus = 2; // Terminated
}
prop.put("recrawlReport_jobStatus", jobStatus);
prop.put("recrawlReport_startTime", formatDateTime(formatter, recrawlbt.getStartTime())); prop.put("recrawlReport_startTime", formatDateTime(formatter, recrawlbt.getStartTime()));
prop.put("recrawlReport_endTime", formatDateTime(formatter, recrawlbt.getEndTime())); prop.put("recrawlReport_endTime", formatDateTime(formatter, recrawlbt.getEndTime()));
prop.put("recrawlReport_recrawledUrlsCount", recrawlbt.getRecrawledUrlsCount()); prop.put("recrawlReport_recrawledUrlsCount", recrawlbt.getRecrawledUrlsCount());
prop.put("recrawlReport_rejectedUrlsCount", recrawlbt.getRejectedUrlsCount());
prop.put("recrawlReport_malformedUrlsCount", recrawlbt.getMalformedUrlsCount());
prop.put("recrawlReport_malformedUrlsDeletedCount", recrawlbt.getMalformedUrlsDeletedCount());
} else { } else {
prop.put("recrawlReport", 0); prop.put("recrawlReport", 0);
} }

@ -80,11 +80,23 @@ public class RecrawlBusyThread extends AbstractBusyThread {
/** Total number of URLs added to the crawler queue for recrawl */ /** Total number of URLs added to the crawler queue for recrawl */
private long recrawledUrlsCount = 0; private long recrawledUrlsCount = 0;
/** Total number of URLs rejected for some reason by the crawl stacker or the crawler queue */
private long rejectedUrlsCount = 0;
/** Total number of malformed URLs found */
private long malformedUrlsCount = 0;
/** Total number of malformed URLs deleted from index */
private long malformedUrlsDeletedCount = 0;
private String solrSortBy; private String solrSortBy;
/** Set to true when more URLs are still to be processed */ /** Set to true when more URLs are still to be processed */
private boolean moreToRecrawl = true; private boolean moreToRecrawl = true;
/** True when the job terminated early because an error occurred when requesting the Solr index, or the Solr index was closed */
private boolean terminatedBySolrFailure = false;
/** The recrawl job start time */ /** The recrawl job start time */
private LocalDateTime startTime; private LocalDateTime startTime;
@ -173,13 +185,14 @@ public class RecrawlBusyThread extends AbstractBusyThread {
if (!this.urlstack.isEmpty()) { if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile; final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
for (DigestURL url : this.urlstack) { for (final DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true); final Request request = sb.loader.request(url, true, true);
String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0); String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (!includefailed && acceptedError == null) { // skip check if failed docs to be included if (!includefailed && acceptedError == null) { // skip check if failed docs to be included
acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile); acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
} }
if (acceptedError != null) { if (acceptedError != null) {
this.rejectedUrlsCount++;
ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
continue; continue;
} }
@ -187,6 +200,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots); s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots);
if (s != null) { if (s != null) {
this.rejectedUrlsCount++;
ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s); ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
} else { } else {
added++; added++;
@ -248,9 +262,10 @@ public class RecrawlBusyThread extends AbstractBusyThread {
return true; return true;
} }
SolrDocumentList docList = null; SolrDocumentList docList = null;
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (solrConnector.isClosed()) { if (solrConnector.isClosed()) {
this.urlsToRecrawl = 0; this.urlsToRecrawl = 0;
this.terminatedBySolrFailure = true;
return false; return false;
} }
@ -261,17 +276,20 @@ public class RecrawlBusyThread extends AbstractBusyThread {
this.urlsToRecrawl = docList.getNumFound(); this.urlsToRecrawl = docList.getNumFound();
} catch (final Throwable e) { } catch (final Throwable e) {
this.urlsToRecrawl = 0; this.urlsToRecrawl = 0;
this.terminatedBySolrFailure = true;
} }
if (docList != null) { if (docList != null) {
for (final SolrDocument doc : docList) { for (final SolrDocument doc : docList) {
try { try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()))); this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
} catch (MalformedURLException ex) { } catch (final MalformedURLException ex) {
this.malformedUrlsCount++;
try { // if index entry hasn't a valid url (useless), delete it try { // if index entry hasn't a valid url (useless), delete it
solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
this.malformedUrlsDeletedCount++;
ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
} catch (IOException ex1) { } catch (final IOException ex1) {
ConcurrentLog.severe(THREAD_NAME, ex1.getMessage()); ConcurrentLog.severe(THREAD_NAME, ex1.getMessage());
} }
} }
@ -297,12 +315,42 @@ public class RecrawlBusyThread extends AbstractBusyThread {
return this.urlsToRecrawl; return this.urlsToRecrawl;
} }
/** /**
* @return The total number of URLs added to the crawler queue for recrawl * @return The total number of URLs added to the crawler queue for recrawl
*/ */
public long getRecrawledUrlsCount() { public long getRecrawledUrlsCount() {
return this.recrawledUrlsCount; return this.recrawledUrlsCount;
} }
/**
 * Reports how many URLs this job could not hand over for recrawl so far. The
 * counter is incremented when a crawl stacker acceptance check returns an
 * error, and when pushing the request to the crawler queue returns an error.
 * NOTE(review): the backing field is a plain long that appears to be read by
 * the monitoring servlet while the job is running - confirm visibility is
 * acceptable.
 *
 * @return the running total of rejected URLs
 */
public long getRejectedUrlsCount() {
    return rejectedUrlsCount;
}
/**
 * Reports how many index documents carried a URL that could not be parsed.
 * The counter is incremented each time building a URL from a Solr document
 * throws a MalformedURLException, whether or not the document is then
 * successfully deleted from the index.
 * NOTE(review): the backing field is a plain long that appears to be read by
 * the monitoring servlet while the job is running - confirm visibility is
 * acceptable.
 *
 * @return the running total of malformed URLs encountered
 */
public long getMalformedUrlsCount() {
    return malformedUrlsCount;
}
/**
 * Reports how many index documents with a malformed URL were removed from
 * the index. The counter is incremented only after the delete request on the
 * Solr connector returned without an IOException, so it never exceeds the
 * number of malformed URLs found.
 * NOTE(review): the backing field is a plain long that appears to be read by
 * the monitoring servlet while the job is running - confirm visibility is
 * acceptable.
 *
 * @return the running total of malformed URLs deleted from the index
 */
public long getMalformedUrlsDeletedCount() {
    return malformedUrlsDeletedCount;
}
/**
 * Tells whether the job stopped before completion because of the Solr index.
 * The flag is raised when the default Solr connector is found closed, and
 * when the request for documents to recrawl throws.
 * NOTE(review): the backing field is a plain boolean that appears to be read
 * by the monitoring servlet while the job is running - confirm visibility is
 * acceptable.
 *
 * @return true when the job terminated early because an error occurred when
 *         requesting the Solr index, or the Solr index was closed
 */
public boolean isTerminatedBySolrFailure() {
    return terminatedBySolrFailure;
}
/** @return The recrawl job start time */ /** @return The recrawl job start time */
public LocalDateTime getStartTime() { public LocalDateTime getStartTime() {

Loading…
Cancel
Save