From 4e033356257b0ef969dabcc2a22460712add4a52 Mon Sep 17 00:00:00 2001
From: luccioman
Date: Fri, 12 Jan 2018 11:47:13 +0100
Subject: [PATCH] Added more details to the recrawl job report

---
 htroot/IndexReIndexMonitor_p.html             | 17 ++++-
 htroot/IndexReIndexMonitor_p.java             | 32 ++++++----
 .../net/yacy/crawler/RecrawlBusyThread.java   | 62 ++++++++++++++++---
 3 files changed, 91 insertions(+), 20 deletions(-)

diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html
index 51e15593c..3fea776c5 100644
--- a/htroot/IndexReIndexMonitor_p.html
+++ b/htroot/IndexReIndexMonitor_p.html
@@ -132,12 +132,17 @@

#(jobStatus)#::::Last #(/jobStatus)#Re-Crawl job report

+ #(error)#::#(/error)# + + + + @@ -147,8 +152,16 @@ - - + + + + + + + + + +
Status #(jobStatus)#Running::Shutdown in progress::Terminated#(/jobStatus)#
Query #[recrawlquerytext]#
Start time #[startTime]##[endTime]#
Count #[recrawledUrlsCount]# URLs added to the crawler queue for recrawl
Recrawled URLs #[recrawledUrlsCount]#
Rejected URLs #[rejectedUrlsCount]#
Malformed URLs #[malformedUrlsCount]#
diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java index 9d3acac58..04c3c881b 100644 --- a/htroot/IndexReIndexMonitor_p.java +++ b/htroot/IndexReIndexMonitor_p.java @@ -257,6 +257,24 @@ public class IndexReIndexMonitor_p { final serverObjects prop, final RecrawlBusyThread recrawlbt) { if (recrawlbt != null) { prop.put("recrawlReport", 1); + + prop.put("recrawlReport_error", recrawlbt.isTerminatedBySolrFailure()); + + + int jobStatus; + if(recrawlbt.isAlive()) { + if(recrawlbt.shutdownInProgress()) { + jobStatus = 1; // Shutdown in progress + } else { + jobStatus = 0; // Running + } + } else { + jobStatus = 2; // Terminated + } + prop.put("recrawlReport_jobStatus", jobStatus); + + prop.put("recrawlReport_recrawlquerytext", recrawlbt.getQuery()); + Locale formatLocale; if (sb != null) { String lng = sb.getConfig("locale.language", Locale.ENGLISH.getLanguage()); @@ -272,20 +290,12 @@ public class IndexReIndexMonitor_p { } final DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.MEDIUM) .withLocale(formatLocale); - int jobStatus; - if(recrawlbt.isAlive()) { - if(recrawlbt.shutdownInProgress()) { - jobStatus = 1; // Shutdown in progress - } else { - jobStatus = 0; // Running - } - } else { - jobStatus = 2; // Terminated - } - prop.put("recrawlReport_jobStatus", jobStatus); prop.put("recrawlReport_startTime", formatDateTime(formatter, recrawlbt.getStartTime())); prop.put("recrawlReport_endTime", formatDateTime(formatter, recrawlbt.getEndTime())); prop.put("recrawlReport_recrawledUrlsCount", recrawlbt.getRecrawledUrlsCount()); + prop.put("recrawlReport_rejectedUrlsCount", recrawlbt.getRejectedUrlsCount()); + prop.put("recrawlReport_malformedUrlsCount", recrawlbt.getMalformedUrlsCount()); + prop.put("recrawlReport_malformedUrlsDeletedCount", recrawlbt.getMalformedUrlsDeletedCount()); } else { prop.put("recrawlReport", 0); } diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java 
b/source/net/yacy/crawler/RecrawlBusyThread.java index 9fe6ce5e4..07d6393c8 100644 --- a/source/net/yacy/crawler/RecrawlBusyThread.java +++ b/source/net/yacy/crawler/RecrawlBusyThread.java @@ -80,11 +80,23 @@ public class RecrawlBusyThread extends AbstractBusyThread { /** Total number of URLs added to the crawler queue for recrawl */ private long recrawledUrlsCount = 0; + /** Total number of URLs rejected for some reason by the crawl stacker or the crawler queue */ + private long rejectedUrlsCount = 0; + + /** Total number of malformed URLs found */ + private long malformedUrlsCount = 0; + + /** Total number of malformed URLs deleted from index */ + private long malformedUrlsDeletedCount = 0; + private String solrSortBy; /** Set to true when more URLs are still to be processed */ private boolean moreToRecrawl = true; + /** True when the job terminated early because an error occurred when requesting the Solr index, or the Solr index was closed */ + private boolean terminatedBySolrFailure = false; + /** The recrawl job start time */ private LocalDateTime startTime; @@ -173,13 +185,14 @@ public class RecrawlBusyThread extends AbstractBusyThread { if (!this.urlstack.isEmpty()) { final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile; - for (DigestURL url : this.urlstack) { + for (final DigestURL url : this.urlstack) { final Request request = sb.loader.request(url, true, true); String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0); if (!includefailed && acceptedError == null) { // skip check if failed docs to be included acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile); } if (acceptedError != null) { + this.rejectedUrlsCount++; ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); continue; } @@ -187,6 +200,7 @@ public class RecrawlBusyThread extends AbstractBusyThread { s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, 
profile, sb.robots); if (s != null) { + this.rejectedUrlsCount++; ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s); } else { added++; @@ -248,9 +262,10 @@ public class RecrawlBusyThread extends AbstractBusyThread { return true; } SolrDocumentList docList = null; - SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); + final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); if (solrConnector.isClosed()) { this.urlsToRecrawl = 0; + this.terminatedBySolrFailure = true; return false; } @@ -261,17 +276,20 @@ public class RecrawlBusyThread extends AbstractBusyThread { this.urlsToRecrawl = docList.getNumFound(); } catch (final Throwable e) { this.urlsToRecrawl = 0; + this.terminatedBySolrFailure = true; } if (docList != null) { for (final SolrDocument doc : docList) { try { this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()))); - } catch (MalformedURLException ex) { + } catch (final MalformedURLException ex) { + this.malformedUrlsCount++; try { // if index entry hasn't a valid url (useless), delete it solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + this.malformedUrlsDeletedCount++; ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); - } catch (IOException ex1) { + } catch (final IOException ex1) { ConcurrentLog.severe(THREAD_NAME, ex1.getMessage()); } } @@ -297,12 +315,42 @@ public class RecrawlBusyThread extends AbstractBusyThread { return this.urlsToRecrawl; } - /** - * @return The total number of URLs added to the crawler queue for recrawl - */ + /** + * @return The total number of URLs added to the crawler queue for recrawl + */ public long getRecrawledUrlsCount() { return this.recrawledUrlsCount; } + + /** + * @return The total number of URLs rejected for some reason by the crawl + * 
stacker or the crawler queue + */ + public long getRejectedUrlsCount() { + return this.rejectedUrlsCount; + } + + /** + * @return The total number of malformed URLs found + */ + public long getMalformedUrlsCount() { + return this.malformedUrlsCount; + } + + /** + * @return The total number of malformed URLs deleted from index + */ + public long getMalformedUrlsDeletedCount() { + return this.malformedUrlsDeletedCount; + } + + /** + * @return true when the job terminated early because an error occurred when + * requesting the Solr index, or the Solr index was closed + */ + public boolean isTerminatedBySolrFailure() { + return this.terminatedBySolrFailure; + } /** @return The recrawl job start time */ public LocalDateTime getStartTime() {