From 80785b785e9db4ff0464cd370c6994a381fa59b0 Mon Sep 17 00:00:00 2001
From: sgaebel
Date: Thu, 9 Jul 2020 19:32:16 +0200
Subject: [PATCH] adds deleting during recrawl

---
 htroot/IndexReIndexMonitor_p.html             |  6 +++
 htroot/IndexReIndexMonitor_p.java             | 13 ++++-
 .../net/yacy/crawler/RecrawlBusyThread.java   | 48 ++++++++++++++-----
 3 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html
index bf53426d6..3ef32d53e 100644
--- a/htroot/IndexReIndexMonitor_p.html
+++ b/htroot/IndexReIndexMonitor_p.html
@@ -73,6 +73,9 @@
+        <dt></dt>
+        <dd><input type="checkbox" name="deleteOnRecrawl" id="deleteOnRecrawl" #(deleteOnRecrawl)#::checked="checked"#(/deleteOnRecrawl)# />
+            <label for="deleteOnRecrawl">delete documents on recrawl</label></dd>
         to re-crawl documents selected with the given query.
@@ -91,6 +94,9 @@
+        <dt></dt>
+        <dd><input type="checkbox" name="deleteOnRecrawl" id="deleteOnRecrawl" #(deleteOnRecrawl)#::checked="checked"#(/deleteOnRecrawl)# />
+            <label for="deleteOnRecrawl">delete documents on recrawl</label></dd>
 #(/recrawljobrunning)#
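Note on the template wiring (editor's sketch, not part of the patch): a checked box is
submitted by the browser as "on", and the servlet echoes the state back through the
recrawljobrunning_deleteOnRecrawl property so the #(deleteOnRecrawl)# branch can re-check
the box. A self-contained approximation, with plain Maps standing in for YaCy's
serverObjects; the names post and prop mirror the servlet hunks below, everything else is
illustrative:

    import java.util.HashMap;
    import java.util.Map;

    public class DeleteOnRecrawlFormSketch {
        public static void main(String[] args) {
            // form data as the browser sends it; a checked checkbox defaults to "on"
            Map<String, String> post = new HashMap<>();
            post.put("deleteOnRecrawl", "on");

            boolean deleteOnRecrawl = false; // RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL
            if (post.containsKey("deleteOnRecrawl")) {
                String v = post.get("deleteOnRecrawl");
                deleteOnRecrawl = "on".equals(v) || "true".equals(v) || "1".equals(v);
            }

            // "1" selects the second branch of #(deleteOnRecrawl)#::checked="checked"#(/deleteOnRecrawl)#
            Map<String, String> prop = new HashMap<>();
            prop.put("recrawljobrunning_deleteOnRecrawl", deleteOnRecrawl ? "1" : "0");
            System.out.println(prop);
        }
    }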
diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java
index 71c9eceed..80ed6f0a6 100644
--- a/htroot/IndexReIndexMonitor_p.java
+++ b/htroot/IndexReIndexMonitor_p.java
@@ -123,6 +123,7 @@ public class IndexReIndexMonitor_p {
 
         String recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
         boolean inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
+        boolean deleteOnRecrawl = RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL;
 
         // to signal that a setting shall change the form provides a fixed parameter setup=recrawljob, if not present return status only
         if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to recrawlThread
@@ -136,13 +137,17 @@ public class IndexReIndexMonitor_p {
                 if (post.containsKey("includefailedurls")) {
                     inclerrdoc = post.getBoolean("includefailedurls");
                 }
+
+                if (post.containsKey("deleteOnRecrawl")) {
+                    deleteOnRecrawl = post.getBoolean("deleteOnRecrawl");
+                }
 
                 if (recrawlbt == null || recrawlbt.shutdownInProgress()) {
                     prop.put("recrawljobrunning_simulationResult", 0);
                     prop.put("recrawljobrunning_error", 0);
                     if (post.containsKey("recrawlnow")) {
                         sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
-                                new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000);
+                                new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc, deleteOnRecrawl), 1000);
                         recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
 
                         /* store this call as an api call for easy scheduling possibility */
@@ -192,6 +197,7 @@ public class IndexReIndexMonitor_p {
                 if(post.containsKey("recrawlDefaults")) {
                     recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
                     inclerrdoc = RecrawlBusyThread.DEFAULT_INCLUDE_FAILED;
+                    deleteOnRecrawl = RecrawlBusyThread.DEFAULT_DELETE_ON_RECRAWL;
                 }
             } else {
                 if (post.containsKey("stoprecrawl")) {
@@ -204,9 +210,10 @@ public class IndexReIndexMonitor_p {
 
             if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
                 if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
-                    ((RecrawlBusyThread) recrawlbt).setQuery(recrawlQuery, inclerrdoc);
+                    ((RecrawlBusyThread) recrawlbt).setQuery(recrawlQuery, inclerrdoc, deleteOnRecrawl);
                 } else {
                     ((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
+                    ((RecrawlBusyThread) recrawlbt).setDeleteOnRecrawl(deleteOnRecrawl);
                 }
             }
         }
@@ -219,10 +226,12 @@ public class IndexReIndexMonitor_p {
                 prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).getUrlsToRecrawl());
                 prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
                 prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
+                prop.put("recrawljobrunning_deleteOnRecrawl", ((RecrawlBusyThread) recrawlbt).getDeleteOnRecrawl());
             } else {
                 prop.put("recrawljobrunning", 0);
                 prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery);
                 prop.put("recrawljobrunning_includefailedurls", inclerrdoc);
+                prop.put("recrawljobrunning_deleteOnRecrawl", deleteOnRecrawl);
             }
 
         // return rewrite properties
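Usage sketch (editor's note, not part of the patch): starting a recrawl job with the new
flag set, using only the calls visible in the servlet hunk above; the wrapper class and
method name are illustrative, and a running YaCy Switchboard is assumed:

    import net.yacy.crawler.RecrawlBusyThread;
    import net.yacy.search.Switchboard;

    public class RecrawlDeploySketch {
        // deploy a recrawl job that also deletes the selected documents
        // from the Solr index as they are queued (deleteOnRecrawl = true)
        public static void startDeletingRecrawl(final Switchboard sb) {
            sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
                    new RecrawlBusyThread(Switchboard.getSwitchboard(),
                            RecrawlBusyThread.DEFAULT_QUERY,
                            RecrawlBusyThread.DEFAULT_INCLUDE_FAILED,
                            true /* deleteOnRecrawl */),
                    1000);
        }
    }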
diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java
index c658834ab..3a214a8be 100644
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@@ -26,8 +26,10 @@ package net.yacy.crawler;
 
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.time.LocalDateTime;
+import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.apache.solr.common.SolrDocument;
@@ -44,7 +46,6 @@ import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.workflow.AbstractBusyThread;
 import net.yacy.search.Switchboard;
-import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.schema.CollectionSchema;
 
 /**
@@ -64,6 +65,9 @@ public class RecrawlBusyThread extends AbstractBusyThread {
 
     /** Default value for inclusion or not of documents with a https status different from 200 (success) */
     public static final boolean DEFAULT_INCLUDE_FAILED = false;
+
+    /** Default value for deleting documents from the index as they are queued for recrawl */
+    public static final boolean DEFAULT_DELETE_ON_RECRAWL = false;
 
     /** The current query selecting documents to recrawl */
     private String currentQuery;
@@ -71,6 +75,9 @@ public class RecrawlBusyThread extends AbstractBusyThread {
     /** flag if docs with httpstatus_i <> 200 shall be recrawled */
     private boolean includefailed;
 
+    /** flag whether documents shall be deleted from the index as they are queued for recrawl */
+    private boolean deleteOnRecrawl;
+
     private int chunkstart = 0;
     private final int chunksize = 100;
     private final Switchboard sb;
@@ -116,16 +123,17 @@ public class RecrawlBusyThread extends AbstractBusyThread {
      *            set to true when documents with a https status different from 200
      *            (success) must be included
      */
-    public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed) {
+    public RecrawlBusyThread(final Switchboard xsb, final String query, final boolean includeFailed, final boolean deleteOnRecrawl) {
         super(3000, 1000); // set lower limits of cycle delay
         setName(THREAD_NAME);
         this.setIdleSleep(10*60000); // set actual cycle delays
         this.setBusySleep(2*60000);
         this.setPriority(Thread.MIN_PRIORITY);
-
+        this.setLoadPreReqisite(1);
         this.sb = xsb;
         this.currentQuery = query;
         this.includefailed = includeFailed;
+        this.deleteOnRecrawl = deleteOnRecrawl;
         this.urlstack = new HashSet<>();
         // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
         // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
@@ -143,10 +151,12 @@ public class RecrawlBusyThread extends AbstractBusyThread {
      * and resets the counter to start a fresh query loop
      * @param q select query
      * @param includefailedurls true=all http status docs are recrawled, false=httpstatus=200 docs are recrawled
+     * @param deleteOnRecrawl true=delete documents from the index after they have been added to the crawl stack
      */
-    public void setQuery(String q, boolean includefailedurls) {
+    public void setQuery(String q, boolean includefailedurls, final boolean deleteOnRecrawl) {
         this.currentQuery = q;
         this.includefailed = includefailedurls;
+        this.deleteOnRecrawl = deleteOnRecrawl;
         this.chunkstart = 0;
     }
@@ -180,6 +190,14 @@ public class RecrawlBusyThread extends AbstractBusyThread {
     public boolean getIncludeFailed () {
         return this.includefailed;
     }
+
+    public void setDeleteOnRecrawl(final boolean deleteOnRecrawl) {
+        this.deleteOnRecrawl = deleteOnRecrawl;
+    }
+
+    public boolean getDeleteOnRecrawl() {
+        return this.deleteOnRecrawl;
+    }
 
     /**
      * feed urls to the local crawler
@@ -290,21 +308,27 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         }
 
         if (docList != null) {
+            final List<String> tobedeletedIDs = new ArrayList<>();
             for (final SolrDocument doc : docList) {
                 try {
                     this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
+                    if (deleteOnRecrawl) tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
                 } catch (final MalformedURLException ex) {
                     this.malformedUrlsCount++;
-                    try { // if index entry hasn't a valid url (useless), delete it
-                        solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
-                        this.malformedUrlsDeletedCount++;
-                        ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
-                    } catch (final IOException ex1) {
-                        ConcurrentLog.severe(THREAD_NAME, ex1.getMessage());
-                    }
+                    // if the index entry hasn't a valid url (useless), delete it in the batch below
+                    tobedeletedIDs.add((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
+                    this.malformedUrlsDeletedCount++;
+                    ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
                 }
             }
+
+            if (!tobedeletedIDs.isEmpty()) try {
+                solrConnector.deleteByIds(tobedeletedIDs);
+            } catch (final IOException e) {
+                ConcurrentLog.severe(THREAD_NAME, "error deleting IDs", e);
+            }
+
+            this.chunkstart = deleteOnRecrawl ? 0 : this.chunkstart + this.chunksize;
         }
 
         if (docList == null || docList.size() < this.chunksize) {
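End note (editor's sketch, not part of the patch): the last hunk changes the paging rule.
When deleteOnRecrawl is set, each processed chunk is removed from the index, so the
remaining matches shift down and the next Solr query must start at offset 0 again;
without deletion the window advances by chunksize as before. A self-contained plain-Java
illustration of that rule (all names here are illustrative, no YaCy types involved):

    import java.util.ArrayList;
    import java.util.List;

    public class ChunkOffsetSketch {
        public static void main(String[] args) {
            List<Integer> index = new ArrayList<>();
            for (int i = 0; i < 10; i++) index.add(i); // ten matching "documents"

            final int chunksize = 4;
            final boolean deleteOnRecrawl = true;
            int chunkstart = 0;

            while (!index.isEmpty()) {
                int end = Math.min(chunkstart + chunksize, index.size());
                if (chunkstart >= end) break; // window ran past the remaining matches
                List<Integer> chunk = new ArrayList<>(index.subList(chunkstart, end));
                System.out.println("processing " + chunk);
                if (deleteOnRecrawl) index.removeAll(chunk); // processed docs leave the index
                chunkstart = deleteOnRecrawl ? 0 : chunkstart + chunksize;
            }
        }
    }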