From 72f6a0b0b2505bbb9f5b87aef6e20f5a27646032 Mon Sep 17 00:00:00 2001
From: reger
Date: Sat, 6 Jun 2015 18:45:39 +0200
Subject: [PATCH] enhance recrawl job

- allow modifying the query that selects documents to process (after the job has started)
- allow including failed urls (httpstatus <> 200)
---
 htroot/IndexReIndexMonitor_p.html             | 44 ++++++++++++---
 htroot/IndexReIndexMonitor_p.java             | 53 +++++++++++++------
 .../net/yacy/crawler/RecrawlBusyThread.java   | 41 ++++++++++++--
 3 files changed, 109 insertions(+), 29 deletions(-)

diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html
index 455f75053..2fecd83bc 100644
--- a/htroot/IndexReIndexMonitor_p.html
+++ b/htroot/IndexReIndexMonitor_p.html
@@ -61,19 +61,47 @@

 Searches the local index and selects documents to add to the crawler (re-crawl the documents).
 This runs transparently as a background job. Documents are added to the crawler only if no other
 crawls are active, and they are added in small chunks.
 
 <form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
+  <input type="hidden" name="setup" value="recrawljob" />
   #(recrawljobrunning)#
   <input type="submit" name="recrawlnow" value="start recrawl job now" /> to re-crawl documents with fresh_date_dt before today.
+  <p class="info">after starting the recrawl job you can apply a custom Solr query to select documents to be processed</p>
   ::
   <input type="submit" name="stoprecrawl" value="stop recrawl job" />
-  Documents to process #[docCount]# with fresh_date_dt before today
   #(/recrawljobrunning)#
 </form>
+
+#(recrawljobrunning)#::
+<fieldset>
+  <legend>Re-Crawl Query Details</legend>
+  <form action="IndexReIndexMonitor_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
+    <input type="hidden" name="setup" value="recrawljob" />
+    <table>
+      <tr><td>Documents to process</td><td>#[docCount]#</td></tr>
+      <tr><td>Current Query</td><td>#[recrawlquerytext]#</td></tr>
+      <tr><td>&nbsp;</td><td>&nbsp;</td></tr>
+      <tr><td>Edit Solr Query</td><td><input type="text" name="recrawlquerytext" value="#[recrawlquerytext]#" size="80" /></td></tr>
+      <tr><td>include failed urls</td><td><input type="checkbox" name="includefailedurls" #(includefailedurls)#::checked="checked"#(/includefailedurls)# /></td></tr>
+      <tr><td>&nbsp;</td><td><input type="submit" name="updquery" value="update query" /></td></tr>
+    </table>
+  </form>
+</fieldset>
+#(/recrawljobrunning)#
 
 #%env/templates/footer.template%#
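Note: the form posts the fixed parameter setup=recrawljob together with recrawlquerytext, includefailedurls and one of the submit buttons (recrawlnow, stoprecrawl, updquery); the servlet change below reads exactly these parameters. As a sketch only, a few hypothetical values one might enter in the new Edit Solr Query field; the field names fresh_date_dt, load_date_dt and httpstatus_i are taken from the patch, the concrete ranges and values are illustrative:

    // hypothetical example queries for the "Edit Solr Query" input (illustrative only)
    String defaultQuery  = "fresh_date_dt:[* TO NOW/DAY-1DAY]"; // the built-in default: last refresh before today
    String olderThanWeek = "load_date_dt:[* TO NOW/DAY-7DAY]";  // documents loaded more than a week ago
    String notFoundOnly  = "httpstatus_i:404";                  // failed documents only; requires "include failed urls" to be checked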
diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java
index beaaf32c1..1022435af 100644
--- a/htroot/IndexReIndexMonitor_p.java
+++ b/htroot/IndexReIndexMonitor_p.java
@@ -90,27 +90,46 @@ public class IndexReIndexMonitor_p {
 
         // recrawl job handling
         BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
-        if (recrawlbt == null) {
-            if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
-                sb.deployThread(RecrawlBusyThread.THREAD_NAME,
-                        "ReCrawl",
-                        "recrawl existing documents",
-                        null,
-                        new RecrawlBusyThread(Switchboard.getSwitchboard()),
-                        1000);
-                recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
-            }
-        }
-
-        if (recrawlbt != null) {
-            if (post != null && post.containsKey("stoprecrawl")) {
-                sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
-                prop.put("recrawljobrunning",0);
+        // to signal a setting change, the form provides the fixed parameter setup=recrawljob; if it is absent, only the status is reported
+        if (post != null && "recrawljob".equals(post.get("setup"))) { // it's a command to the recrawl thread
+            if (recrawlbt == null) {
+                if (post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
+                    sb.deployThread(RecrawlBusyThread.THREAD_NAME,
+                            "ReCrawl",
+                            "recrawl existing documents",
+                            null,
+                            new RecrawlBusyThread(Switchboard.getSwitchboard()),
+                            1000);
+                    recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
+                }
             } else {
-                prop.put("recrawljobrunning", 1);
-                prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
+                if (post.containsKey("stoprecrawl")) {
+                    sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
+                    prop.put("recrawljobrunning", 0);
+                }
             }
+
+            boolean inclerrdoc = false;
+            if (post.containsKey("includefailedurls")) {
+                inclerrdoc = post.getBoolean("includefailedurls");
+            }
+
+            if (recrawlbt != null && !recrawlbt.shutdownInProgress()) {
+                if (post.containsKey("updquery") && post.containsKey("recrawlquerytext")) {
+                    ((RecrawlBusyThread) recrawlbt).setQuery(post.get("recrawlquerytext"), inclerrdoc);
+                } else {
+                    ((RecrawlBusyThread) recrawlbt).setIncludeFailed(inclerrdoc);
+                }
+            }
+        }
+
+        // just report the status of the recrawl thread
+        if (recrawlbt != null && !recrawlbt.shutdownInProgress()) { // provide status
+            prop.put("recrawljobrunning", 1);
+            prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
+            prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
+            prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
         } else {
             prop.put("recrawljobrunning", 0);
         }
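Note: a hypothetical usage sketch (not part of the patch) of the new RecrawlBusyThread setters introduced below, mirroring the servlet code above; "sb" is assumed to be the Switchboard instance, as in IndexReIndexMonitor_p:

    BusyThread bt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
    if (bt != null && !bt.shutdownInProgress()) {
        final RecrawlBusyThread recrawl = (RecrawlBusyThread) bt;
        // replace the selection query; setQuery() also resets chunkstart, so paging restarts at the first result
        recrawl.setQuery(CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO NOW/DAY-7DAY]", false);
        // or keep the current query and only toggle whether failed documents are included
        recrawl.setIncludeFailed(true);
    }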
diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java
index dc0d2e95e..d896abf7f 100644
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@@ -49,7 +49,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
 
     public final static String THREAD_NAME = "recrawlindex";
 
-    public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
+    private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
+    private boolean includefailed = false; // flag whether docs with httpstatus_i <> 200 shall be recrawled too
     private int chunkstart = 0;
     private int chunksize = 200;
     final Switchboard sb;
@@ -66,8 +67,39 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         urlstack = new HashSet<DigestURL>();
     }
 
+    /**
+     * Sets the query used to select documents for recrawling
+     * and resets the chunk counter to start a fresh query loop.
+     *
+     * @param q select query
+     * @param includefailedurls true = documents with any http status are recrawled, false = only documents with httpstatus 200 are recrawled
+     */
+    public void setQuery(String q, boolean includefailedurls) {
+        this.currentQuery = q;
+        this.includefailed = includefailedurls;
+        this.chunkstart = 0;
+    }
+
+    public String getQuery() {
+        return this.currentQuery;
+    }
+
+    /**
+     * Flag to include failed urls (httpstatus_i <> 200).
+     * If true, currentQuery is used as is;
+     * if false, the term " AND (httpstatus_i:200)" is appended to currentQuery.
+     *
+     * @param includefailedurls
+     */
+    public void setIncludeFailed(boolean includefailedurls) {
+        this.includefailed = includefailedurls;
+    }
+
+    public boolean getIncludeFailed() {
+        return this.includefailed;
+    }
+
     /**
      * feed urls to the local crawler
+     * (Switchboard.addToCrawler() is not used here, because existing urls would always be skipped there)
      *
      * @return true if urls were added/accepted to the crawler
      */
@@ -81,7 +113,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         for (DigestURL url : this.urlstack) {
             final Request request = sb.loader.request(url, true, true);
             String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
-            if (acceptedError == null) {
+            if (!includefailed && acceptedError == null) { // skip the initial check if failed docs shall be included
                 acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
             }
             if (acceptedError != null) {
@@ -134,8 +166,9 @@ public class RecrawlBusyThread extends AbstractBusyThread {
         SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
         if (!solrConnector.isClosed()) {
             try {
-                docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
-                        CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
+                // query all docs, or only docs with httpstatus=200, depending on the includefailed flag
+                docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
+                        CollectionSchema.load_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
                 this.urlsfound = docList.getNumFound();
             } catch (Throwable e) {
                 this.urlsfound = 0;
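Note: a condensed sketch (not part of the patch) of the document selection after this change. The query is only narrowed to successfully loaded documents when includefailed is false, and results are paged in chunks sorted by load date; all names below come from the hunks above:

    // effective query as assembled in the last hunk above
    final String effectiveQuery = this.includefailed
            ? this.currentQuery
            : this.currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)";
    docList = solrConnector.getDocumentListByQuery(
            effectiveQuery,
            CollectionSchema.load_date_dt.getSolrFieldName() + " asc", // oldest loads first
            this.chunkstart,  // presumably advanced between rounds elsewhere in the class; reset to 0 by setQuery()
            this.chunksize,   // 200 documents per chunk
            CollectionSchema.sku.getSolrFieldName()); // fetch only the url (sku) field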