From ace71a8877cfb5310314b038864c4b6725dff6d1 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 16 May 2015 01:23:08 +0200 Subject: [PATCH] Initial (experimental) implementation of index update/re-crawl job added to IndexReIndexMonitor_p.html Selects existing documents from index and feeds them to the crawler. currently only the field fresh_date_dt is used to determine documents for recrawl (fresh_date_dt:[* TO NOW-1DAY]) Documents are added in small chunks (200) to the crawler, only if no other crawl is running. --- htroot/IndexReIndexMonitor_p.html | 22 ++- htroot/IndexReIndexMonitor_p.java | 47 ++++- .../net/yacy/crawler/RecrawlBusyThread.java | 184 ++++++++++++++++++ 3 files changed, 242 insertions(+), 11 deletions(-) create mode 100644 source/net/yacy/crawler/RecrawlBusyThread.java diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html index cb5a179f4..455f75053 100644 --- a/htroot/IndexReIndexMonitor_p.html +++ b/htroot/IndexReIndexMonitor_p.html @@ -17,7 +17,7 @@ Documents in current queue #[querysize]# - #(reindexjobrunning)#::#(/reindexjobrunning)# + #(reindexjobrunning)#::#(/reindexjobrunning)# Documents processed @@ -37,7 +37,7 @@ #(reindexjobrunning)# - :: + :: #(/reindexjobrunning)#

#[infomessage]#

@@ -57,6 +57,24 @@ #(/reindexjobrunning)# +

Re-Crawl Index Documents

+

Searches the local index and selects documents to add to the crawler (recrawl the document). + This runs transparently as a background job. Documents are added to the crawler only if no other crawls are active + and are added in small chunks.

+
+
+ #(recrawljobrunning)# + + to re-crawl documents with fresh_date_dt before today. + :: + + + + +
Documents to process #[docCount]# with fresh_date_dt before today
+ #(/recrawljobrunning)# +
+
#%env/templates/footer.template%# diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java index e5689fc20..beaaf32c1 100644 --- a/htroot/IndexReIndexMonitor_p.java +++ b/htroot/IndexReIndexMonitor_p.java @@ -21,6 +21,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.kelondro.workflow.BusyThread; import net.yacy.migration; +import net.yacy.crawler.RecrawlBusyThread; import net.yacy.search.Switchboard; import net.yacy.search.index.ReindexSolrBusyThread; @@ -36,26 +37,26 @@ public class IndexReIndexMonitor_p { prop.put("docsprocessed", "0"); prop.put("currentselectquery",""); - BusyThread bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); - if (bt == null) { + BusyThread reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); + if (reidxbt == null) { if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) { migration.reindexToschema(sb); prop.put("querysize", "0"); prop.put("infomessage","reindex job started"); - bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts + reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts } } - if (bt != null) { + if (reidxbt != null) { prop.put("reindexjobrunning", 1); - prop.put("querysize", bt.getJobCount()); + prop.put("querysize", reidxbt.getJobCount()); - if (bt instanceof ReindexSolrBusyThread) { - prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed()); - prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery()); + if (reidxbt instanceof ReindexSolrBusyThread) { + prop.put("docsprocessed", ((ReindexSolrBusyThread) reidxbt).getProcessed()); + prop.put("currentselectquery","q="+((ReindexSolrBusyThread) reidxbt).getCurrentQuery()); // prepare list of fields in queue - final OrderedScoreMap querylist = ((ReindexSolrBusyThread) bt).getQueryList(); + final OrderedScoreMap 
querylist = ((ReindexSolrBusyThread) reidxbt).getQueryList(); if (querylist != null) { int i = 0; for (String oneqs : querylist) { // just use fieldname from query (fieldname:[* TO *]) @@ -86,6 +87,34 @@ public class IndexReIndexMonitor_p { prop.putHTML("infomessage", "! reindex works only with embedded Solr index !"); } } + + // recrawl job handling + BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + if (recrawlbt == null) { + if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) { + sb.deployThread(RecrawlBusyThread.THREAD_NAME, + "ReCrawl", + "recrawl existing documents", + null, + new RecrawlBusyThread(Switchboard.getSwitchboard()), + 1000); + recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + } + } + + if (recrawlbt != null) { + if (post != null && post.containsKey("stoprecrawl")) { + sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false); + prop.put("recrawljobrunning",0); + + } else { + prop.put("recrawljobrunning", 1); + prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound); + } + } else { + prop.put("recrawljobrunning", 0); + } + // return rewrite properties return prop; } diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java new file mode 100644 index 000000000..e04d7915b --- /dev/null +++ b/source/net/yacy/crawler/RecrawlBusyThread.java @@ -0,0 +1,184 @@ +/** + * RecrawlBusyThread.java + * Copyright 2015 by Burkhard Buelte + * First released 15.05.2015 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * LICENSE + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . + */ +package net.yacy.crawler; + +import java.net.MalformedURLException; +import java.util.HashSet; +import java.util.Set; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.crawler.data.CrawlProfile; +import net.yacy.crawler.data.NoticedURL; +import net.yacy.crawler.retrieval.Request; +import net.yacy.kelondro.workflow.AbstractBusyThread; +import net.yacy.search.Switchboard; +import net.yacy.search.schema.CollectionSchema; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.CommonParams; + +/** + * Selects documents by a query from the local index + * and feeds the found urls to the crawler to recrawl the documents. + * This is intended to keep the index up-to-date. + * Currently the documents are selected by the expired fresh_date_dt field + * and added to the crawler in smaller chunks (see chunksize) as long as no other crawl is running.
+ */ +public class RecrawlBusyThread extends AbstractBusyThread { + + public final static String THREAD_NAME = "recrawlindex"; + + public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query + private int chunkstart = 0; + private int chunksize = 200; + final Switchboard sb; + private Set urlstack; // buffer of urls to recrawl + public long urlsfound = 0; + + public RecrawlBusyThread(Switchboard xsb) { + super(3000, 1000); // set lower limits of cycle delay + this.setIdleSleep(10*60000); // set actual cycle delays + this.setBusySleep(2*60000); + + this.sb = xsb; + urlstack = new HashSet(); + + } + + /** + * feed urls to the local crawler + * + * @return true if urls were added/accepted to the crawler + */ + private boolean feedToCrawler() { + + int added = 0; + + if (!this.urlstack.isEmpty()) { + final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile; + + for (DigestURL url : this.urlstack) { + final Request request = sb.loader.request(url, true, true); + String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0); + if (acceptedError == null) { + acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile); + } + if (acceptedError != null) { + ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); + continue; + } + final String s; + s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots); + + if (s != null) { + ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s); + } else { + added++; + } + } + this.urlstack.clear(); + } + + if (added > 0) { + return true; + } + return false; + } + + /** + * Process query and hand over urls to the crawler + * + * @return true if something processed + */ + @Override + public boolean job() { + if (sb.crawlQueues.coreCrawlJobSize() > 0) { + return false; + } + + if (this.urlstack.isEmpty()) 
{ + processSingleQuery(); + return true; + } else { + return feedToCrawler(); + } + + } + + /** + * Selects documents to recrawl the urls + */ + private void processSingleQuery() { + if (!this.urlstack.isEmpty()) { + return; + } + SolrDocumentList docList = null; + SolrQuery solrQuery = new SolrQuery(); + solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special + solrQuery.set("sort", CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc"); + solrQuery.set(CommonParams.FL, CollectionSchema.sku.getSolrFieldName()); + solrQuery.set(CommonParams.ROWS, this.chunksize); + solrQuery.set(CommonParams.START, this.chunkstart); + + SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); + if (!solrConnector.isClosed()) { + try { + QueryResponse rsp = solrConnector.getResponseByParams(solrQuery); + docList = rsp.getResults(); + this.urlsfound = docList.getNumFound(); + } catch (Throwable e) { + } + } + + if (docList != null) { + for (SolrDocument doc : docList) { + try { + this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()))); + } catch (MalformedURLException ex) { + } + } + + this.chunkstart = this.chunkstart + urlstack.size(); + + if (docList.getNumFound() <= this.chunkstart) { + this.chunkstart = 0; + } + } + + } + + @Override + public int getJobCount() { + return this.urlstack.size(); + } + + @Override + public void freemem() { + this.urlstack.clear(); + } + +}