diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html
index cb5a179f4..455f75053 100644
--- a/htroot/IndexReIndexMonitor_p.html
+++ b/htroot/IndexReIndexMonitor_p.html
@@ -17,7 +17,7 @@
       Documents in current queue
       #[querysize]#
-      #(reindexjobrunning)#::#(/reindexjobrunning)#
+      #(reindexjobrunning)#::#(/reindexjobrunning)#
       Documents processed
@@ -37,7 +37,7 @@
     #(reindexjobrunning)#
-      ::
+      ::
     #(/reindexjobrunning)#

#[infomessage]#

@@ -57,6 +57,24 @@
     #(/reindexjobrunning)#
+
+  Re-Crawl Index Documents
+
+    Searches the local index and selects documents to add to the crawler (recrawl the document).
+    This runs transparently as a background job. Documents are added to the crawler only if no
+    other crawls are active, and they are added in small chunks.
+
+    #(recrawljobrunning)#
+      to re-crawl documents with fresh_date_dt before today.
+    ::
+      Documents to process #[docCount]# with fresh_date_dt before today
+    #(/recrawljobrunning)#
+
 #%env/templates/footer.template%#
diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java
index e5689fc20..beaaf32c1 100644
--- a/htroot/IndexReIndexMonitor_p.java
+++ b/htroot/IndexReIndexMonitor_p.java
@@ -21,6 +21,7 @@
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.sorting.OrderedScoreMap;
 import net.yacy.kelondro.workflow.BusyThread;
 import net.yacy.migration;
+import net.yacy.crawler.RecrawlBusyThread;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.ReindexSolrBusyThread;
@@ -36,26 +37,26 @@ public class IndexReIndexMonitor_p {
         prop.put("docsprocessed", "0");
         prop.put("currentselectquery","");
-        BusyThread bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME);
-        if (bt == null) {
+        BusyThread reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME);
+        if (reidxbt == null) {
             if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) {
                 migration.reindexToschema(sb);
                 prop.put("querysize", "0");
                 prop.put("infomessage","reindex job started");
-                bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts
+                reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); // get newly created job for following posts
             }
         }
-        if (bt != null) {
+        if (reidxbt != null) {
             prop.put("reindexjobrunning", 1);
-            prop.put("querysize", bt.getJobCount());
+            prop.put("querysize", reidxbt.getJobCount());
-            if (bt instanceof ReindexSolrBusyThread) {
-                prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed());
-                prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery());
+            if (reidxbt instanceof ReindexSolrBusyThread) {
+                prop.put("docsprocessed", ((ReindexSolrBusyThread) reidxbt).getProcessed());
+                prop.put("currentselectquery","q="+((ReindexSolrBusyThread) reidxbt).getCurrentQuery());
                 // prepare list of fields in queue
-                final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) bt).getQueryList();
+                final OrderedScoreMap<String> querylist = ((ReindexSolrBusyThread) reidxbt).getQueryList();
                 if (querylist != null) {
                     int i = 0;
                     for (String oneqs : querylist) { // just use fieldname from query (fieldname:[* TO *])
@@ -86,6 +87,34 @@ public class IndexReIndexMonitor_p {
                 prop.putHTML("infomessage", "! reindex works only with embedded Solr index !");
             }
         }
+
+        // recrawl job handling
+        BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
+        if (recrawlbt == null) {
+            if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) {
+                sb.deployThread(RecrawlBusyThread.THREAD_NAME,
+                        "ReCrawl",
+                        "recrawl existing documents",
+                        null,
+                        new RecrawlBusyThread(Switchboard.getSwitchboard()),
+                        1000);
+                recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);
+            }
+        }
+
+        if (recrawlbt != null) {
+            if (post != null && post.containsKey("stoprecrawl")) {
+                sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
+                prop.put("recrawljobrunning", 0);
+            } else {
+                prop.put("recrawljobrunning", 1);
+                prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound);
+            }
+        } else {
+            prop.put("recrawljobrunning", 0);
+        }
+
         // return rewrite properties
         return prop;
     }
diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index ee5a8415b..5bb3c068f 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -519,7 +519,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolURL> {
         for (Map.Entry<String, String> element: getAttributes().entrySet()) {
             qtmp.append('&');
-            qtmp.append(element.getKey());
+            qtmp.append(escape(element.getKey()));
             qtmp.append('=');
             qtmp.append(escape(element.getValue()));
         }
@@ -1007,6 +1007,11 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolURL> {
     public Map<String, String> getAttributes() {
         Map<String, String> map = new LinkedHashMap<>();
         if (this.searchpart == null) return map;
@@ -1016,7 +1021,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolURL> {
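The escape() fix above is easy to demonstrate outside of YaCy. The following standalone sketch uses the JDK's URLEncoder/URLDecoder as stand-ins for MultiProtocolURL's own escape()/unescape() (class and variable names are invented for the illustration): a parameter name that itself contains '=' or '&' corrupts the reassembled query string unless the key is escaped as well.

// Standalone illustration (JDK only, not YaCy code): why the key must be escaped too.
import java.net.URLDecoder;
import java.net.URLEncoder;

public class QueryKeyEscapeDemo {
    public static void main(String[] args) throws Exception {
        String key = "a=b&c"; // parameter name containing reserved characters
        String value = "x y";
        // key unescaped: "a=b&c=x+y" re-parses as two parameters with wrong boundaries
        System.out.println(key + "=" + URLEncoder.encode(value, "UTF-8"));
        // key and value escaped: the pair stays intact, the round trip restores the name
        String pair = URLEncoder.encode(key, "UTF-8") + "=" + URLEncoder.encode(value, "UTF-8");
        System.out.println(pair); // a%3Db%26c=x+y
        System.out.println(URLDecoder.decode(pair.substring(0, pair.indexOf('=')), "UTF-8")); // a=b&c
    }
}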
diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java
new file mode 100644
--- /dev/null
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
+package net.yacy.crawler;
+
+import java.net.MalformedURLException;
+import java.util.HashSet;
+import java.util.Set;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.federate.solr.connector.SolrConnector;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.data.NoticedURL;
+import net.yacy.crawler.retrieval.Request;
+import net.yacy.kelondro.workflow.AbstractBusyThread;
+import net.yacy.search.Switchboard;
+import net.yacy.search.schema.CollectionSchema;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+
+/**
+ * Selects documents by a query from the local index
+ * and feeds the found urls to the crawler to recrawl the documents.
+ * This is intended to keep the index up-to-date.
+ * Currently the documents are selected by an expired fresh_date_dt field
+ * and added to the crawler in smaller chunks (see chunksize) as long as no other crawl is running.
+ */
+public class RecrawlBusyThread extends AbstractBusyThread {
+
+    public final static String THREAD_NAME = "recrawlindex";
+
+    public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName() + ":[* TO NOW/DAY-1DAY]"; // current query
+    private int chunkstart = 0;
+    private int chunksize = 200;
+    final Switchboard sb;
+    private Set<DigestURL> urlstack; // buffer of urls to recrawl
+    public long urlsfound = 0;
+
+    public RecrawlBusyThread(Switchboard xsb) {
+        super(3000, 1000); // set lower limits of cycle delay
+        this.setIdleSleep(10 * 60000); // set actual cycle delays
+        this.setBusySleep(2 * 60000);
+        this.setPriority(Thread.MIN_PRIORITY);
+
+        this.sb = xsb;
+        urlstack = new HashSet<DigestURL>();
+    }
+
+    /**
+     * Feed urls to the local crawler.
+     *
+     * @return true if urls were added to / accepted by the crawler
+     */
+    private boolean feedToCrawler() {
+
+        int added = 0;
+
+        if (!this.urlstack.isEmpty()) {
+            final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
+
+            for (DigestURL url : this.urlstack) {
+                final Request request = sb.loader.request(url, true, true);
+                String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
+                if (acceptedError == null) {
+                    acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
+                }
+                if (acceptedError != null) {
+                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
+                    continue;
+                }
+                final String s;
+                s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots);
+
+                if (s != null) {
+                    ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
+                } else {
+                    added++;
+                }
+            }
+            this.urlstack.clear();
+        }
+        return (added > 0);
+    }
+
+    /**
+     * Process the query and hand over urls to the crawler.
+     *
+     * @return true if something was processed
+     */
+    @Override
+    public boolean job() {
+        // other crawls are running, do nothing
+        if (sb.crawlQueues.coreCrawlJobSize() > 0) {
+            return false;
+        }
+
+        if (this.urlstack.isEmpty()) {
+            return processSingleQuery();
+        } else {
+            return feedToCrawler();
+        }
+    }
+
+    /**
+     * Selects documents and fills the url stack with the found urls.
+     *
+     * @return true if the query has more results
+     */
+    private boolean processSingleQuery() {
+        if (!this.urlstack.isEmpty()) {
+            return true;
+        }
+        SolrDocumentList docList = null;
+        SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
+        if (!solrConnector.isClosed()) {
+            try {
+                docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
+                        CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
+                this.urlsfound = docList.getNumFound();
+            } catch (Throwable e) {
+                this.urlsfound = 0;
+            }
+        } else {
+            this.urlsfound = 0;
+        }
+
+        if (docList != null) {
+            for (SolrDocument doc : docList) {
+                try {
+                    this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
+                } catch (MalformedURLException ex) {
+                }
+            }
+            this.chunkstart = this.chunkstart + this.chunksize;
+        }
+
+        if (this.urlsfound <= this.chunkstart) {
+            this.chunkstart = 0;
+            return false;
+            // TODO: add a stop condition
+        }
+        return true;
+    }
+
+    @Override
+    public int getJobCount() {
+        return this.urlstack.size();
+    }
+
+    @Override
+    public void freemem() {
+        this.urlstack.clear();
+    }
+
+}
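For readers following the selection logic: processSingleQuery() pages through the Solr result set with a start offset (chunkstart) and a row count (chunksize) and stops once the offset passes the total hit count reported by the index. A plain-Java sketch of just that arithmetic, with an invented hit count of 450 (not YaCy code):

// Plain-Java sketch of the chunked paging in processSingleQuery() (illustration only).
public class ChunkPagingDemo {
    public static void main(String[] args) {
        final int chunksize = 200;  // same default as RecrawlBusyThread
        int chunkstart = 0;
        final long urlsfound = 450; // assumed total hit count from the index
        boolean more = true;
        while (more) {
            long rows = Math.min(chunksize, urlsfound - chunkstart);
            System.out.println("fetch rows " + chunkstart + ".." + (chunkstart + rows - 1));
            chunkstart += chunksize;     // advance the paging offset
            if (urlsfound <= chunkstart) {
                chunkstart = 0;          // reset for the next run, as in the patch
                more = false;
            }
        }
    }
}

In the real job, recrawled documents receive a fresh fresh_date_dt and drop out of the query over time, shifting the paging window; that is presumably why the patch still carries a TODO for a better stop condition.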
diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java
index 9a27e737f..cf0b683e7 100644
--- a/source/net/yacy/crawler/retrieval/FileLoader.java
+++ b/source/net/yacy/crawler/retrieval/FileLoader.java
@@ -60,7 +60,7 @@ public class FileLoader {
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
         DigestURL url = request.url();
-        if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol());
+        if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
diff --git a/source/net/yacy/kelondro/workflow/AbstractBusyThread.java b/source/net/yacy/kelondro/workflow/AbstractBusyThread.java
index de15a0d80..898bcd435 100644
--- a/source/net/yacy/kelondro/workflow/AbstractBusyThread.java
+++ b/source/net/yacy/kelondro/workflow/AbstractBusyThread.java
@@ -252,7 +252,7 @@ public abstract class AbstractBusyThread extends AbstractThread implements BusyThread {
                 // do a clean-up
                 this.freemem();
                 // sleep a while
-                ratz(this.idlePause + 1000*(outofmemoryCycles++));
+                ratz(this.idlePause + 1000*(outofmemoryCycles++ % 0x0F)); // limit extra sleep time (outofmemoryCycles can grow big over time)
                 idletime += System.currentTimeMillis() - timestamp;
             }
         }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index aa0655195..56308ebb2 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -3294,7 +3294,7 @@ public final class Switchboard extends serverSwitch {
             if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile);
             if (acceptedError != null) {
                 this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
-                return;
+                continue;
             }
             final String s;
             if (asglobal) {
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 0084e4f13..362de281e 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -1888,7 +1888,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
         List<String> a = new ArrayList<String>(dimension);
         for (int i = 0; i < dimension; i++) a.add("http");
         if (iplist == null) return a;
-        for (Object ip: iplist) a.set(Integer.parseInt(((String) ip).substring(0, 3)), ((String) ip).substring(4));
+        for (Object ip : iplist) {
+            // entry format is 001-https, but the numeric prefix can also have 4 digits, e.g. 1011-https
+            int i = ((String) ip).indexOf('-');
+            a.set(Integer.parseInt(((String) ip).substring(0, i)), ((String) ip).substring(i + 1));
+        }
         return a;
     }
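The CollectionConfiguration change replaces fixed-width parsing (substring(0, 3) / substring(4)) with a delimiter search, so numeric prefixes longer than three digits no longer break. A minimal standalone version of the new parsing, with invented sample entries:

// Standalone sketch of the delimiter-based prefix parsing (sample entries invented).
public class IpFormatParseDemo {
    public static void main(String[] args) {
        String[] iplist = {"001-https", "1011-https"};
        for (String ip : iplist) {
            int i = ip.indexOf('-');                          // delimiter position, not a fixed offset
            int index = Integer.parseInt(ip.substring(0, i)); // old code used substring(0, 3)
            String protocol = ip.substring(i + 1);            // old code used substring(4)
            System.out.println(index + " -> " + protocol);    // 1 -> https, 1011 -> https
        }
    }
}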
diff --git a/test/net/yacy/cora/document/id/MultiProtocolURLTest.java b/test/net/yacy/cora/document/id/MultiProtocolURLTest.java
index 790aaf64d..ad4555c69 100644
--- a/test/net/yacy/cora/document/id/MultiProtocolURLTest.java
+++ b/test/net/yacy/cora/document/id/MultiProtocolURLTest.java
@@ -4,6 +4,7 @@ import static org.junit.Assert.*;
 import java.net.MalformedURLException;
 import java.util.LinkedHashMap;
+import java.util.Map;
 import java.util.TreeSet;
 import org.junit.Test;
@@ -158,7 +159,7 @@ public class MultiProtocolURLTest {
         for (String[] testString : testStrings) {
             // desired conversion result
-            System.out.print("orig uri: " + testString[0]);
+            System.out.print("toNormalform orig uri: " + testString[0]);
             String shouldBe = testString[1];
             // conversion result
             String resultUrl = new MultiProtocolURL(testString[0]).toNormalform(true);
@@ -167,6 +168,30 @@ public class MultiProtocolURLTest {
             System.out.println(" -> " + resultUrl);
         }
     }
+
+    /**
+     * Test of getAttributes method, of class MultiProtocolURL.
+     */
+    @Test
+    public void testGetAttribute() throws Exception {
+        // some test url/uri with problems in the past
+        String[][] testStrings = new String[][]{
+            // teststring, expected attribute name
+            new String[]{"http://yacy.net?&test", "test"}
+        };
+
+        for (String[] testString : testStrings) {
+            System.out.print("test getAttribute: " + testString[0]);
+            String shouldBe = testString[1];
+
+            MultiProtocolURL resultUrl = new MultiProtocolURL(testString[0]);
+            Map<String, String> attr = resultUrl.getAttributes();
+
+            assertEquals("", attr.get(shouldBe));
+            System.out.println(" -> " + resultUrl.toNormalform(false));
+        }
+    }
 }
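The new testGetAttribute pins down the parsing behavior for "http://yacy.net?&test": the empty element produced by the leading '&' is skipped, and a key without '=' maps to the empty string. A self-contained sketch of that expected behavior (an illustration of the semantics the test asserts, not the MultiProtocolURL implementation):

// Minimal stand-in for the query parsing the test expects (not the real implementation).
import java.util.LinkedHashMap;
import java.util.Map;

public class QueryAttributesDemo {
    static Map<String, String> parse(String searchpart) {
        Map<String, String> map = new LinkedHashMap<>();
        if (searchpart == null) return map;
        for (String element : searchpart.split("&")) {
            if (element.isEmpty()) continue;          // skip the empty element before "test"
            int p = element.indexOf('=');
            if (p >= 0) map.put(element.substring(0, p), element.substring(p + 1));
            else map.put(element, "");                // key only, empty value
        }
        return map;
    }

    public static void main(String[] args) {
        System.out.println(parse("&test")); // {test=}
    }
}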