From c37dda8849cd2f4a8264bc218c14f7cc0683c840 Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 12 May 2015 01:09:10 +0200 Subject: [PATCH 1/8] fix NPE on MultiProtocolURL on url with parameter value and '=' in getAttribute - added test case for it --- .../cora/document/id/MultiProtocolURL.java | 7 ++++- .../document/id/MultiProtocolURLTest.java | 27 ++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index ee5a8415b..78e470ebf 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -1007,6 +1007,11 @@ public class MultiProtocolURL implements Serializable, Comparable getAttributes() { Map map = new LinkedHashMap<>(); if (this.searchpart == null) return map; @@ -1016,7 +1021,7 @@ public class MultiProtocolURL implements Serializable, Comparable " + resultUrl); } } + + /** + * Test of getAttribute method, of class MultiProtocolURL. 
+ */ + @Test + public void testGetAttribute() throws Exception { + // some test url/uri with problems in the past + String[][] testStrings = new String[][]{ + // teststring , expectedresult + new String[]{"http://yacy.net?&test", "test"} + }; + + for (String[] testString : testStrings) { + // desired conversion result + System.out.print("test getAttribute: " + testString[0]); + String shouldBe = testString[1]; + + MultiProtocolURL resultUrl = new MultiProtocolURL(testString[0]); + Map attr = resultUrl.getAttributes(); + + assertEquals("", attr.get(shouldBe)); + System.out.println(" -> " + resultUrl.toNormalform(false)); + } + } } From 5c67c4d4600dfefe452bd6b26b88baacc2e49483 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 12 May 2015 12:06:21 +0200 Subject: [PATCH 2/8] fix for latest commit, see https://github.com/yacy/yacy_search_server/commit/f810915717579d490259d70610dc4118b7c6e6e9#commitcomment-11145880 --- source/net/yacy/cora/document/id/MultiProtocolURL.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index ee5a8415b..3cb66219b 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -519,7 +519,7 @@ public class MultiProtocolURL implements Serializable, Comparable element: getAttributes().entrySet()) { qtmp.append('&'); - qtmp.append(element.getKey()); + qtmp.append(escape(element.getKey())); qtmp.append('='); qtmp.append(escape(element.getValue())); } From 2bc9cb582878011d928e93b191f9b4f945afc521 Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 13 May 2015 21:58:43 +0200 Subject: [PATCH 3/8] fix early return in addToCrawler check / handle all supplied urls after error url --- source/net/yacy/search/Switchboard.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/search/Switchboard.java 
b/source/net/yacy/search/Switchboard.java index aa0655195..56308ebb2 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3294,7 +3294,7 @@ public final class Switchboard extends serverSwitch { if (acceptedError == null) acceptedError = this.crawlStacker.checkAcceptanceInitially(url, profile); if (acceptedError != null) { this.log.info("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); - return; + continue; } final String s; if (asglobal) { From f3ce99bfb8f054be1eb9860c70a2df1960b6d963 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 14 May 2015 00:03:09 +0200 Subject: [PATCH 4/8] fix extract of inboundlinks_protocol_sxt url counter maybe > 999 --- source/net/yacy/search/schema/CollectionConfiguration.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 0084e4f13..362de281e 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1888,7 +1888,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri List a = new ArrayList(dimension); for (int i = 0; i < dimension; i++) a.add("http"); if (iplist == null) return a; - for (Object ip: iplist) a.set(Integer.parseInt(((String) ip).substring(0, 3)), ((String) ip).substring(4)); + for (Object ip : iplist) { + // ip format is 001-https but can be 4 digits 1011-https + int i = ((String) ip).indexOf('-'); + a.set(Integer.parseInt(((String) ip).substring(0, i)), ((String) ip).substring(i+1)); + } return a; } From 141cd8045636bff269a7268a886833e25b5b2564 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 16 May 2015 00:01:54 +0200 Subject: [PATCH 5/8] correct log msg text --- source/net/yacy/crawler/retrieval/FileLoader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java index 9a27e737f..cf0b683e7 100644 --- a/source/net/yacy/crawler/retrieval/FileLoader.java +++ b/source/net/yacy/crawler/retrieval/FileLoader.java @@ -60,7 +60,7 @@ public class FileLoader { public Response load(final Request request, boolean acceptOnlyParseable) throws IOException { DigestURL url = request.url(); - if (!url.getProtocol().equals("file")) throw new IOException("wrong loader for FileLoader: " + url.getProtocol()); + if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol()); RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) { From ace71a8877cfb5310314b038864c4b6725dff6d1 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 16 May 2015 01:23:08 +0200 Subject: [PATCH 6/8] Initial (experimental) implementation of index update/re-crawl job added to IndexReIndexMonitor_p.html Selects existing documents from index and feeds it to the crawler. currently only the field fresh_date_dt is used determine documents for recrawl (fresh_date_dt:[* TO NOW-1DAY] Documents are added in small chunks (200) to the crawler, only if no other crawl is running. --- htroot/IndexReIndexMonitor_p.html | 22 ++- htroot/IndexReIndexMonitor_p.java | 47 ++++- .../net/yacy/crawler/RecrawlBusyThread.java | 184 ++++++++++++++++++ 3 files changed, 242 insertions(+), 11 deletions(-) create mode 100644 source/net/yacy/crawler/RecrawlBusyThread.java diff --git a/htroot/IndexReIndexMonitor_p.html b/htroot/IndexReIndexMonitor_p.html index cb5a179f4..455f75053 100644 --- a/htroot/IndexReIndexMonitor_p.html +++ b/htroot/IndexReIndexMonitor_p.html @@ -17,7 +17,7 @@ Documents in current queue #[querysize]# - #(reindexjobrunning)#::#(/reindexjobrunning)# + #(reindexjobrunning)#::#(/reindexjobrunning)# Documents processed @@ -37,7 +37,7 @@ #(reindexjobrunning)# - :: + :: #(/reindexjobrunning)#

#[infomessage]#

@@ -57,6 +57,24 @@ #(/reindexjobrunning)# +

Re-Crawl Index Documents

+

Searches the local index and selects documents to add to the crawler (recrawl the document). + This runs transparent as background job. Documents are added to the crawler only if no other crawls are active + and are added in small chunks.

+
+
+ #(recrawljobrunning)# + + to re-crawl documents with fresh_date_dt before today. + :: + + + + +
Documents to process #[docCount]# with fresh_date_dt before today
+ #(/recrawljobrunning)# +
+
#%env/templates/footer.template%# diff --git a/htroot/IndexReIndexMonitor_p.java b/htroot/IndexReIndexMonitor_p.java index e5689fc20..beaaf32c1 100644 --- a/htroot/IndexReIndexMonitor_p.java +++ b/htroot/IndexReIndexMonitor_p.java @@ -21,6 +21,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.OrderedScoreMap; import net.yacy.kelondro.workflow.BusyThread; import net.yacy.migration; +import net.yacy.crawler.RecrawlBusyThread; import net.yacy.search.Switchboard; import net.yacy.search.index.ReindexSolrBusyThread; @@ -36,26 +37,26 @@ public class IndexReIndexMonitor_p { prop.put("docsprocessed", "0"); prop.put("currentselectquery",""); - BusyThread bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); - if (bt == null) { + BusyThread reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); + if (reidxbt == null) { if (post != null && post.containsKey("reindexnow") && sb.index.fulltext().connectedLocalSolr()) { migration.reindexToschema(sb); prop.put("querysize", "0"); prop.put("infomessage","reindex job started"); - bt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts + reidxbt = sb.getThread(ReindexSolrBusyThread.THREAD_NAME); //get new created job for following posts } } - if (bt != null) { + if (reidxbt != null) { prop.put("reindexjobrunning", 1); - prop.put("querysize", bt.getJobCount()); + prop.put("querysize", reidxbt.getJobCount()); - if (bt instanceof ReindexSolrBusyThread) { - prop.put("docsprocessed", ((ReindexSolrBusyThread) bt).getProcessed()); - prop.put("currentselectquery","q="+((ReindexSolrBusyThread) bt).getCurrentQuery()); + if (reidxbt instanceof ReindexSolrBusyThread) { + prop.put("docsprocessed", ((ReindexSolrBusyThread) reidxbt).getProcessed()); + prop.put("currentselectquery","q="+((ReindexSolrBusyThread) reidxbt).getCurrentQuery()); // prepare list of fields in queue - final OrderedScoreMap querylist = ((ReindexSolrBusyThread) bt).getQueryList(); + final OrderedScoreMap 
querylist = ((ReindexSolrBusyThread) reidxbt).getQueryList(); if (querylist != null) { int i = 0; for (String oneqs : querylist) { // just use fieldname from query (fieldname:[* TO *]) @@ -86,6 +87,34 @@ public class IndexReIndexMonitor_p { prop.putHTML("infomessage", "! reindex works only with embedded Solr index !"); } } + + // recrawl job handling + BusyThread recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + if (recrawlbt == null) { + if (post != null && post.containsKey("recrawlnow") && sb.index.fulltext().connectedLocalSolr()) { + sb.deployThread(RecrawlBusyThread.THREAD_NAME, + "ReCrawl", + "recrawl existing documents", + null, + new RecrawlBusyThread(Switchboard.getSwitchboard()), + 1000); + recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME); + } + } + + if (recrawlbt != null) { + if (post != null && post.containsKey("stoprecrawl")) { + sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false); + prop.put("recrawljobrunning",0); + + } else { + prop.put("recrawljobrunning", 1); + prop.put("recrawljobrunning_docCount", ((RecrawlBusyThread) recrawlbt).urlsfound); + } + } else { + prop.put("recrawljobrunning", 0); + } + // return rewrite properties return prop; } diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java new file mode 100644 index 000000000..e04d7915b --- /dev/null +++ b/source/net/yacy/crawler/RecrawlBusyThread.java @@ -0,0 +1,184 @@ +/** + * RecrawlBusyThread.java + * Copyright 2015 by Burkhard Buelte + * First released 15.05.2015 at http://yacy.net + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * LICENSE + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. 
+ * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . + */ +package net.yacy.crawler; + +import java.net.MalformedURLException; +import java.util.HashSet; +import java.util.Set; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.crawler.data.CrawlProfile; +import net.yacy.crawler.data.NoticedURL; +import net.yacy.crawler.retrieval.Request; +import net.yacy.kelondro.workflow.AbstractBusyThread; +import net.yacy.search.Switchboard; +import net.yacy.search.schema.CollectionSchema; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.params.CommonParams; + +/** + * Selects documents by a query from the local index + * and feeds the found urls to the crawler to recrawl the documents. + * This is intended to keep the index up-to-date + * Currently the documents are selected by expired fresh_date_dt field + * and added to the crawler in smaller chunks (see chunksize) as long as no other crawl is running. 
+ */ +public class RecrawlBusyThread extends AbstractBusyThread { + + public final static String THREAD_NAME = "recrawlindex"; + + public String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query + private int chunkstart = 0; + private int chunksize = 200; + final Switchboard sb; + private Set urlstack; // buffer of urls to recrawl + public long urlsfound = 0; + + public RecrawlBusyThread(Switchboard xsb) { + super(3000, 1000); // set lower limits of cycle delay + this.setIdleSleep(10*60000); // set actual cycle delays + this.setBusySleep(2*60000); + + this.sb = xsb; + urlstack = new HashSet(); + + } + + /** + * feed urls to the local crawler + * + * @return true if urls were added/accepted to the crawler + */ + private boolean feedToCrawler() { + + int added = 0; + + if (!this.urlstack.isEmpty()) { + final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile; + + for (DigestURL url : this.urlstack) { + final Request request = sb.loader.request(url, true, true); + String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0); + if (acceptedError == null) { + acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile); + } + if (acceptedError != null) { + ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError); + continue; + } + final String s; + s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots); + + if (s != null) { + ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s); + } else { + added++; + } + } + this.urlstack.clear(); + } + + if (added > 0) { + return true; + } + return false; + } + + /** + * Process query and hand over urls to the crawler + * + * @return true if something processed + */ + @Override + public boolean job() { + if (sb.crawlQueues.coreCrawlJobSize() > 0) { + return false; + } + + if (this.urlstack.isEmpty()) 
{ + processSingleQuery(); + return true; + } else { + return feedToCrawler(); + } + + } + + /** + * Selects documents to recrawl the urls + */ + private void processSingleQuery() { + if (!this.urlstack.isEmpty()) { + return; + } + SolrDocumentList docList = null; + SolrQuery solrQuery = new SolrQuery(); + solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special + solrQuery.set("sort", CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc"); + solrQuery.set(CommonParams.FL, CollectionSchema.sku.getSolrFieldName()); + solrQuery.set(CommonParams.ROWS, this.chunksize); + solrQuery.set(CommonParams.START, this.chunkstart); + + SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); + if (!solrConnector.isClosed()) { + try { + QueryResponse rsp = solrConnector.getResponseByParams(solrQuery); + docList = rsp.getResults(); + this.urlsfound = docList.getNumFound(); + } catch (Throwable e) { + } + } + + if (docList != null) { + for (SolrDocument doc : docList) { + try { + this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()))); + } catch (MalformedURLException ex) { + } + } + + this.chunkstart = this.chunkstart + urlstack.size(); + + if (docList.getNumFound() <= this.chunkstart) { + this.chunkstart = 0; + } + } + + } + + @Override + public int getJobCount() { + return this.urlstack.size(); + } + + @Override + public void freemem() { + this.urlstack.clear(); + } + +} From cd7c0e0aae88fe21fe05e96456d0d83f8184c8be Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 17 May 2015 00:13:00 +0200 Subject: [PATCH 7/8] detail optimization of RecrawlThread --- .../net/yacy/crawler/RecrawlBusyThread.java | 49 ++++++++----------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java index e04d7915b..dc0d2e95e 100644 --- 
a/source/net/yacy/crawler/RecrawlBusyThread.java +++ b/source/net/yacy/crawler/RecrawlBusyThread.java @@ -35,11 +35,8 @@ import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.workflow.AbstractBusyThread; import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionSchema; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.params.CommonParams; /** * Selects documents by a query from the local index @@ -63,10 +60,10 @@ public class RecrawlBusyThread extends AbstractBusyThread { super(3000, 1000); // set lower limits of cycle delay this.setIdleSleep(10*60000); // set actual cycle delays this.setBusySleep(2*60000); + this.setPriority(Thread.MIN_PRIORITY); this.sb = xsb; urlstack = new HashSet(); - } /** @@ -102,11 +99,7 @@ public class RecrawlBusyThread extends AbstractBusyThread { } this.urlstack.clear(); } - - if (added > 0) { - return true; - } - return false; + return (added > 0); } /** @@ -116,13 +109,13 @@ public class RecrawlBusyThread extends AbstractBusyThread { */ @Override public boolean job() { + // other crawls are running, do nothing if (sb.crawlQueues.coreCrawlJobSize() > 0) { return false; } if (this.urlstack.isEmpty()) { - processSingleQuery(); - return true; + return processSingleQuery(); } else { return feedToCrawler(); } @@ -131,27 +124,24 @@ public class RecrawlBusyThread extends AbstractBusyThread { /** * Selects documents to recrawl the urls + * @return true if query has more results */ - private void processSingleQuery() { + private boolean processSingleQuery() { if (!this.urlstack.isEmpty()) { - return; + return true; } SolrDocumentList docList = null; - SolrQuery solrQuery = new SolrQuery(); - solrQuery.set(CommonParams.Q, currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)"); // except this yacy special - 
solrQuery.set("sort", CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc"); - solrQuery.set(CommonParams.FL, CollectionSchema.sku.getSolrFieldName()); - solrQuery.set(CommonParams.ROWS, this.chunksize); - solrQuery.set(CommonParams.START, this.chunkstart); - SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector(); if (!solrConnector.isClosed()) { try { - QueryResponse rsp = solrConnector.getResponseByParams(solrQuery); - docList = rsp.getResults(); + docList = solrConnector.getDocumentListByQuery(currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)", + CollectionSchema.fresh_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName()); this.urlsfound = docList.getNumFound(); } catch (Throwable e) { + this.urlsfound = 0; } + } else { + this.urlsfound =0; } if (docList != null) { @@ -161,14 +151,15 @@ public class RecrawlBusyThread extends AbstractBusyThread { } catch (MalformedURLException ex) { } } - - this.chunkstart = this.chunkstart + urlstack.size(); - - if (docList.getNumFound() <= this.chunkstart) { - this.chunkstart = 0; - } + this.chunkstart = this.chunkstart + this.chunksize; } - + + if (this.urlsfound <= this.chunkstart) { + this.chunkstart = 0; + return false; + // TODO: add a stop condition + } + return true; } @Override From 13f013f64a4bc815759e820fb72c3f85bc268329 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 17 May 2015 06:21:12 +0200 Subject: [PATCH 8/8] Limit extra sleep of BusyThread on LowMemCycle --- source/net/yacy/kelondro/workflow/AbstractBusyThread.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/kelondro/workflow/AbstractBusyThread.java b/source/net/yacy/kelondro/workflow/AbstractBusyThread.java index de15a0d80..898bcd435 100644 --- a/source/net/yacy/kelondro/workflow/AbstractBusyThread.java +++ b/source/net/yacy/kelondro/workflow/AbstractBusyThread.java @@ -252,7 +252,7 @@ public abstract class 
AbstractBusyThread extends AbstractThread implements BusyT // do a clean-up this.freemem(); // sleep a while - ratz(this.idlePause + 1000*(outofmemoryCycles++)); + ratz(this.idlePause + 1000*(outofmemoryCycles++ % 0x0F)); // limit extra sleep time (oomCycles can grow big over time) idletime += System.currentTimeMillis() - timestamp; } }