From d958d1c0c436ac716648e11864c8934f5d3b2617 Mon Sep 17 00:00:00 2001
From: zutto
Date: Sat, 29 Jun 2024 09:33:06 +0300
Subject: [PATCH 1/3] ensure that returned SolrDocument is not null

---
 source/net/yacy/crawler/data/CrawlQueues.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index ea9c3b167..30a5ea93a 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -608,6 +608,9 @@ public class CrawlQueues {
         int i = 0;
         int deepRatio = Integer.parseInt(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_RATIO, "50"));
         for (SolrDocument doc: resp.getResults()) {
+            if (doc == null) {
+                continue;
+            }
             boolean deep = false;
             i++;
             if( i % deepRatio == 0 ){
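
A rough standalone sketch of the loop this hunk hardens, assuming only the SolrJ types visible in the diff (SolrDocumentList, SolrDocument); the class and method names below are invented for illustration. Null entries are skipped before any field access, and every deepRatio-th surviving document is picked for a deep crawl, so a ratio larger than the number of fetched rows selects nothing, which is the situation the UI warning touched by the next commit describes.

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.common.SolrDocumentList;

    public class DeepCrawlSelectionSketch {

        // Collect the documents that would be scheduled for a deep crawl:
        // skip null result entries (as the patch does), then mark every
        // deepRatio-th remaining document. Assumes deepRatio > 0, matching
        // the config default of "50".
        static List<SolrDocument> selectDeepCrawls(final SolrDocumentList results, final int deepRatio) {
            final List<SolrDocument> deepDocs = new ArrayList<>();
            int i = 0;
            for (final SolrDocument doc : results) {
                if (doc == null) {
                    continue; // defensive: never dereference a null entry
                }
                i++;
                if (i % deepRatio == 0) {
                    deepDocs.add(doc); // every Nth document goes deep
                }
            }
            return deepDocs;
        }
    }

For example, with 100 fetched rows and deepRatio = 50 the 50th and 100th documents are selected; with deepRatio = 200 none are, leaving only shallow crawls.
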
From 962aaec0c0956279073c60a4aa5df13b7bfbefed Mon Sep 17 00:00:00 2001
From: zutto
Date: Sat, 29 Jun 2024 09:37:05 +0300
Subject: [PATCH 2/3] Improve the clarity of deep crawl feature UI text on AutoCrawler

---
 htroot/Autocrawl_p.html | 4 ++--
 locales/master.lng.xlf  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/htroot/Autocrawl_p.html b/htroot/Autocrawl_p.html
index db77de051..3202a10d0 100644
--- a/htroot/Autocrawl_p.html
+++ b/htroot/Autocrawl_p.html
@@ -20,7 +20,7 @@
       #(changed)#::
       You need to restart for some settings to be applied
       #(/changed)#
       Enable Autocrawler:
-      Deep crawl every:
+      Deep crawl every Nth document:
       Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.
@@ -47,4 +47,4 @@
-</html>
\ No newline at end of file
+</html>
diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf
index 94e77d2eb..8deee4954 100644
--- a/locales/master.lng.xlf
+++ b/locales/master.lng.xlf
@@ -211,7 +211,7 @@
       Enable Autocrawler:
-      Deep crawl every:
+      Deep crawl every Nth document:
       Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.

From 5268ae2ce93c07cd222909fc0732060cc378184a Mon Sep 17 00:00:00 2001
From: zutto
Date: Sat, 29 Jun 2024 10:11:58 +0300
Subject: [PATCH 3/3] check the document protocol & host values before proceeding to form final url.

---
 source/net/yacy/crawler/data/CrawlQueues.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 30a5ea93a..38c8e8a2c 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -617,6 +617,10 @@ public class CrawlQueues {
                 deep = true;
             }
             DigestURL url;
+            if (doc.getFieldValue("url_protocol_s") == null || doc.getFieldValue("host_s") == null) {
+                //Skip this document if either of these values is null.
+                continue;
+            }
             final String u = doc.getFieldValue("url_protocol_s").toString() + "://" + doc.getFieldValue("host_s").toString();
             try {
                 url = new DigestURL(u);
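
As an illustration of what the guard in the last hunk prevents: getFieldValue(...) returns null for a missing Solr field, so the chained .toString() call would throw a NullPointerException while assembling the seed URL. The helper below is hypothetical and not part of YaCy; only the field names url_protocol_s and host_s and the null check itself come from the patch.

    import java.util.Optional;

    import org.apache.solr.common.SolrDocument;

    public class SeedUrlSketch {

        // Build the "protocol://host" seed string only when both Solr fields
        // are present; an empty Optional stands in for the patch's continue.
        static Optional<String> seedUrl(final SolrDocument doc) {
            final Object protocol = doc.getFieldValue("url_protocol_s");
            final Object host = doc.getFieldValue("host_s");
            if (protocol == null || host == null) {
                return Optional.empty(); // missing data: skip this document
            }
            return Optional.of(protocol.toString() + "://" + host.toString());
        }
    }

A caller would skip the document on an empty result, exactly as the added continue does inside the crawl loop.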