From d958d1c0c436ac716648e11864c8934f5d3b2617 Mon Sep 17 00:00:00 2001
From: zutto
Date: Sat, 29 Jun 2024 09:33:06 +0300
Subject: [PATCH 1/3] ensure that returned SolrDocument is not null

---
 source/net/yacy/crawler/data/CrawlQueues.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index ea9c3b167..30a5ea93a 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -608,6 +608,9 @@ public class CrawlQueues {
         int i = 0;
         int deepRatio = Integer.parseInt(this.sb.getConfig(SwitchboardConstants.AUTOCRAWL_RATIO, "50"));
         for (SolrDocument doc: resp.getResults()) {
+            if (doc == null) {
+                continue;
+            }
             boolean deep = false;
             i++;
             if( i % deepRatio == 0 ){
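
A rough standalone sketch of the loop this hunk hardens, assuming only the SolrJ types visible in the diff (SolrDocumentList, SolrDocument); the class and method names below are invented for illustration. Null entries are skipped before any field access, and every deepRatio-th surviving document is picked for a deep crawl, so a ratio larger than the number of fetched rows selects nothing, which is the situation the UI warning touched by the next commit describes.

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.solr.common.SolrDocument;
    import org.apache.solr.common.SolrDocumentList;

    public class DeepCrawlSelectionSketch {

        // Collect the documents that would be scheduled for a deep crawl:
        // skip null result entries (as the patch does), then mark every
        // deepRatio-th remaining document. Assumes deepRatio > 0, matching
        // the config default of "50".
        static List<SolrDocument> selectDeepCrawls(final SolrDocumentList results, final int deepRatio) {
            final List<SolrDocument> deepDocs = new ArrayList<>();
            int i = 0;
            for (final SolrDocument doc : results) {
                if (doc == null) {
                    continue; // defensive: never dereference a null entry
                }
                i++;
                if (i % deepRatio == 0) {
                    deepDocs.add(doc); // every Nth document goes deep
                }
            }
            return deepDocs;
        }
    }

For example, with 100 fetched rows and deepRatio = 50 the 50th and 100th documents are selected; with deepRatio = 200 none are, leaving only shallow crawls.
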
From 962aaec0c0956279073c60a4aa5df13b7bfbefed Mon Sep 17 00:00:00 2001
From: zutto
Date: Sat, 29 Jun 2024 09:37:05 +0300
Subject: [PATCH 2/3] Improve the clarity of deep crawl feature UI text on AutoCrawler

---
 htroot/Autocrawl_p.html | 4 ++--
 locales/master.lng.xlf  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/htroot/Autocrawl_p.html b/htroot/Autocrawl_p.html
index db77de051..3202a10d0 100644
--- a/htroot/Autocrawl_p.html
+++ b/htroot/Autocrawl_p.html
@@ -20,7 +20,7 @@
       #(changed)#::
       You need to restart for some settings to be applied
       #(/changed)#
       Enable Autocrawler:
-      Deep crawl every:
+      Deep crawl every Nth document:
       Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.
@@ -47,4 +47,4 @@
-</html>
\ No newline at end of file
+</html>
diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf
index 94e77d2eb..8deee4954 100644
--- a/locales/master.lng.xlf
+++ b/locales/master.lng.xlf
@@ -211,7 +211,7 @@
       Enable Autocrawler:
-      Deep crawl every:
+      Deep crawl every Nth document:
       Warning: if this is bigger than "Rows to fetch" only shallow crawls will run.

From 5268ae2ce93c07cd222909fc0732060cc378184a Mon Sep 17 00:00:00 2001
From: zutto
Date: Sat, 29 Jun 2024 10:11:58 +0300
Subject: [PATCH 3/3] check the document protocol & host values before proceeding to form final url.

---
 source/net/yacy/crawler/data/CrawlQueues.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 30a5ea93a..38c8e8a2c 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -617,6 +617,10 @@ public class CrawlQueues {
                 deep = true;
             }
             DigestURL url;
+            if (doc.getFieldValue("url_protocol_s") == null || doc.getFieldValue("host_s") == null) {
+                //Skip this document if either of these values is null.
+                continue;
+            }
             final String u = doc.getFieldValue("url_protocol_s").toString() + "://" + doc.getFieldValue("host_s").toString();
             try {
                 url = new DigestURL(u);
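
As an illustration of what the guard in the last hunk prevents: getFieldValue(...) returns null for a missing Solr field, so the chained .toString() call would throw a NullPointerException while assembling the seed URL. The helper below is hypothetical and not part of YaCy; only the field names url_protocol_s and host_s and the null check itself come from the patch.

    import java.util.Optional;

    import org.apache.solr.common.SolrDocument;

    public class SeedUrlSketch {

        // Build the "protocol://host" seed string only when both Solr fields
        // are present; an empty Optional stands in for the patch's continue.
        static Optional<String> seedUrl(final SolrDocument doc) {
            final Object protocol = doc.getFieldValue("url_protocol_s");
            final Object host = doc.getFieldValue("host_s");
            if (protocol == null || host == null) {
                return Optional.empty(); // missing data: skip this document
            }
            return Optional.of(protocol.toString() + "://" + host.toString());
        }
    }

A caller would skip the document on an empty result, exactly as the added continue does inside the crawl loop.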