From 496f768c4431544af1b1ebb1555716c47a05aa0e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 3 Nov 2023 18:20:10 +0100 Subject: [PATCH 01/11] modified cache strategy for zim clusters --- source/org/openzim/ZIMReader.java | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index bc39fd36b..27d544e27 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -337,10 +337,7 @@ public class ZIMReader { public Cluster getCluster(int clusterNumber) throws IOException { for (int i = 0; i < this.clusterCache.size(); i++) { Cluster c = clusterCache.get(i); - if (c.cluster_number == clusterNumber) { - c.incUsage(); // cache hit - return c; - } + if (c.cluster_number == clusterNumber) return c; } // cache miss @@ -348,17 +345,10 @@ public class ZIMReader { // check cache size if (clusterCache.size() >= MAX_CLUSTER_CACHE_SIZE) { - // remove one entry - double maxEntry = Double.MIN_VALUE; - int pos = -1; - for (int i = 0; i < clusterCache.size(); i++) { - double r = this.clusterCache.get(i).getUsageRatio(); - if (r > maxEntry) {maxEntry = r; pos = i;} - } - if (pos >= 0) this.clusterCache.remove(pos); + // remove one entry: the first entry is the oldest entry + this.clusterCache.remove(0); } - c.incUsage(); this.clusterCache.add(c); return c; } @@ -378,12 +368,10 @@ public class ZIMReader { private int cluster_number; // used to identify the correct cache entry private List blobs; - private int usageCounter; // used for efficient caching and cache stale detection private boolean extended; public Cluster(int cluster_number) throws IOException { this.cluster_number = cluster_number; - this.usageCounter = 0; // open the cluster and make a Input Stream with the proper decompression type final long clusterPos = mFile.geClusterPtr(cluster_number); @@ -444,21 +432,9 @@ public class ZIMReader { return this.blobs.get(i); } - public void incUsage() { - this.usageCounter++; - } - - public int getUsage() { - return this.usageCounter; - } - public int getSize() { return this.blobs.size(); } - - public double getUsageRatio() { - return ((double) this.usageCounter) / ((double) this.blobs.size()); - } } public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException { From 70e29937ef76b2f3c7f5061d71bb4b3ce740a361 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 4 Nov 2023 19:07:50 +0100 Subject: [PATCH 02/11] added a check in zim importer which tests if import URLs actually exist --- .../cora/document/id/MultiProtocolURL.java | 27 +++++++++++++++++++ .../yacy/document/importer/ZimImporter.java | 11 +++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index e9a46fc91..a3404bec0 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URLDecoder; +import java.net.http.HttpResponse; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; @@ -2578,6 +2579,32 @@ public class MultiProtocolURL implements Serializable, Comparable 0; + } + if (isHTTP() || isHTTPS()) { + try (final HTTPClient client = new HTTPClient(agent)) { + 
client.setHost(getHost()); + org.apache.http.HttpResponse response = client.HEADResponse(this, true); + return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301); + } + } + return false; + } catch (IOException e) { + return false; + } + } + /** * Read fully the source, close it and return its content as a bytes array. * @param source the source to read diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index a96a79b18..26f36f787 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -30,6 +30,7 @@ import java.util.Map; import java.util.TreeMap; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; @@ -84,6 +85,12 @@ public class ZimImporter extends Thread implements Importer { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); + // verify the source + DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); + DigestURL url = new DigestURL(mainEntry.url); + if (!url.exists(ClientIdentification.browserAgent)) return; + + // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { if (this.abort) break; DirectoryEntry de = this.reader.getDirectoryInfo(i); @@ -304,7 +311,9 @@ public class ZimImporter extends Thread implements Importer { System.out.println("guessed domain: " + guessDomainName(f.getName())); String source = getSource(r); System.out.println("guessed Source: " + source); - System.out.println("guessed main article: " + guessURL(source, de)); + String mainURL = guessURL(source, de); + System.out.println("guessed main article: " + mainURL); + System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent)); System.out.println(); } catch (IOException e) { e.printStackTrace(); From 7db0534d8a0709a2903f1880e98aaa4657fbf462 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 5 Nov 2023 02:16:40 +0100 Subject: [PATCH 03/11] Added a zim parser to the surrogate import option. You can now import zim files into YaCy by simply moving them to the DATA/SURROGATE/IN folder. They will be fetched and after parsing moved to DATA/SURROGATE/OUT. There are exceptions where the parser is not able to identify the original URL of the documents in the zim file. In that case the file is simply ignored. This commit also carries an important fix to the pdf parser and an increase of the maximum parsing speed to 60000 PPM which should make it possible to index up to 1000 files in one second. 
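For illustration, the verification flow corresponds to this sketch (method and
class names as introduced in this and the previous commit; the zim file path
is the only hypothetical value, and exception handling is omitted):

    ZIMFile z = new ZIMFile("/path/to/some.zim");                   // hypothetical sample path
    ZIMReader r = new ZIMReader(z);
    DirectoryEntry de = r.getMainDirectoryEntry();                  // main page of the zim archive
    String source = ZimImporter.getSource(r);                       // source guessed from zim metadata
    DigestURL mainURL = ZimImporter.guessURL(source, de);           // guessed original web location
    boolean ok = mainURL.exists(ClientIdentification.browserAgent); // HEAD request; 200/301/302 pass

If this check fails for the main entry or for one of the first ten article
URLs, the zim file is skipped. The new speed limit follows from the crawler's
busy-sleep formula 60000 / PPM: at the maximum of 60000 PPM the pause between
documents is 60000 / 60000 = 1 ms, i.e. up to 1000 documents per second.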
--- htroot/ConfigParser_p.html | 21 -- htroot/Crawler_p.html | 4 +- ivy.xml | 1 + .../cora/document/id/MultiProtocolURL.java | 14 +- .../yacy/document/importer/ZimImporter.java | 215 ++++++++++++++++-- .../net/yacy/document/parser/pdfParser.java | 126 +++------- source/net/yacy/htroot/ConfigParser_p.java | 10 - source/net/yacy/htroot/Crawler_p.java | 10 +- .../kelondro/data/meta/URIMetadataNode.java | 12 +- source/net/yacy/search/Switchboard.java | 18 +- .../net/yacy/search/SwitchboardConstants.java | 2 - source/org/openzim/ZIMFile.java | 32 +-- 12 files changed, 279 insertions(+), 186 deletions(-) diff --git a/htroot/ConfigParser_p.html b/htroot/ConfigParser_p.html index a51ee1013..66a4665d3 100644 --- a/htroot/ConfigParser_p.html +++ b/htroot/ConfigParser_p.html @@ -51,27 +51,6 @@ -
PDF Parser Attributes -

- This is an experimental setting which makes it possible to split PDF documents into individual index entries. - Every page will become a single index hit and the url is artifically extended with a post/get attribute value containing - the page number as value. When such an url is displayed within a search result, then the post/get attribute is transformed into an anchor hash link. - This makes it possible to view the individual page directly in the pdf.js viewer built-in into firefox, - for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options -

- - - - - - - - - - - - -
Split PDF
Property Name
#%env/templates/footer.template%# diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index 79a0319c0..3b328a996 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -134,7 +134,7 @@ Speed / PPM
(Pages Per Minute) - + @@ -147,7 +147,7 @@ Crawler PPM     - + diff --git a/ivy.xml b/ivy.xml index 61f9ee127..8c072699d 100644 --- a/ivy.xml +++ b/ivy.xml @@ -28,6 +28,7 @@ + diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index a3404bec0..1cac0dace 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -2593,14 +2593,18 @@ public class MultiProtocolURL implements Serializable, Comparable 0; } if (isHTTP() || isHTTPS()) { - try (final HTTPClient client = new HTTPClient(agent)) { - client.setHost(getHost()); - org.apache.http.HttpResponse response = client.HEADResponse(this, true); - return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301); - } + final HTTPClient client = new HTTPClient(agent); + client.setHost(getHost()); + org.apache.http.HttpResponse response = client.HEADResponse(this, true); + client.close(); + if (response == null) return false; + int status = response.getStatusLine().getStatusCode(); + return status == 200 || status == 301 || status == 302; } return false; } catch (IOException e) { + if (e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts + //e.printStackTrace(); return false; } } diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index 26f36f787..118e27e40 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -25,12 +25,20 @@ package net.yacy.document.importer; import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.Map; +import java.util.Set; import java.util.TreeMap; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; @@ -81,14 +89,18 @@ public class ZimImporter extends Thread implements Importer { public void run() { job = this; this.startTime = System.currentTimeMillis(); + Switchboard sb = Switchboard.getSwitchboard(); try { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); // verify the source DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); - DigestURL url = new DigestURL(mainEntry.url); - if (!url.exists(ClientIdentification.browserAgent)) return; + DigestURL mainURL = guessURL(this.guessedSource, mainEntry); + if (!mainURL.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL); + return; + } // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { @@ -98,8 +110,14 @@ public class ZimImporter extends Thread implements Importer { ArticleEntry ae = (ArticleEntry) de; // check url - String guessedUrl = guessURL(this.guessedSource, de); - assert guessedUrl.startsWith("http"); + DigestURL guessedUrl = guessURL(this.guessedSource, de); + if (recordCnt < 10) { + // critical test for the first 10 urls + if 
(!guessedUrl.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); + return; + } + } // check availability of text parser String mimeType = ae.getMimeType(); @@ -111,7 +129,17 @@ public class ZimImporter extends Thread implements Importer { // create artificial request and response headers for the indexer RequestHeader requestHeader = new RequestHeader(); ResponseHeader responseHeader = new ResponseHeader(200); - final Request request = new Request(new DigestURL(guessedUrl), null); + responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content + final Request request = new Request( + ASCII.getBytes(sb.peers.mySeed().hash), + guessedUrl, + null, // referrerhash the hash of the referrer URL + de.title, // name the name of the document to crawl + null, // appdate the time when the url was first time appeared + sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! + 0, // depth the crawling depth of the entry + sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset + ); final Response response = new Response( request, requestHeader, @@ -122,7 +150,7 @@ public class ZimImporter extends Thread implements Importer { ); // throw this to the indexer - String error = Switchboard.getSwitchboard().toIndexer(response); + String error = sb.toIndexer(response); if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); this.recordCnt++; } @@ -203,7 +231,7 @@ public class ZimImporter extends Thread implements Importer { case "fonts": return "fonts.google.com"; case "gutenberg": - return "gutenberg.org"; + return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03"; case "ifixit": return "ifixit.com"; case "lesfondamentaux": @@ -223,11 +251,23 @@ public class ZimImporter extends Thread implements Importer { case "rapsberry_pi_docs": return "raspberrypi.org"; case "ted": - return "ted.com"; + return "www.ted.com/search?q="; case "vikidia": - return "vikidia.org"; + return parts[1] + ".vikidia.org/wiki"; case "westeros": return "westeros.org"; + case "wikihow": + return parts[1].equals("en") ? 
"wikihow.com" : parts[1] + ".wikihow.com"; + case "wikisource": + return parts[1] + ".wikisource.org/wiki"; + case "wikiversity": + return parts[1] + ".wikiversity.org/wiki"; + case "wikivoyage": + return parts[1] + ".wikivoyage.org/wiki"; + case "wiktionary": + return parts[1] + ".wiktionary.org/wiki"; + case "wikiquote": + return parts[1] + ".wikiquote.org/wiki"; case "wikibooks": return parts[1] + ".wikibooks.org/wiki"; case "wikinews": @@ -273,16 +313,148 @@ public class ZimImporter extends Thread implements Importer { return source; } - public static String guessURL(String guessedSource, DirectoryEntry de) { + public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException { String url = de.url; if (url.equals("Main_Page")) url = ""; - if (guessedSource != null) return guessedSource + url; - if (url.startsWith("A/")) return "https://" + url.substring(2); - if (url.startsWith("H/")) return "https://" + url.substring(2); - return guessedSource + url; + if (guessedSource != null) return new DigestURL(guessedSource + url); + if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2)); + if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2)); + return new DigestURL(guessedSource + url); } + private final static String[] skip_files = { + "iota.stackexchange.com_en_all_2023-05.zim", + "stellar.stackexchange.com_en_all_2023-10.zim", + "vegetarianism.stackexchange.com_en_all_2023-05.zim", + "esperanto.stackexchange.com_eo_all_2023-10.zim", + "tezos.stackexchange.com_en_all_2023-10.zim", + "eosio.stackexchange.com_en_all_2023-10.zim", + "ebooks.stackexchange.com_en_all_2023-10.zim", + "poker.stackexchange.com_en_all_2023-05.zim", + "cseducators.stackexchange.com_en_all_2023-10.zim", + "iot.stackexchange.com_en_all_2023-05.zim", + "portuguese.stackexchange.com_pt_all_2023-04.zim", + "portuguese.stackexchange.com_pt_all_2023-10.zim", + "italian.stackexchange.com_it_all_2023-05.zim", + "monero.stackexchange.com_en_all_2022-11.zim", + "sustainability.stackexchange.com_en_all_2023-05.zim", + "westeros_en_all_nopic_2021-03.zim", + "opensource.stackexchange.com_en_all_2023-10.zim", + "tor.stackexchange.com_en_all_2023-05.zim", + "devops.stackexchange.com_en_all_2023-10.zim", + "patents.stackexchange.com_en_all_2023-10.zim", + "stackapps.com_en_all_2023-05.zim", + "hardwarerecs.stackexchange.com_en_all_2023-05.zim", + "hsm.stackexchange.com_en_all_2023-05.zim", + "expatriates.stackexchange.com_en_all_2023-11.zim", + "opendata.stackexchange.com_en_all_2023-10.zim", + "sports.stackexchange.com_en_all_2023-05.zim", + "wikinews_de_all_nopic_2023-10.zim", + "computergraphics.stackexchange.com_en_all_2023-10.zim", + "tridion.stackexchange.com_en_all_2023-10.zim", + "bioinformatics.stackexchange.com_en_all_2023-10.zim", + "expressionengine.stackexchange.com_en_all_2023-11.zim", + "elementaryos.stackexchange.com_en_all_2023-10.zim", + "cstheory.stackexchange.com_en_all_2023-10.zim", + "chess.stackexchange.com_en_all_2023-05.zim", + "vi.stackexchange.com_en_all_2023-05.zim", + "fitness.stackexchange.com_en_all_2023-10.zim", + "pets.stackexchange.com_en_all_2023-05.zim", + "french.stackexchange.com_fr_all_2023-10.zim", + "sqa.stackexchange.com_en_all_2023-05.zim", + "islam.stackexchange.com_en_all_2023-05.zim", + "scicomp.stackexchange.com_en_all_2023-05.zim", + "wikinews_en_all_nopic_2023-09.zim", + "ai.stackexchange.com_en_all_2023-10.zim", + "boardgames.stackexchange.com_en_all_2023-05.zim", + 
"economics.stackexchange.com_en_all_2023-05.zim", + "3dprinting.stackexchange.com_en_all_2023-07.zim", + "earthscience.stackexchange.com_en_all_2023-05.zim", + "emacs.stackexchange.com_en_all_2023-10.zim", + "bitcoin.stackexchange.com_en_all_2023-05.zim", + "philosophy.stackexchange.com_en_all_2023-05.zim", + "law.stackexchange.com_en_all_2023-05.zim", + "astronomy.stackexchange.com_en_all_2023-05.zim", + "artofproblemsolving_en_all_nopic_2021-03.zim", + "engineering.stackexchange.com_en_all_2023-05.zim", + "ja.stackoverflow.com_ja_all_2023-06.zim", + "webmasters.stackexchange.com_en_all_2023-05.zim", + "anime.stackexchange.com_en_all_2023-10.zim", + "cooking.stackexchange.com_en_all_2023-05.zim", + "arduino.stackexchange.com_en_all_2023-05.zim", + "money.stackexchange.com_en_all_2023-05.zim", + "judaism.stackexchange.com_en_all_2023-05.zim", + "ethereum.stackexchange.com_en_all_2023-05.zim", + "datascience.stackexchange.com_en_all_2023-10.zim", + "academia.stackexchange.com_en_all_2023-10.zim", + "music.stackexchange.com_en_all_2023-05.zim", + "cs.stackexchange.com_en_all_2023-03.zim", + "dsp.stackexchange.com_en_all_2023-05.zim", + "biology.stackexchange.com_en_all_2023-05.zim", + "android.stackexchange.com_en_all_2023-10.zim", + "bicycles.stackexchange.com_en_all_2023-05.zim", + "puzzling.stackexchange.com_en_all_2023-05.zim", + "photo.stackexchange.com_en_all_2023-05.zim", + "aviation.stackexchange.com_en_all_2023-05.zim", + "drupal.stackexchange.com_en_all_2023-05.zim", + "ux.stackexchange.com_en_all_2023-05.zim", + "ell.stackexchange.com_en_all_2023-10.zim", + "openstreetmap-wiki_en_all_nopic_2023-05.zim", + "softwareengineering.stackexchange.com_en_all_2023-05.zim", + "gaming.stackexchange.com_en_all_2023-10.zim", + "mathematica.stackexchange.com_en_all_2023-10.zim", + "pt.stackoverflow.com_pt_all_2023-06.zim", + "apple.stackexchange.com_en_all_2023-05.zim", + "diy.stackexchange.com_en_all_2023-08.zim", + "es.stackoverflow.com_es_all_2023-06.zim", + "gis.stackexchange.com_en_all_2023-05.zim", + "stats.stackexchange.com_en_all_2023-05.zim", + "physics.stackexchange.com_en_all_2023-05.zim", + "serverfault.com_en_all_2023-05.zim", + "electronics.stackexchange.com_en_all_2023-05.zim", + "tex.stackexchange.com_en_all_2023-05.zim", + "wikibooks_de_all_nopic_2021-03.zim", + "askubuntu.com_en_all_2023-05.zim", + "superuser.com_en_all_2023-05.zim", + "lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim", + "wikibooks_en_all_nopic_2021-03.zim", + "courses.lumenlearning.com_en_all_2021-03.zim", + "wikipedia_de_all_nopic_2023-10.zim", + "wikipedia_en_all_nopic_2023-10.zim", + "stackoverflow.com_en_all_nopic_2022-07.zim", + "stackoverflow.com_en_all_2023-05.zim", + "armypubs_en_all_2023-08.zim", + "vikidia_en_all_nopic_2023-09.zim", + "wikiquote_de_all_nopic_2023-10.zim", + "wikiquote_en_all_nopic_2023-09.zim", + "wiktionary_de_all_nopic_2023-10.zim", + "wiktionary_en_all_nopic_2023-10.zim", + "wikihow_de_maxi_2023-10.zim", + "wikivoyage_de_all_nopic_2023-09.zim", + "wikiversity_de_all_nopic_2021-03.zim", + "wikiversity_en_all_nopic_2021-03.zim", + "wikisource_de_all_nopic_2023-09.zim", + "wikisource_en_all_nopic_2023-08.zim", + "ted_countdown_global_2023-09.zim", + "ted_en_design_2023-09.zim", + "ted_en_business_2023-09.zim", + "ted_en_global_issues_2023-09.zim", + + // 302 + "moderators.stackexchange.com_en_all_2023-05.zim", + "beer.stackexchange.com_en_all_2023-05.zim", + "health.stackexchange.com_en_all_2023-05.zim", + "avp.stackexchange.com_en_all_2023-05.zim", + 
"lowtechmagazine.com_en_all_2023-08.zim", + "ifixit_de_all_2023-07.zim", + "ifixit_en_all_2023-10.zim", + "der-postillon.com_de_all_2020-12.zim", + "wikihow_en_maxi_2023-03.zim", + }; + public static void main(String[] args) { + Set skip = new HashSet<>(); + for (String s: skip_files) skip.add(s); // zim file import test // will test mostly if domain names are included in zim file urls String zimFilesPath = args[0]; @@ -298,7 +470,10 @@ public class ZimImporter extends Thread implements Importer { } Collection orderedFiles = orderedFileMap.values(); + Set files_ok = new LinkedHashSet<>(); + Set files_nok = new LinkedHashSet<>(); for (File f: orderedFiles) { + if (skip.contains(f.getName())) continue; try { ZIMFile z = new ZIMFile(f.getAbsolutePath()); ZIMReader r = new ZIMReader(z); @@ -308,16 +483,20 @@ public class ZimImporter extends Thread implements Importer { System.out.println("Namespace: " + de.namespace); System.out.println("Title: " + de.title); System.out.println("URL: " + de.url); - System.out.println("guessed domain: " + guessDomainName(f.getName())); + System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name String source = getSource(r); - System.out.println("guessed Source: " + source); - String mainURL = guessURL(source, de); + System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file + DigestURL mainURL = guessURL(source, de); System.out.println("guessed main article: " + mainURL); - System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent)); + boolean ok = mainURL.exists(ClientIdentification.browserAgent); + System.out.println("main article exists: " + ok); + if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName()); System.out.println(); } catch (IOException e) { e.printStackTrace(); } } + System.out.println("ok files: " + files_ok.toString()); + System.out.println("not-ok files: " + files_nok.toString()); } } diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 0ad6b2248..f02577244 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.text.PDFTextStripper; -import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; @@ -69,9 +68,6 @@ import net.yacy.kelondro.util.MemoryControl; public class pdfParser extends AbstractParser implements Parser { - public static boolean individualPages = false; - public static String individualPagePropertyname = "page"; - public pdfParser() { super("Acrobat Portable Document Parser"); this.SUPPORTED_EXTENSIONS.add("pdf"); @@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser { // get the links final List> pdflinks = extractPdfLinks(pdfDoc); - // get the fulltext (either per document or for each page) - final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/); - - if (individualPages) { - // this is a hack which stores individual pages of the source pdf into individual index documents - // the new documents will get a virtual link with a post argument page=X appended to the 
original url - - // collect text - final int pagecount = pdfDoc.getNumberOfPages(); - final String[] pages = new String[pagecount]; - for (int page = 1; page <= pagecount; page++) { - stripper.setStartPage(page); - stripper.setEndPage(page); - pages[page - 1] = stripper.getText(pdfDoc); - //System.out.println("PAGE " + page + ": " + pages[page - 1]); - } - - // create individual documents for each page - assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); - result = new Document[Math.min(pages.length, pdflinks.size())]; - final String loc = location.toNormalform(true); - for (int page = 0; page < result.length; page++) { - result[page] = new Document( - new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), - pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), - null, - null, - false, - docDate); - } - } else { - // collect the whole text at once - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; - stripper.setEndPage(3); // get first 3 pages (always) - writer.append(stripper.getText(pdfDoc)); - contentBytes = writer.getBytes(); // remember text in case of interrupting thread - - if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read - stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) - stripper.setEndPage(Integer.MAX_VALUE); // set to default - // we start the pdf parsing in a separate thread to ensure that it can be terminated - final PDDocument pdfDocC = pdfDoc; - final Thread t = new Thread("pdfParser.getText:" + location) { - @Override - public void run() { - try { - writer.append(stripper.getText(pdfDocC)); - } catch (final Throwable e) {} - } - }; - t.start(); - t.join(3000); // pdfbox likes to forget to terminate ... 
(quite often) - if (t.isAlive()) t.interrupt(); - contentBytes = writer.getBytes(); // get final text before closing writer - writer.close(); // free writer resources - } - - final Collection pdflinksCombined = new HashSet<>(); - for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); - result = new Document[]{new Document( - location, - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - contentBytes, - pdflinksCombined, - null, - null, - false, - docDate)}; - } + // collect the whole text at once + final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); + byte[] contentBytes = new byte[0]; + final PDFTextStripper stripper = new PDFTextStripper(); + stripper.setEndPage(Integer.MAX_VALUE); + writer.append(stripper.getText(pdfDoc)); + contentBytes = writer.getBytes(); // remember text in case of interrupting thread + writer.close(); // free writer resources + + final Collection pdflinksCombined = new HashSet<>(); + for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); + result = new Document[]{new Document( + location, + mimeType, + StandardCharsets.UTF_8.name(), + this, + null, + docKeywords, + singleList(docTitle), + docAuthor, + docPublisher, + null, + null, + 0.0d, 0.0d, + contentBytes, + pdflinksCombined, + null, + null, + false, + docDate)}; } catch (final Throwable e) { //throw new Parser.Failure(e.getMessage(), location); } finally { diff --git a/source/net/yacy/htroot/ConfigParser_p.java b/source/net/yacy/htroot/ConfigParser_p.java index e466d783b..943279382 100644 --- a/source/net/yacy/htroot/ConfigParser_p.java +++ b/source/net/yacy/htroot/ConfigParser_p.java @@ -61,13 +61,6 @@ public class ConfigParser_p { env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime()); env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension()); } - - if (post.containsKey("pdfSettings")) { - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages")); - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page")); - pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); - pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); - } } int i = 0; @@ -94,9 +87,6 @@ public class ConfigParser_p { prop.put("parser", i); - prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false)); - prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page")); - // return rewrite properties return prop; } diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java index e95562713..8c898f558 100644 --- a/source/net/yacy/htroot/Crawler_p.java +++ b/source/net/yacy/htroot/Crawler_p.java @@ -774,7 +774,7 @@ public class Crawler_p { } /* - * PPM + * PPM LF MH @@ -784,19 +784,19 @@ public class Crawler_p { if (post != null && post.containsKey("crawlingPerformance")) { final String crawlingPerformance = post.get("crawlingPerformance", "custom"); final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); - int wantedPPM = (LCbusySleep1 == 0) ? 
30000 : (int) (60000L / LCbusySleep1); + int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1); try { wantedPPM = post.getInt("customPPM", wantedPPM); } catch (final NumberFormatException e) {} if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10; - if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000; + if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000; int wPPM = wantedPPM; if ( wPPM <= 0 ) { wPPM = 1; } - if ( wPPM >= 30000 ) { - wPPM = 30000; + if ( wPPM >= 60000 ) { + wPPM = 60000; } final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 33b797524..2d93ec8b7 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparablepublic static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds

diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index 906bf30a9..a241507ab 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -113,20 +113,24 @@ public class ZIMFile extends File { } this.mimeTypeList = mList.toArray(new String[mList.size()]); - // Initialize the Url Pointer List - this.urlPtrListBlob = new byte[this.header_entryCount * 8]; - mReader.seek(this.header_urlPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); - - // Initialize the Title Pointer List - this.titlePtrListBlob = new byte[this.header_entryCount * 4]; - mReader.seek(this.header_titlePtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); - - // Initialize the Cluster Pointer List - this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; - mReader.seek(this.header_clusterPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + try { + // Initialize the Url Pointer List + this.urlPtrListBlob = new byte[this.header_entryCount * 8]; + mReader.seek(this.header_urlPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); + + // Initialize the Title Pointer List + this.titlePtrListBlob = new byte[this.header_entryCount * 4]; + mReader.seek(this.header_titlePtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); + + // Initialize the Cluster Pointer List + this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; + mReader.seek(this.header_clusterPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + } catch (IndexOutOfBoundsException e) { + throw new IOException(e.getMessage()); + } } public final String getMimeType(int idx) { From 34a9fc1a076e89a67b351bc19bd1c2a67e730c60 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 5 Nov 2023 12:46:37 +0100 Subject: [PATCH 04/11] bugfixes to zim reader: --- .../cora/document/id/MultiProtocolURL.java | 1 - .../yacy/document/importer/ZimImporter.java | 40 +++++++++++++++++-- source/org/openzim/ZIMFile.java | 1 + source/org/openzim/ZIMReader.java | 16 +++++++- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 1cac0dace..768ca0aa6 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -37,7 +37,6 @@ import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URLDecoder; -import java.net.http.HttpResponse; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index 118e27e40..bc7266e0a 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -26,11 +26,13 @@ package net.yacy.document.importer; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.net.URL; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.LinkedHashSet; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -93,6 +95,8 @@ public class ZimImporter extends 
Thread implements Importer {
         try {
             this.reader = new ZIMReader(this.file);
             this.guessedSource = getSource(this.reader);
+            Date guessedDate = getDate(this.reader);
+            String dates = HeaderFramework.newRfc1123Format().format(guessedDate);
 
             // verify the source
             DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
@@ -108,6 +112,7 @@
             DirectoryEntry de = this.reader.getDirectoryInfo(i);
             if (!(de instanceof ZIMReader.ArticleEntry)) continue;
             ArticleEntry ae = (ArticleEntry) de;
+            if (ae.namespace != 'C' && ae.namespace != 'A') continue;
 
             // check url
             DigestURL guessedUrl = guessURL(this.guessedSource, de);
@@ -121,6 +126,7 @@
 
             // check availability of text parser
             String mimeType = ae.getMimeType();
+            if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible
             if (TextParser.supportsMime(mimeType) != null) continue;
 
             // read the content
@@ -130,6 +136,7 @@
             RequestHeader requestHeader = new RequestHeader();
             ResponseHeader responseHeader = new ResponseHeader(200);
             responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell the parser which kind of content
+            responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessed date to have something that is not the current date
             final Request request = new Request(
                     ASCII.getBytes(sb.peers.mySeed().hash),
                     guessedUrl,
@@ -230,8 +237,6 @@
                 return "fas.org";
             case "fonts":
                 return "fonts.google.com";
-            case "gutenberg":
-                return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03";
             case "ifixit":
                 return "ifixit.com";
             case "lesfondamentaux":
@@ -313,12 +318,22 @@
         return source;
     }
 
+    public static Date getDate(ZIMReader r) throws IOException {
+        String date = r.getMetadata("Date");
+        if (date != null) try {
+            SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
+            return format.parse(date);
+        } catch (ParseException e) {}
+        // failover situation: use the file date
+        return new Date(r.getZIMFile().lastModified());
+    }
+
     public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
         String url = de.url;
         if (url.equals("Main_Page")) url = "";
-        if (guessedSource != null) return new DigestURL(guessedSource + url);
         if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
         if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
+        if (guessedSource != null) return new DigestURL(guessedSource + url);
         return new DigestURL(guessedSource + url);
     }
 
@@ -439,6 +454,22 @@
             "ted_en_design_2023-09.zim",
             "ted_en_business_2023-09.zim",
             "ted_en_global_issues_2023-09.zim",
+            "opentextbooks_en_all_2023-08.zim",
+            "bestedlessons.org_en_all_2023-08.zim",
+            "wikivoyage_en_all_nopic_2023-10.zim",
+            "based.cooking_en_all_2023-10.zim",
+            "wordnet_en_all_2023-04.zim",
+            "internet-encyclopedia-philosophy_en_all_2023-08.zim",
+            "100r-off-the-grid_en_2023-09.zim",
+            "coopmaths_2023-04.zim",
+            "birds-of-ladakh_en_all_2023-02.zim",
+            "storyweaver.org_en_2023-09.zim",
+            "developer.mozilla.org_en_all_2023-02.zim",
+            "www.ready.gov_es_2023-06.zim",
+            "teoria.com_en_2023-08.zim",
+            "theworldfactbook_en_all_2023-06.zim",
"theworldfactbook_en_all_2023-06.zim", + "mutopiaproject.org_en_2023-08.zim", + "dp.la_en_all_2023-08.zim", // 302 "moderators.stackexchange.com_en_all_2023-05.zim", @@ -483,6 +514,7 @@ public class ZimImporter extends Thread implements Importer { System.out.println("Namespace: " + de.namespace); System.out.println("Title: " + de.title); System.out.println("URL: " + de.url); + System.out.println("Mime Type " + de.getMimeType()); System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name String source = getSource(r); System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index a241507ab..cbde3a0a8 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -134,6 +134,7 @@ public class ZIMFile extends File { } public final String getMimeType(int idx) { + if (idx >= this.mimeTypeList.length) return ""; return this.mimeTypeList[idx]; } diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index 27d544e27..14bc47dfd 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -237,11 +237,25 @@ public class ZIMReader { public DirectoryEntry getMainDirectoryEntry() throws IOException { DirectoryEntry de = getDirectoryInfo(this.mFile.header_mainPage); - if (de.namespace == 'W' && de.url.equals("mainPage") && de instanceof RedirectEntry) { + if (de instanceof RedirectEntry) { // resolve redirect to get the actual main page int redirect = ((RedirectEntry) de).redirect_index; de = getDirectoryInfo(redirect); } + // For the main entry we demand a "text/html" mime type. + // Many zim files do not provide this as the main file, which is strange (maybe lazy/irresponsibe) + // Because the main entry is important for a validation, we seek for one entry which may + // be proper for indexing. 
+        int entryNumber = 0;
+        while (!de.getMimeType().equals("text/html") && entryNumber < this.mFile.header_entryCount) {
+            de = getDirectoryInfo(entryNumber);
+            entryNumber++;
+            if (de.namespace != 'C' && de.namespace != 'A') continue;
+            if (!(de instanceof ArticleEntry)) continue;
+            if (!de.getMimeType().equals("text/html")) continue;
+            if (de.url.contains("404") || de.title.contains("404") || de.title.contains("301")) continue; // is a pain
+            return de;
+        }
         return de;
     }
 
From 24011dcbcc26f79f80a81a8dae59c6f65c1cee25 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 6 Nov 2023 22:44:18 +0100
Subject: [PATCH 05/11] more file name extensions for json list surrogate files

---
 source/net/yacy/search/Switchboard.java | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index bebd16cbd..bfadb1892 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2166,7 +2166,10 @@ public final class Switchboard extends serverSwitch {
                 this.log.warn("IO Error processing zim file " + infile);
             }
             return moved;
-        } else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) {
+        } else if (
+                s.endsWith(".jsonl") || s.endsWith(".jsonl.gz") ||
+                s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") ||
+                s.endsWith(".flatjson") || s.endsWith(".flatjson.gz")) {
             return this.processSurrogateJson(infile, outfile);
         }
         InputStream is = null;

From 655d8db80218312593f37b6026fd6dc0db01d23f Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 12 Nov 2023 15:26:18 +0100
Subject: [PATCH 06/11] detailed directions in index export to explain how the
 export can be imported again using elasticsearch/opensearch

---
 htroot/IndexExport_p.html | 51 +++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html
index 1aa992716..eb3e1f188 100644
--- a/htroot/IndexExport_p.html
+++ b/htroot/IndexExport_p.html
@@ -9,11 +9,10 @@
     #%env/templates/header.template%#
     #%env/templates/submenuIndexImport.template%#
 
-
-
+

Index Export

The local index currently contains #[ucount]# documents; only #[ucount200]# of these are exportable with status code 200 - the remaining are error documents.

- + #(lurlexport)#::
Loaded URL Export @@ -34,19 +33,45 @@
Full Data Records:
-
XML (Rich and full-text Solr data, one document per line in one large xml file, can be processed with shell tools, can be imported with DATA/SURROGATE/in/)
- JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file, can be bulk-imported to elasticsearch with the command "curl -XPOST localhost:9200/collection1/yacy/_bulk --data-binary @yacy_dump_XXX.flatjson")
- XML (RSS)
+
+ JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file, + can be bulk-imported to elasticsearch. Here is an example for opensearch, using docker:
+Start a docker container of opensearch:
+docker run --name opensearch -p 9200:9200 -d -e OPENSEARCH_JAVA_OPTS="-Xms2G -Xmx2G" -e discovery.type=single-node -e DISABLE_SECURITY_PLUGIN=true -v $(pwd)/opensearch_data:/usr/share/opensearch/data opensearchproject/opensearch:latest
+Unblock index creation:
+curl -X PUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d' +{ + "persistent": { + "cluster.blocks.create_index": null + } +}'
+Create the search index:
+curl -X PUT "http://localhost:9200/collection1/yacy"
+Bulk-upload the index file:
+curl -XPOST "http://localhost:9200/collection1/yacy/_bulk?filter_path=took,errors" -H "Content-Type: application/x-ndjson" --data-binary @yacy_dump_XXX.flatjson
+Run a search that returns 10 results, querying the fields text_t, title, and description with boosts:
+curl -X POST "http://localhost:9200/collection1/yacy/_search" -H 'Content-Type: application/json' -d' +{"size": 10, "query": {"multi_match": { + "query": "one two three", + "fields": ["text_t", "title^10", "description^3"], "fuzziness": "AUTO" +}}}'
+ + XML (Rich and full-text Solr data, one document per line in one large xml file, + can be processed with shell tools, can be imported with DATA/SURROGATE/in/) +
+ + XML (RSS) +
Full URL List:
Plain Text List (URLs only)
HTML (URLs with title)
Only Domain:
Plain Text List (domains only)
HTML (domains as URLs, no title)
-
Only Text:
+
Only Text:
Fulltext of Search Index Text
-
-
+ +
 
@@ -55,16 +80,16 @@ ::
Export to file #[exportfile]# is running .. #[urlcount]# Documents so far
:: #(/lurlexport)# - - #(lurlexportfinished)#:: + + #(lurlexportfinished)#::
Finished export of #[urlcount]# Documents to file #[exportfile]#
Import this file by moving it to DATA/SURROGATES/in
:: #(/lurlexportfinished)# - + #(lurlexporterror)#::
Export to file #[exportfile]# failed: #[exportfailmsg]#
:: #(/lurlexporterror)# - + #(dumprestore)#::
Dump and Restore of Solr Index

From c20c4b8a21364bf06d1f91c63650fcf3b434ba04 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 12 Nov 2023 22:11:55 +0100
Subject: [PATCH 07/11] modified export: added maximum number of docs per chunk

The export can now produce several files, called chunks. By default only one
chunk is exported. This function is required when the exported files shall be
imported into an elasticsearch/opensearch index, because the bulk import
function of elasticsearch/opensearch is limited to 100MB. To make such imports
possible, YaCy dumps must be split into chunks. Right now we cannot estimate
the chunk size in bytes, only as a number of documents, so the user must
experiment to find the optimum maximum chunk size; 50000 documents per chunk
is a reasonable first attempt.
---
 htroot/IndexExport_p.html                  |  11 +-
 source/net/yacy/htroot/IndexExport_p.java  |  64 +++----
 source/net/yacy/search/index/Fulltext.java | 197 ++++++++++++++-------
 3 files changed, 171 insertions(+), 101 deletions(-)

diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html
index eb3e1f188..87ee4b62d 100644
--- a/htroot/IndexExport_p.html
+++ b/htroot/IndexExport_p.html
@@ -21,13 +21,16 @@
URL Filter
-
+
 .*.* (default) is a catch-all; format: java regex
query
-
+
 *:* (default) is a catch-all; format: field:value
-
maximum age (seconds, -1 = unlimited)
-
+
maximum age (seconds)
+
 -1 = unlimited -> no document is too old +
+
maximum number of records per chunk
+
 if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)
Export Format
diff --git a/source/net/yacy/htroot/IndexExport_p.java b/source/net/yacy/htroot/IndexExport_p.java index 78cc94132..aa5fc6f09 100644 --- a/source/net/yacy/htroot/IndexExport_p.java +++ b/source/net/yacy/htroot/IndexExport_p.java @@ -64,8 +64,8 @@ public class IndexExport_p { prop.put("lurlexport", 0); prop.put("reload", 0); prop.put("dumprestore", 1); - prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, - SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); + prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, + SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); List dumpFiles = segment.fulltext().dumpFiles(); prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); prop.put("dumprestore_optimizemax", 10); @@ -80,7 +80,7 @@ public class IndexExport_p { prop.put("lurlexportfinished", 0); prop.put("lurlexporterror", 0); prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + prop.put("lurlexport_urlcount", export.docCount()); prop.put("reload", 1); } else { prop.put("lurlexport", 1); @@ -93,7 +93,7 @@ public class IndexExport_p { // an export was running but has finished prop.put("lurlexportfinished", 1); prop.put("lurlexportfinished_exportfile", export.file().toString()); - prop.put("lurlexportfinished_urlcount", export.count()); + prop.put("lurlexportfinished_urlcount", export.docCount()); if (export.failed() == null) { prop.put("lurlexporterror", 0); } else { @@ -123,6 +123,8 @@ public class IndexExport_p { final String filter = post.get("exportfilter", ".*"); final String query = post.get("exportquery", "*:*"); final int maxseconds = post.getInt("exportmaxseconds", -1); + long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE); + if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE; final String path = post.get("exportfilepath", ""); // store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail @@ -130,7 +132,7 @@ public class IndexExport_p { // start the export try { - export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text); + export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize); } catch (final IOException e) { prop.put("lurlexporterror", 1); prop.put("lurlexporterror_exportfile", "-no export-"); @@ -140,7 +142,7 @@ public class IndexExport_p { // show result prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + prop.put("lurlexport_urlcount", export.docCount()); if ((export != null) && (export.failed() == null)) { prop.put("lurlexport", 2); } @@ -148,34 +150,34 @@ public class IndexExport_p { } if (post.containsKey("indexdump")) { - try { - final File dump = segment.fulltext().dumpEmbeddedSolr(); - prop.put("indexdump", 1); - prop.put("indexdump_dumpfile", dump.getAbsolutePath()); - dumpFiles = segment.fulltext().dumpFiles(); - prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? 
"" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); - // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexdump", 2); - } else { - prop.put("indexdump", 3); - } - } + try { + final File dump = segment.fulltext().dumpEmbeddedSolr(); + prop.put("indexdump", 1); + prop.put("indexdump_dumpfile", dump.getAbsolutePath()); + dumpFiles = segment.fulltext().dumpFiles(); + prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); + // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexdump", 2); + } else { + prop.put("indexdump", 3); + } + } } if (post.containsKey("indexrestore")) { - try { - final File dump = new File(post.get("dumpfile", "")); - segment.fulltext().restoreEmbeddedSolr(dump); - prop.put("indexRestore", 1); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexRestore", 2); - } else { - prop.put("indexRestore", 3); - } - } + try { + final File dump = new File(post.get("dumpfile", "")); + segment.fulltext().restoreEmbeddedSolr(dump); + prop.put("indexRestore", 1); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexRestore", 2); + } else { + prop.put("indexRestore", 3); + } + } } // insert constants diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 718be0099..d8a1754a7 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -695,7 +695,10 @@ public final class Fulltext { } public final static String yacy_dump_prefix = "yacy_dump_"; - public Export export(Fulltext.ExportFormat format, String filter, String query, final int maxseconds, File path, boolean dom, boolean text) throws IOException { + public Export export( + Fulltext.ExportFormat format, String filter, String query, + final int maxseconds, File path, boolean dom, boolean text, + long maxChunkSize) throws IOException { // modify query according to maxseconds final long now = System.currentTimeMillis(); @@ -760,27 +763,26 @@ public final class Fulltext { } } - String s = new File(path, yacy_dump_prefix + + String filename = yacy_dump_prefix + "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" + "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" + "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" + - "c" + String.format("%1$012d", doccount)).getAbsolutePath() + "_tc"; // the name ends with the transaction token ('c' = 'created') + "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created') - // create export file name - if (s.indexOf('.',0) < 0) s += "." 
+ format.getExt(); - final File f = new File(s); - f.getParentFile().mkdirs(); - - return export(f, filter, query, format, dom, text); + return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize); } // export methods - public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + public Export export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + long maxChunkSize) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(f, filter, query, format, dom, text); + this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize); this.exportthread.start(); return this.exportthread; } @@ -795,69 +797,95 @@ public final class Fulltext { } public class Export extends Thread { - private final File f; + private final File path; + private final String filename, fileext; private final Pattern pattern; - private int count; private String failure; private final String query; private final ExportFormat format; private final boolean dom, text; - - private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + private int docCount, chunkSize, chunkCount; + private final long maxChunkSize; + + private Export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + long maxChunkSize) { super("Fulltext.Export"); // format: 0=text, 1=html, 2=rss/xml - this.f = f; + this.path = path; + this.filename = filename; + this.fileext = fileext; this.pattern = filter == null ? null : Pattern.compile(filter); this.query = query == null? 
AbstractSolrConnector.CATCHALL_QUERY : query; - this.count = 0; this.failure = null; this.format = format; this.dom = dom; this.text = text; + this.docCount = 0; // number of all documents exported so far + this.chunkSize = 0; // number of documents in the current chunk + this.chunkCount = 0; // number of chunks opened so far + this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk //if ((dom) && (format == 2)) dom = false; } + private void printHead(PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(""); + pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); + pw.println(""); + pw.println("http://yacy.net"); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(" "); + pw.println(" "); + pw.println(" " + this.query + ""); + pw.println(" "); + pw.println(""); + pw.println(""); + } + } + + private void printTail(PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + } + } + @Override public void run() { try { - final File parentf = this.f.getParentFile(); - if (parentf != null) { - parentf.mkdirs(); - } + if (this.path != null) this.path.mkdirs(); } catch(final Exception e) { ConcurrentLog.logException(e); this.failure = e.getMessage(); return; } - try (/* Resources automatically closed by this try-with-resources statement */ - final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f); - final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os; - final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream)); - ) { - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(""); - pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); - pw.println(""); - pw.println("http://yacy.net"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(" "); - pw.println(" "); - pw.println(" " + this.query + ""); - pw.println(" "); - pw.println(""); - pw.println(""); - } + try { + docCount = 0; + chunkSize = 0; + chunkCount = 0; + PrintWriter pw = getWriter(); + printHead(pw); if (this.dom) { final Map> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); final ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); @@ -865,7 +893,7 @@ public final class Fulltext { if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; if (this.format == ExportFormat.text) pw.println(host); if (this.format == ExportFormat.html) pw.println("" + host + "
"); - this.count++; + this.docCount++; this.chunkSize++; } } else { if (this.format == ExportFormat.solr || this.format == ExportFormat.elasticsearch || (this.text && this.format == ExportFormat.text)) { @@ -882,7 +910,14 @@ public final class Fulltext { if (this.format == ExportFormat.elasticsearch) pw.println("{\"index\":{}}"); final String d = sw.toString(); pw.println(d); - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + pw = getWriter(); // increases chunkCount as side-effect + printHead(pw); + this.chunkSize = 0; + } } } else { final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, @@ -918,21 +953,19 @@ public final class Fulltext { pw.println("" + hash + ""); pw.println(""); } - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + pw = getWriter(); // increases chunkCount as side-effect + printHead(pw); + this.chunkSize = 0; + } } } } - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println("
"); - pw.println("
"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - } + printTail(pw); + pw.close(); } catch (final Exception e) { /* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */ ConcurrentLog.logException(e); @@ -942,15 +975,47 @@ public final class Fulltext { } public File file() { - return this.f; + final File f = new File(this.path, this.filename + "_" + chunkcount(this.chunkCount) + "." + this.fileext); + return f; + } + + private PrintWriter getWriter() throws IOException { + File f = file(); + final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f); + final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os)); + this.chunkCount++; + return pw; + } + + private String chunkcount(int count) { + if (count < 10) return "000" + count; + if (count < 100) return "00" + count; + if (count < 1000) return "0" + count; + return "" + count; + } + + public File path() { + return this.path; + } + + public String filename() { + return this.filename; + } + + public String fileext() { + return this.fileext; } public String failed() { return this.failure; } - public int count() { - return this.count; + public int docCount() { + return this.docCount; + } + + public int chunkCount() { + return this.chunkCount; } @SuppressWarnings("unchecked") From 3268a93019aa00b49fdfb5925d7caac3c1a94274 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 10:27:50 +0100 Subject: [PATCH 08/11] added a 'minified' option to YaCy dumps --- htroot/IndexExport_p.html | 5 ++ source/net/yacy/htroot/IndexExport_p.java | 3 +- source/net/yacy/search/index/Fulltext.java | 87 ++++++++++++++-------- 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html index 87ee4b62d..df58837c2 100644 --- a/htroot/IndexExport_p.html +++ b/htroot/IndexExport_p.html @@ -32,6 +32,11 @@
         <dt>maximum number of records per chunk</dt>
         <dd>if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)</dd>
+        <dt>Export Size</dt>
+        <dd>
+          <input type="radio" name="minified" value="no" checked="checked" />full size, all fields:&nbsp;&nbsp;
+          <input type="radio" name="minified" value="yes" />minified; only fields sku, date, title, description, text_t
+        </dd>
         <dt>Export Format</dt>
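
Note on the mechanics behind these two options: patches 07 and 08 turn the exporter into a chunked dump writer. Every chunk is a complete document of its own (printHead, records, printTail), so each numbered file can be parsed independently, and the minified flag strips every Solr field except sku, last_modified, title, description_txt and text_t before a record is serialized. The sketch below shows only the rotation pattern; ChunkedDumpWriter, writeRecord and the "<result>" placeholder strings are hypothetical stand-ins for illustration, not YaCy code.

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

// Hypothetical sketch of the chunk rotation used by Fulltext.Export:
// each chunk is a self-contained file with its own head and tail, and
// the counter is bumped on rotation (the PATCH 08 fix), not inside the
// method that opens a file.
public class ChunkedDumpWriter {

    private final File path;         // target directory of the dump
    private final String filename;   // common prefix, e.g. a yacy_dump_* name
    private final String fileext;    // e.g. "xml"
    private final long maxChunkSize; // records per chunk; Long.MAX_VALUE = unlimited
    private int chunkCount = 0;      // number of chunks opened so far
    private int chunkSize = 0;       // records in the current chunk
    private PrintWriter pw;

    public ChunkedDumpWriter(final File path, final String filename,
            final String fileext, final long maxChunkSize) throws IOException {
        this.path = path;
        this.filename = filename;
        this.fileext = fileext;
        this.maxChunkSize = maxChunkSize;
        this.pw = openChunk();
    }

    // zero-padded counter, equivalent to the patch's chunkcount() helper
    private File chunkFile() {
        return new File(this.path, this.filename + "_" + String.format("%04d", this.chunkCount) + "." + this.fileext);
    }

    private PrintWriter openChunk() throws IOException {
        final PrintWriter w = new PrintWriter(new FileWriter(chunkFile()));
        w.println("<result>"); // stands in for printHead(pw)
        return w;
    }

    private void closeChunk() {
        this.pw.println("</result>"); // stands in for printTail(pw)
        this.pw.close();
    }

    public void writeRecord(final String record) throws IOException {
        this.pw.println(record);
        this.chunkSize++;
        if (this.chunkSize >= this.maxChunkSize) { // same condition as the patch
            closeChunk();
            this.chunkCount++; // rotate to the next numbered file
            this.chunkSize = 0;
            this.pw = openChunk();
        }
    }

    public void close() {
        closeChunk();
    }
}

The zero-padded counter mirrors the patch's chunkcount() helper, which could equally be written as String.format("%04d", count).
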
diff --git a/source/net/yacy/htroot/IndexExport_p.java b/source/net/yacy/htroot/IndexExport_p.java index aa5fc6f09..667ba5711 100644 --- a/source/net/yacy/htroot/IndexExport_p.java +++ b/source/net/yacy/htroot/IndexExport_p.java @@ -126,13 +126,14 @@ public class IndexExport_p { long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE); if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE; final String path = post.get("exportfilepath", ""); + final boolean minified = post.get("minified", "no").equals("yes"); // store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds); // start the export try { - export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize); + export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize, minified); } catch (final IOException e) { prop.put("lurlexporterror", 1); prop.put("lurlexporterror_exportfile", "-no export-"); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index d8a1754a7..cd9680b27 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -34,8 +34,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -118,7 +120,7 @@ public final class Fulltext { this.writeWebgraph = false; } - public void setUseWebgraph(boolean check) { + public void setUseWebgraph(final boolean check) { this.writeWebgraph = check; } @@ -142,8 +144,8 @@ public final class Fulltext { final File solrLocation = new File(this.segmentPath, SOLR_PATH); // migrate old solr to new - for (String oldVersion: SOLR_OLD_PATH) { - File oldLocation = new File(this.segmentPath, oldVersion); + for (final String oldVersion: SOLR_OLD_PATH) { + final File oldLocation = new File(this.segmentPath, oldVersion); if (oldLocation.exists()) { if (!oldLocation.renameTo(solrLocation)) { ConcurrentLog.severe("Fulltext", "Failed renaming old Solr location (" @@ -183,11 +185,11 @@ public final class Fulltext { return this.solrInstances.getDefaultEmbeddedConnector(); } - public EmbeddedSolrConnector getEmbeddedConnector(String corename) { + public EmbeddedSolrConnector getEmbeddedConnector(final String corename) { return this.solrInstances.getEmbeddedConnector(corename); } - public SolrConnector getConnectorForRead(String corename) { + public SolrConnector getConnectorForRead(final String corename) { if (this.solrInstances.isConnectedRemote()) return this.solrInstances.getRemoteConnector(corename); if (this.solrInstances.isConnectedEmbedded()) return this.solrInstances.getEmbeddedConnector(corename); return null; @@ -315,7 +317,7 @@ public final class Fulltext { } private long lastCommit = 0; - public void commit(boolean softCommit) { + public void commit(final boolean softCommit) { final long t = System.currentTimeMillis(); if (this.lastCommit + 10000 > t) return; this.lastCommit = t; @@ -423,7 +425,7 @@ public final class Fulltext { * @param freshdate either NULL or a date in the past which is the limit for 
deletion. Only documents older than this date are deleted * @throws IOException */ - public void deleteStaleDomainHashes(final Set hosthashes, Date freshdate) { + public void deleteStaleDomainHashes(final Set hosthashes, final Date freshdate) { // delete in solr final Date now = new Date(); deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, @@ -434,7 +436,7 @@ public final class Fulltext { (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); } - public void deleteStaleDomainNames(final Set hostnames, Date freshdate) { + public void deleteStaleDomainNames(final Set hostnames, final Date freshdate) { final Date now = new Date(); deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames, @@ -453,7 +455,7 @@ public final class Fulltext { deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); } - private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set hosthashes, String constraintQuery) { + private static void deleteDomainWithConstraint(final SolrConnector connector, final String fieldname, final Set hosthashes, final String constraintQuery) { if (hosthashes == null || hosthashes.size() == 0) return; final int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception int c = 0; @@ -492,7 +494,7 @@ public final class Fulltext { * @param basepath the left path of the url; at least until the end of the host * @param freshdate either NULL or a date in the past which is the limit for deletion. 
Only documents older than this date are deleted */ - public int remove(final String basepath, Date freshdate) { + public int remove(final String basepath, final Date freshdate) { DigestURL uri; try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;} final String host = uri.getHost(); @@ -690,15 +692,15 @@ public final class Fulltext { public static enum ExportFormat { text("txt"), html("html"), rss("rss"), solr("xml"), elasticsearch("flatjson"); private final String ext; - private ExportFormat(String ext) {this.ext = ext;} + private ExportFormat(final String ext) {this.ext = ext;} public String getExt() {return this.ext;} } public final static String yacy_dump_prefix = "yacy_dump_"; public Export export( - Fulltext.ExportFormat format, String filter, String query, - final int maxseconds, File path, boolean dom, boolean text, - long maxChunkSize) throws IOException { + final Fulltext.ExportFormat format, final String filter, String query, + final int maxseconds, final File path, final boolean dom, final boolean text, + final long maxChunkSize, final boolean minified) throws IOException { // modify query according to maxseconds final long now = System.currentTimeMillis(); @@ -763,13 +765,13 @@ public final class Fulltext { } } - String filename = yacy_dump_prefix + + final String filename = yacy_dump_prefix + "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" + "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" + "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" + "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created') - return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize); + return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize, minified); } // export methods @@ -777,17 +779,17 @@ public final class Fulltext { final File path, final String filename, final String fileext, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text, - long maxChunkSize) { + final long maxChunkSize, final boolean minified) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize); + this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize, minified); this.exportthread.start(); return this.exportthread; } - public static void main(String args[]) { + public static void main(final String args[]) { final Date firstdate = null; System.out.println(GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate)); } @@ -796,6 +798,18 @@ public final class Fulltext { return this.exportthread; } + private final static Set minified_keys = new HashSet<>(); + static { + //minified_keys.add(CollectionSchema.id.getSolrFieldName()); + minified_keys.add(CollectionSchema.sku.getSolrFieldName()); + minified_keys.add(CollectionSchema.title.getSolrFieldName()); + //minified_keys.add(CollectionSchema.author.getSolrFieldName()); + minified_keys.add(CollectionSchema.description_txt.getSolrFieldName()); + //minified_keys.add(CollectionSchema.size_i.getSolrFieldName()); + minified_keys.add(CollectionSchema.last_modified.getSolrFieldName()); + 
minified_keys.add(CollectionSchema.text_t.getSolrFieldName()); + } + public class Export extends Thread { private final File path; private final String filename, fileext; @@ -806,12 +820,13 @@ public final class Fulltext { private final boolean dom, text; private int docCount, chunkSize, chunkCount; private final long maxChunkSize; + private final boolean minified; private Export( final File path, final String filename, final String fileext, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text, - long maxChunkSize) { + final long maxChunkSize, final boolean minified) { super("Fulltext.Export"); // format: 0=text, 1=html, 2=rss/xml this.path = path; @@ -827,10 +842,11 @@ public final class Fulltext { this.chunkSize = 0; // number of documents in the current chunk this.chunkCount = 0; // number of chunks opened so far this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk + this.minified = minified; //if ((dom) && (format == 2)) dom = false; } - private void printHead(PrintWriter pw) { + private void printHead(final PrintWriter pw) { if (this.format == ExportFormat.html) { pw.println(""); } @@ -855,8 +871,8 @@ public final class Fulltext { pw.println(""); } } - - private void printTail(PrintWriter pw) { + + private void printTail(final PrintWriter pw) { if (this.format == ExportFormat.html) { pw.println(""); } @@ -869,7 +885,7 @@ public final class Fulltext { pw.println(""); } } - + @Override public void run() { try { @@ -881,9 +897,9 @@ public final class Fulltext { } try { - docCount = 0; - chunkSize = 0; - chunkCount = 0; + this.docCount = 0; + this.chunkSize = 0; + this.chunkCount = 0; PrintWriter pw = getWriter(); printHead(pw); if (this.dom) { @@ -902,6 +918,12 @@ public final class Fulltext { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { final String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + if (this.minified) { + final Iterator> i = doc.iterator(); + while (i.hasNext()) { + if (!minified_keys.contains(i.next().getKey())) i.remove(); + } + } final CRIgnoreWriter sw = new CRIgnoreWriter(); if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName())); if (this.format == ExportFormat.solr) EnhancedXMLResponseWriter.writeDoc(sw, doc); @@ -914,7 +936,8 @@ public final class Fulltext { if (this.chunkSize >= this.maxChunkSize) { printTail(pw); pw.close(); - pw = getWriter(); // increases chunkCount as side-effect + this.chunkCount++; + pw = getWriter(); printHead(pw); this.chunkSize = 0; } @@ -957,7 +980,8 @@ public final class Fulltext { if (this.chunkSize >= this.maxChunkSize) { printTail(pw); pw.close(); - pw = getWriter(); // increases chunkCount as side-effect + this.chunkCount++; + pw = getWriter(); printHead(pw); this.chunkSize = 0; } @@ -980,14 +1004,13 @@ public final class Fulltext { } private PrintWriter getWriter() throws IOException { - File f = file(); + final File f = file(); final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f); final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? 
new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os)); - this.chunkCount++; return pw; } - private String chunkcount(int count) { + private String chunkcount(final int count) { if (count < 10) return "000" + count; if (count < 100) return "00" + count; if (count < 1000) return "0" + count; From 656b3e3e771159791392b48a2866ace6ce12cf4b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 10:59:49 +0100 Subject: [PATCH 09/11] updated guava to latest and added missing library for failureaccess --- ivy.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ivy.xml b/ivy.xml index 8c072699d..85af1d3f3 100644 --- a/ivy.xml +++ b/ivy.xml @@ -14,7 +14,8 @@ - + + From ceb07a52186a5c8107bbd1fc73683cd9789f004b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 11:12:10 +0100 Subject: [PATCH 10/11] fixed problem with zim importer which crashed when non-valid urls appeared --- .../yacy/document/importer/ZimImporter.java | 109 +++++++++--------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index bc7266e0a..1b4095df6 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -108,58 +108,63 @@ public class ZimImporter extends Thread implements Importer { // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { - if (this.abort) break; - DirectoryEntry de = this.reader.getDirectoryInfo(i); - if (!(de instanceof ZIMReader.ArticleEntry)) continue; - ArticleEntry ae = (ArticleEntry) de; - if (ae.namespace != 'C' && ae.namespace != 'A') continue; - - // check url - DigestURL guessedUrl = guessURL(this.guessedSource, de); - if (recordCnt < 10) { - // critical test for the first 10 urls - if (!guessedUrl.exists(ClientIdentification.browserAgent)) { - sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); - return; - } - } - - // check availability of text parser - String mimeType = ae.getMimeType(); - if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible - if (TextParser.supportsMime(mimeType) != null) continue; - - // read the content - byte[] b = this.reader.getArticleData(ae); - - // create artificial request and response headers for the indexer - RequestHeader requestHeader = new RequestHeader(); - ResponseHeader responseHeader = new ResponseHeader(200); - responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content - responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessd date to have something that is not the current date - final Request request = new Request( - ASCII.getBytes(sb.peers.mySeed().hash), - guessedUrl, - null, // referrerhash the hash of the referrer URL - de.title, // name the name of the document to crawl - null, // appdate the time when the url was first time appeared - sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! 
- 0, // depth the crawling depth of the entry - sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset - ); - final Response response = new Response( - request, - requestHeader, - responseHeader, - Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, - false, - b - ); - - // throw this to the indexer - String error = sb.toIndexer(response); - if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); - this.recordCnt++; + try { + if (this.abort) break; + DirectoryEntry de = this.reader.getDirectoryInfo(i); + if (!(de instanceof ZIMReader.ArticleEntry)) continue; + ArticleEntry ae = (ArticleEntry) de; + if (ae.namespace != 'C' && ae.namespace != 'A') continue; + + // check url + DigestURL guessedUrl = guessURL(this.guessedSource, de); + if (recordCnt < 10) { + // critical test for the first 10 urls + if (!guessedUrl.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); + return; + } + } + + // check availability of text parser + String mimeType = ae.getMimeType(); + if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible + if (TextParser.supportsMime(mimeType) != null) continue; + + // read the content + byte[] b = this.reader.getArticleData(ae); + + // create artificial request and response headers for the indexer + RequestHeader requestHeader = new RequestHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); + responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content + responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessd date to have something that is not the current date + final Request request = new Request( + ASCII.getBytes(sb.peers.mySeed().hash), + guessedUrl, + null, // referrerhash the hash of the referrer URL + de.title, // name the name of the document to crawl + null, // appdate the time when the url was first time appeared + sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! 
+ 0, // depth the crawling depth of the entry + sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset + ); + final Response response = new Response( + request, + requestHeader, + responseHeader, + Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, + false, + b + ); + + // throw this to the indexer + String error = sb.toIndexer(response); + if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); + this.recordCnt++; + } catch (Exception e) { + // catch any error that could stop the importer + ConcurrentLog.info("ZimImporter", "error loading: " + e.getMessage()); + } } } catch (IOException e) { ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage()); From cff0991d850123dd5b9a7062b6df991c50fb26f5 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 16:41:19 +0100 Subject: [PATCH 11/11] test if this is helpful for https://github.com/yacy/yacy_search_server/issues/500 --- source/net/yacy/document/parser/sitemapParser.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index be52f72e7..2dd6ebdeb 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -49,7 +49,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; -import net.yacy.kelondro.io.ByteCountInputStream; import org.w3c.dom.CharacterData; import org.w3c.dom.Element; @@ -116,7 +115,8 @@ public class sitemapParser extends AbstractParser implements Parser { ConcurrentLog.info("SitemapReader", "loading sitemap from " + sitemapURL.toNormalform(true)); // client.setHeader(requestHeader.entrySet()); try (final HTTPClient client = new HTTPClient(agent)) { - client.GET(sitemapURL.toNormalform(false), false); + String url = sitemapURL.toNormalform(false); + client.GET(url, false); if (client.getStatusCode() != 200) { throw new IOException("Unable to download the sitemap file " + sitemapURL + "\nServer returned status: " + client.getHttpResponse().getStatusLine()); @@ -128,11 +128,10 @@ public class sitemapParser extends AbstractParser implements Parser { final String contentMimeType = header.mime(); InputStream contentStream = client.getContentstream(); - if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) { + if ((contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) || url.endsWith(".gz")) { contentStream = new GZIPInputStream(contentStream); } - final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null); - return new SitemapReader(counterStream, agent); + return new SitemapReader(contentStream, agent); } catch (final IOException e) { throw e; }
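
The sitemap change in this last patch reduces to a single decompression guard: the response stream is unwrapped with GZIPInputStream when either the Content-Type header or a ".gz" URL suffix signals gzip, and the ByteCountInputStream wrapper is dropped so the reader consumes the (possibly unwrapped) stream directly. A minimal sketch of that guard, with wrapIfGzip as a hypothetical helper name rather than an actual YaCy method:

import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

// Hypothetical helper isolating the gzip detection added to sitemapParser:
// unwrap when either the MIME type or a ".gz" URL suffix indicates gzip.
public final class SitemapStreams {

    public static InputStream wrapIfGzip(final InputStream in,
            final String contentMimeType, final String url) throws IOException {
        final boolean gzipMime = contentMimeType != null
                && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"));
        if (gzipMime || url.endsWith(".gz")) return new GZIPInputStream(in);
        return in;
    }
}
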