@@ -25,11 +25,22 @@ package net.yacy.document.importer;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
@@ -80,44 +91,80 @@ public class ZimImporter extends Thread implements Importer {
    public void run() {
        job = this;
        this.startTime = System.currentTimeMillis();
        Switchboard sb = Switchboard.getSwitchboard();
        try {
            this.reader = new ZIMReader(this.file);
            this.guessedSource = getSource(this.reader);
            Date guessedDate = getDate(this.reader);
            String dates = HeaderFramework.newRfc1123Format().format(guessedDate);

            // verify the source
            DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
            DigestURL mainURL = guessURL(this.guessedSource, mainEntry);
            if (!mainURL.exists(ClientIdentification.browserAgent)) {
                sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL);
                return;
            }

            // read all documents
            for (int i = 0; i < this.file.header_entryCount; i++) {
                try {
                    if (this.abort) break;
                    DirectoryEntry de = this.reader.getDirectoryInfo(i);
                    if (!(de instanceof ZIMReader.ArticleEntry)) continue;
                    ArticleEntry ae = (ArticleEntry) de;
                    if (ae.namespace != 'C' && ae.namespace != 'A') continue;

                    // check url
                    DigestURL guessedUrl = guessURL(this.guessedSource, de);
                    if (recordCnt < 10) {
                        // critical test for the first 10 urls
                        if (!guessedUrl.exists(ClientIdentification.browserAgent)) {
                            sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
                            return;
                        }
                    }

                    // check availability of a text parser
                    String mimeType = ae.getMimeType();
                    if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible
                    if (TextParser.supportsMime(mimeType) != null) continue;

                    // read the content
                    byte[] b = this.reader.getArticleData(ae);

                    // create artificial request and response headers for the indexer
                    RequestHeader requestHeader = new RequestHeader();
                    ResponseHeader responseHeader = new ResponseHeader(200);
                    responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell the parser which kind of content it gets
                    responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessed date to have something that is not the current date
                    final Request request = new Request(
                            ASCII.getBytes(sb.peers.mySeed().hash),
                            guessedUrl,
                            null,                                                // referrerhash: the hash of the referrer URL
                            de.title,                                            // name: the name of the document to crawl
                            null,                                                // appdate: the time when the url first appeared
                            sb.crawler.defaultSurrogateProfile.handle(),         // profileHandle: the name of the prefetch profile; this must not be null
                            0,                                                   // depth: the crawling depth of the entry
                            sb.crawler.defaultSurrogateProfile.timezoneOffset()  // timezone offset
                    );
                    final Response response = new Response(
                            request,
                            requestHeader,
                            responseHeader,
                            sb.crawler.defaultSurrogateProfile,
                            false,
                            b
                    );

                    // throw this to the indexer
                    String error = sb.toIndexer(response);
                    if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
                    this.recordCnt++;
                } catch (Exception e) {
                    // catch any error that could stop the importer
                    ConcurrentLog.info("ZimImporter", "error loading: " + e.getMessage());
                }
            }
        } catch (IOException e) {
            ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
@@ -195,8 +242,6 @@ public class ZimImporter extends Thread implements Importer {
return "fas.org" ;
case "fonts" :
return "fonts.google.com" ;
case "gutenberg" :
return "gutenberg.org" ;
case "ifixit" :
return "ifixit.com" ;
case "lesfondamentaux" :
@@ -216,11 +261,23 @@ public class ZimImporter extends Thread implements Importer {
case "rapsberry_pi_docs" :
return "raspberrypi.org" ;
case "ted" :
return " ted.com";
return " www. ted.com/search?q= ";
case "vikidia" :
return " vikidia.org";
return parts [ 1 ] + " . vikidia.org/wiki ";
case "westeros" :
return "westeros.org" ;
case "wikihow" :
return parts [ 1 ] . equals ( "en" ) ? "wikihow.com" : parts [ 1 ] + ".wikihow.com" ;
case "wikisource" :
return parts [ 1 ] + ".wikisource.org/wiki" ;
case "wikiversity" :
return parts [ 1 ] + ".wikiversity.org/wiki" ;
case "wikivoyage" :
return parts [ 1 ] + ".wikivoyage.org/wiki" ;
case "wiktionary" :
return parts [ 1 ] + ".wiktionary.org/wiki" ;
case "wikiquote" :
return parts [ 1 ] + ".wikiquote.org/wiki" ;
case "wikibooks" :
return parts [ 1 ] + ".wikibooks.org/wiki" ;
case "wikinews" :
@@ -266,16 +323,174 @@ public class ZimImporter extends Thread implements Importer {
        return source;
    }
    public static Date getDate(ZIMReader r) throws IOException {
        String date = r.getMetadata("Date");
        if (date != null) try {
            SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
            return format.parse(date);
        } catch (ParseException e) {}
        // failover situation: use the file date
        return new Date(r.getZIMFile().lastModified());
    }
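
    // Example (hypothetical value): a ZIM "Date" metadata entry such as "2023-10-04"
    // matches the "yyyy-MM-dd" pattern above; a missing or unparseable value falls
    // back to the last-modified time of the zim file itself.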
    public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
        String url = de.url;
        if (url.equals("Main_Page")) url = "";
        if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
        if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
        if (guessedSource != null) return new DigestURL(guessedSource + url);
        return new DigestURL(guessedSource + url);
    }
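
    // Sketch of the mapping above, assuming a hypothetical guessed source
    // "https://en.wikipedia.org/wiki/" (the real value comes from getSource):
    //   de.url = "Main_Page"          -> https://en.wikipedia.org/wiki/
    //   de.url = "Berlin"             -> https://en.wikipedia.org/wiki/Berlin
    //   de.url = "A/example.org/page" -> https://example.org/page   (old-style namespace prefix)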
    private final static String[] skip_files = {
        "iota.stackexchange.com_en_all_2023-05.zim",
        "stellar.stackexchange.com_en_all_2023-10.zim",
        "vegetarianism.stackexchange.com_en_all_2023-05.zim",
        "esperanto.stackexchange.com_eo_all_2023-10.zim",
        "tezos.stackexchange.com_en_all_2023-10.zim",
        "eosio.stackexchange.com_en_all_2023-10.zim",
        "ebooks.stackexchange.com_en_all_2023-10.zim",
        "poker.stackexchange.com_en_all_2023-05.zim",
        "cseducators.stackexchange.com_en_all_2023-10.zim",
        "iot.stackexchange.com_en_all_2023-05.zim",
        "portuguese.stackexchange.com_pt_all_2023-04.zim",
        "portuguese.stackexchange.com_pt_all_2023-10.zim",
        "italian.stackexchange.com_it_all_2023-05.zim",
        "monero.stackexchange.com_en_all_2022-11.zim",
        "sustainability.stackexchange.com_en_all_2023-05.zim",
        "westeros_en_all_nopic_2021-03.zim",
        "opensource.stackexchange.com_en_all_2023-10.zim",
        "tor.stackexchange.com_en_all_2023-05.zim",
        "devops.stackexchange.com_en_all_2023-10.zim",
        "patents.stackexchange.com_en_all_2023-10.zim",
        "stackapps.com_en_all_2023-05.zim",
        "hardwarerecs.stackexchange.com_en_all_2023-05.zim",
        "hsm.stackexchange.com_en_all_2023-05.zim",
        "expatriates.stackexchange.com_en_all_2023-11.zim",
        "opendata.stackexchange.com_en_all_2023-10.zim",
        "sports.stackexchange.com_en_all_2023-05.zim",
        "wikinews_de_all_nopic_2023-10.zim",
        "computergraphics.stackexchange.com_en_all_2023-10.zim",
        "tridion.stackexchange.com_en_all_2023-10.zim",
        "bioinformatics.stackexchange.com_en_all_2023-10.zim",
        "expressionengine.stackexchange.com_en_all_2023-11.zim",
        "elementaryos.stackexchange.com_en_all_2023-10.zim",
        "cstheory.stackexchange.com_en_all_2023-10.zim",
        "chess.stackexchange.com_en_all_2023-05.zim",
        "vi.stackexchange.com_en_all_2023-05.zim",
        "fitness.stackexchange.com_en_all_2023-10.zim",
        "pets.stackexchange.com_en_all_2023-05.zim",
        "french.stackexchange.com_fr_all_2023-10.zim",
        "sqa.stackexchange.com_en_all_2023-05.zim",
        "islam.stackexchange.com_en_all_2023-05.zim",
        "scicomp.stackexchange.com_en_all_2023-05.zim",
        "wikinews_en_all_nopic_2023-09.zim",
        "ai.stackexchange.com_en_all_2023-10.zim",
        "boardgames.stackexchange.com_en_all_2023-05.zim",
        "economics.stackexchange.com_en_all_2023-05.zim",
        "3dprinting.stackexchange.com_en_all_2023-07.zim",
        "earthscience.stackexchange.com_en_all_2023-05.zim",
        "emacs.stackexchange.com_en_all_2023-10.zim",
        "bitcoin.stackexchange.com_en_all_2023-05.zim",
        "philosophy.stackexchange.com_en_all_2023-05.zim",
        "law.stackexchange.com_en_all_2023-05.zim",
        "astronomy.stackexchange.com_en_all_2023-05.zim",
        "artofproblemsolving_en_all_nopic_2021-03.zim",
        "engineering.stackexchange.com_en_all_2023-05.zim",
        "ja.stackoverflow.com_ja_all_2023-06.zim",
        "webmasters.stackexchange.com_en_all_2023-05.zim",
        "anime.stackexchange.com_en_all_2023-10.zim",
        "cooking.stackexchange.com_en_all_2023-05.zim",
        "arduino.stackexchange.com_en_all_2023-05.zim",
        "money.stackexchange.com_en_all_2023-05.zim",
        "judaism.stackexchange.com_en_all_2023-05.zim",
        "ethereum.stackexchange.com_en_all_2023-05.zim",
        "datascience.stackexchange.com_en_all_2023-10.zim",
        "academia.stackexchange.com_en_all_2023-10.zim",
        "music.stackexchange.com_en_all_2023-05.zim",
        "cs.stackexchange.com_en_all_2023-03.zim",
        "dsp.stackexchange.com_en_all_2023-05.zim",
        "biology.stackexchange.com_en_all_2023-05.zim",
        "android.stackexchange.com_en_all_2023-10.zim",
        "bicycles.stackexchange.com_en_all_2023-05.zim",
        "puzzling.stackexchange.com_en_all_2023-05.zim",
        "photo.stackexchange.com_en_all_2023-05.zim",
        "aviation.stackexchange.com_en_all_2023-05.zim",
        "drupal.stackexchange.com_en_all_2023-05.zim",
        "ux.stackexchange.com_en_all_2023-05.zim",
        "ell.stackexchange.com_en_all_2023-10.zim",
        "openstreetmap-wiki_en_all_nopic_2023-05.zim",
        "softwareengineering.stackexchange.com_en_all_2023-05.zim",
        "gaming.stackexchange.com_en_all_2023-10.zim",
        "mathematica.stackexchange.com_en_all_2023-10.zim",
        "pt.stackoverflow.com_pt_all_2023-06.zim",
        "apple.stackexchange.com_en_all_2023-05.zim",
        "diy.stackexchange.com_en_all_2023-08.zim",
        "es.stackoverflow.com_es_all_2023-06.zim",
        "gis.stackexchange.com_en_all_2023-05.zim",
        "stats.stackexchange.com_en_all_2023-05.zim",
        "physics.stackexchange.com_en_all_2023-05.zim",
        "serverfault.com_en_all_2023-05.zim",
        "electronics.stackexchange.com_en_all_2023-05.zim",
        "tex.stackexchange.com_en_all_2023-05.zim",
        "wikibooks_de_all_nopic_2021-03.zim",
        "askubuntu.com_en_all_2023-05.zim",
        "superuser.com_en_all_2023-05.zim",
        "lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim",
        "wikibooks_en_all_nopic_2021-03.zim",
        "courses.lumenlearning.com_en_all_2021-03.zim",
        "wikipedia_de_all_nopic_2023-10.zim",
        "wikipedia_en_all_nopic_2023-10.zim",
        "stackoverflow.com_en_all_nopic_2022-07.zim",
        "stackoverflow.com_en_all_2023-05.zim",
        "armypubs_en_all_2023-08.zim",
        "vikidia_en_all_nopic_2023-09.zim",
        "wikiquote_de_all_nopic_2023-10.zim",
        "wikiquote_en_all_nopic_2023-09.zim",
        "wiktionary_de_all_nopic_2023-10.zim",
        "wiktionary_en_all_nopic_2023-10.zim",
        "wikihow_de_maxi_2023-10.zim",
        "wikivoyage_de_all_nopic_2023-09.zim",
        "wikiversity_de_all_nopic_2021-03.zim",
        "wikiversity_en_all_nopic_2021-03.zim",
        "wikisource_de_all_nopic_2023-09.zim",
        "wikisource_en_all_nopic_2023-08.zim",
        "ted_countdown_global_2023-09.zim",
        "ted_en_design_2023-09.zim",
        "ted_en_business_2023-09.zim",
        "ted_en_global_issues_2023-09.zim",
        "opentextbooks_en_all_2023-08.zim",
        "bestedlessons.org_en_all_2023-08.zim",
        "wikivoyage_en_all_nopic_2023-10.zim",
        "based.cooking_en_all_2023-10.zim",
        "wordnet_en_all_2023-04.zim",
        "internet-encyclopedia-philosophy_en_all_2023-08.zim",
        "100r-off-the-grid_en_2023-09.zim",
        "coopmaths_2023-04.zim",
        "birds-of-ladakh_en_all_2023-02.zim",
        "storyweaver.org_en_2023-09.zim",
        "developer.mozilla.org_en_all_2023-02.zim",
        "www.ready.gov_es_2023-06.zim",
        "teoria.com_en_2023-08.zim",
        "theworldfactbook_en_all_2023-06.zim",
        "mutopiaproject.org_en_2023-08.zim",
        "dp.la_en_all_2023-08.zim",
        // 302
        "moderators.stackexchange.com_en_all_2023-05.zim",
        "beer.stackexchange.com_en_all_2023-05.zim",
        "health.stackexchange.com_en_all_2023-05.zim",
        "avp.stackexchange.com_en_all_2023-05.zim",
        "lowtechmagazine.com_en_all_2023-08.zim",
        "ifixit_de_all_2023-07.zim",
        "ifixit_en_all_2023-10.zim",
        "der-postillon.com_de_all_2020-12.zim",
        "wikihow_en_maxi_2023-03.zim",
    };
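
    // Note: the names above are matched verbatim against f.getName() in main() below,
    // so a newer dump of the same source (with a different date suffix) is tested again.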
    public static void main(String[] args) {
        Set<String> skip = new HashSet<>();
        for (String s : skip_files) skip.add(s);

        // zim file import test
        // will test mostly if domain names are included in zim file urls
        String zimFilesPath = args[0];
@@ -291,7 +506,10 @@ public class ZimImporter extends Thread implements Importer {
        }
        Collection<File> orderedFiles = orderedFileMap.values();
        Set<String> files_ok = new LinkedHashSet<>();
        Set<String> files_nok = new LinkedHashSet<>();
        for (File f : orderedFiles) {
            if (skip.contains(f.getName())) continue;
            try {
                ZIMFile z = new ZIMFile(f.getAbsolutePath());
                ZIMReader r = new ZIMReader(z);
@@ -301,14 +519,21 @@ public class ZimImporter extends Thread implements Importer {
System . out . println ( "Namespace: " + de . namespace ) ;
System . out . println ( "Title: " + de . title ) ;
System . out . println ( "URL: " + de . url ) ;
System . out . println ( "guessed domain: " + guessDomainName ( f . getName ( ) ) ) ;
System . out . println ( "Mime Type " + de . getMimeType ( ) ) ;
System . out . println ( "guessed domain: " + guessDomainName ( f . getName ( ) ) ) ; // uses a table and rules that deduces a source from the file name
                String source = getSource(r);
System . out . println ( "guessed Source: " + source ) ;
System . out . println ( "guessed main article: " + guessURL ( source , de ) ) ;
System . out . println ( "guessed Source: " + source ) ; // this uses metadata stored in the zim file
DigestURL mainURL = guessURL ( source , de ) ;
System . out . println ( "guessed main article: " + mainURL ) ;
                boolean ok = mainURL.exists(ClientIdentification.browserAgent);
                System.out.println("main article exists: " + ok);
                if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName());
                System.out.println();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println("ok files: " + files_ok.toString());
        System.out.println("not-ok files: " + files_nok.toString());
    }
}