From 7db0534d8a0709a2903f1880e98aaa4657fbf462 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 5 Nov 2023 02:16:40 +0100
Subject: [PATCH] Added a zim parser to the surrogate import option.

You can now import zim files into YaCy by simply moving them to the
DATA/SURROGATE/IN folder. They will be fetched and, after parsing, moved to
DATA/SURROGATE/OUT. There are exceptions where the parser is not able to
identify the original URL of the documents in the zim file; in that case the
file is simply ignored. This commit also carries an important fix to the pdf
parser and an increase of the maximum parsing speed to 60000 PPM, which should
make it possible to index up to 1000 files per second.
---
 htroot/ConfigParser_p.html                    |  21 --
 htroot/Crawler_p.html                         |   4 +-
 ivy.xml                                       |   1 +
 .../cora/document/id/MultiProtocolURL.java    |  14 +-
 .../yacy/document/importer/ZimImporter.java   | 215 ++++++++++++++++--
 .../net/yacy/document/parser/pdfParser.java   | 126 +++-------
 source/net/yacy/htroot/ConfigParser_p.java    |  10 -
 source/net/yacy/htroot/Crawler_p.java         |  10 +-
 .../kelondro/data/meta/URIMetadataNode.java   |  12 +-
 source/net/yacy/search/Switchboard.java       |  18 +-
 .../net/yacy/search/SwitchboardConstants.java |   2 -
 source/org/openzim/ZIMFile.java               |  32 +--
 12 files changed, 279 insertions(+), 186 deletions(-)

diff --git a/htroot/ConfigParser_p.html b/htroot/ConfigParser_p.html
index a51ee1013..66a4665d3 100644
--- a/htroot/ConfigParser_p.html
+++ b/htroot/ConfigParser_p.html
@@ -51,27 +51,6 @@
-      PDF Parser Attributes
-
-      This is an experimental setting which makes it possible to split PDF documents into individual index entries.
-      Every page will become a single index hit and the URL is artificially extended with a post/get attribute containing
-      the page number as value. When such a URL is displayed within a search result, the post/get attribute is transformed
-      into an anchor hash link. This makes it possible to view the individual page directly in the pdf.js viewer built into
-      Firefox; for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options
-
-      Split PDF
-      Property Name
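The removed option minted one virtual index document per PDF page. A minimal sketch of the URL scheme it used, reconstructed from the deleted pdfParser code further below (helper names are illustrative, not part of the YaCy API):

    // A '#' fragment cannot be used for the virtual page URLs because it would
    // be stripped when the url hash is computed, so a post/get attribute is
    // appended instead; for display it becomes an anchor that pdf.js accepts.
    static String virtualPageUrl(String loc, String propertyName, int page) {
        return loc + (loc.indexOf('?') > 0 ? '&' : '?') + propertyName + '=' + page;
    }
    static String viewerAnchor(String loc, String propertyName, int page) {
        return loc + '#' + propertyName + '=' + page; // e.g. doc.pdf#page=3
    }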
      #%env/templates/footer.template%#

diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html
index 79a0319c0..3b328a996 100644
--- a/htroot/Crawler_p.html
+++ b/htroot/Crawler_p.html
@@ -134,7 +134,7 @@
             Speed / PPM
             (Pages Per Minute)
-
+
@@ -147,7 +147,7 @@
             Crawler PPM
-
+

diff --git a/ivy.xml b/ivy.xml
index 61f9ee127..8c072699d 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -28,6 +28,7 @@
+

diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index a3404bec0..1cac0dace 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -2593,14 +2593,18 @@ public class MultiProtocolURL implements Serializable, Comparable
         if (isHTTP() || isHTTPS()) {
-            try (final HTTPClient client = new HTTPClient(agent)) {
-                client.setHost(getHost());
-                org.apache.http.HttpResponse response = client.HEADResponse(this, true);
-                return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301);
-            }
+            final HTTPClient client = new HTTPClient(agent);
+            client.setHost(getHost());
+            org.apache.http.HttpResponse response = client.HEADResponse(this, true);
+            client.close();
+            if (response == null) return false;
+            int status = response.getStatusLine().getStatusCode();
+            return status == 200 || status == 301 || status == 302;
         }
         return false;
     } catch (IOException e) {
+        if (e.getMessage() != null && e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts
+        //e.printStackTrace();
         return false;
     }
 }

diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java
index 26f36f787..118e27e40 100644
--- a/source/net/yacy/document/importer/ZimImporter.java
+++ b/source/net/yacy/document/importer/ZimImporter.java
@@ -25,12 +25,20 @@ package net.yacy.document.importer;
 import java.io.File;
 import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.Collection;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;
+import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.ConcurrentLog;
@@ -81,14 +89,18 @@ public class ZimImporter extends Thread implements Importer {
     public void run() {
         job = this;
         this.startTime = System.currentTimeMillis();
+        Switchboard sb = Switchboard.getSwitchboard();
         try {
             this.reader = new ZIMReader(this.file);
             this.guessedSource = getSource(this.reader);

             // verify the source
             DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
-            DigestURL url = new DigestURL(mainEntry.url);
-            if (!url.exists(ClientIdentification.browserAgent)) return;
+            DigestURL mainURL = guessURL(this.guessedSource, mainEntry);
+            if (!mainURL.exists(ClientIdentification.browserAgent)) {
+                sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL);
+                return;
+            }

             // read all documents
             for (int i = 0; i < this.file.header_entryCount; i++) {
@@ -98,8 +110,14 @@ public class ZimImporter extends Thread implements Importer {
                 ArticleEntry ae = (ArticleEntry) de;

                 // check url
-                String guessedUrl = guessURL(this.guessedSource, de);
-                assert guessedUrl.startsWith("http");
+                DigestURL guessedUrl = guessURL(this.guessedSource, de);
+                if (recordCnt < 10) {
+                    // critical test for the first 10 urls
+                    if
 (!guessedUrl.exists(ClientIdentification.browserAgent)) {
+                        sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
+                        return;
+                    }
+                }

                 // check availability of text parser
                 String mimeType = ae.getMimeType();
@@ -111,7 +129,17 @@ public class ZimImporter extends Thread implements Importer {
                 // create artificial request and response headers for the indexer
                 RequestHeader requestHeader = new RequestHeader();
                 ResponseHeader responseHeader = new ResponseHeader(200);
-                final Request request = new Request(new DigestURL(guessedUrl), null);
+                responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
+                final Request request = new Request(
+                        ASCII.getBytes(sb.peers.mySeed().hash),
+                        guessedUrl,
+                        null, // referrerhash: the hash of the referrer URL
+                        de.title, // name: the name of the document to crawl
+                        null, // appdate: the time when the url appeared for the first time
+                        sb.crawler.defaultSurrogateProfile.handle(), // profileHandle: the name of the prefetch profile. This must not be null!
+                        0, // depth: the crawling depth of the entry
+                        sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
+                        );
                 final Response response = new Response(
                         request,
                         requestHeader,
@@ -122,7 +150,7 @@ public class ZimImporter extends Thread implements Importer {
                 );

                 // throw this to the indexer
-                String error = Switchboard.getSwitchboard().toIndexer(response);
+                String error = sb.toIndexer(response);
                 if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
                 this.recordCnt++;
             }
@@ -203,7 +231,7 @@ public class ZimImporter extends Thread implements Importer {
             case "fonts":
                 return "fonts.google.com";
             case "gutenberg":
-                return "gutenberg.org";
+                return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03";
             case "ifixit":
                 return "ifixit.com";
             case "lesfondamentaux":
@@ -223,11 +251,23 @@ public class ZimImporter extends Thread implements Importer {
             case "rapsberry_pi_docs":
                 return "raspberrypi.org";
             case "ted":
-                return "ted.com";
+                return "www.ted.com/search?q=";
             case "vikidia":
-                return "vikidia.org";
+                return parts[1] + ".vikidia.org/wiki";
             case "westeros":
                 return "westeros.org";
+            case "wikihow":
+                return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
+            case "wikisource":
+                return parts[1] + ".wikisource.org/wiki";
+            case "wikiversity":
+                return parts[1] + ".wikiversity.org/wiki";
+            case "wikivoyage":
+                return parts[1] + ".wikivoyage.org/wiki";
+            case "wiktionary":
+                return parts[1] + ".wiktionary.org/wiki";
+            case "wikiquote":
+                return parts[1] + ".wikiquote.org/wiki";
             case "wikibooks":
                 return parts[1] + ".wikibooks.org/wiki";
             case "wikinews":
@@ -273,16 +313,148 @@ public class ZimImporter extends Thread implements Importer {
         return source;
     }

-    public static String guessURL(String guessedSource, DirectoryEntry de) {
+    public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
         String url = de.url;
         if (url.equals("Main_Page")) url = "";
-        if (guessedSource != null) return guessedSource + url;
-        if (url.startsWith("A/")) return "https://" + url.substring(2);
-        if (url.startsWith("H/")) return "https://" + url.substring(2);
-        return guessedSource + url;
+        if (guessedSource != null) return new DigestURL(guessedSource + url);
+        if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
+        if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
+        return new DigestURL(guessedSource + url);
     }

+    private final static String[] skip_files = {
+        "iota.stackexchange.com_en_all_2023-05.zim", "stellar.stackexchange.com_en_all_2023-10.zim",
+        "vegetarianism.stackexchange.com_en_all_2023-05.zim", "esperanto.stackexchange.com_eo_all_2023-10.zim",
+        "tezos.stackexchange.com_en_all_2023-10.zim", "eosio.stackexchange.com_en_all_2023-10.zim",
+        "ebooks.stackexchange.com_en_all_2023-10.zim", "poker.stackexchange.com_en_all_2023-05.zim",
+        "cseducators.stackexchange.com_en_all_2023-10.zim", "iot.stackexchange.com_en_all_2023-05.zim",
+        "portuguese.stackexchange.com_pt_all_2023-04.zim", "portuguese.stackexchange.com_pt_all_2023-10.zim",
+        "italian.stackexchange.com_it_all_2023-05.zim", "monero.stackexchange.com_en_all_2022-11.zim",
+        "sustainability.stackexchange.com_en_all_2023-05.zim", "westeros_en_all_nopic_2021-03.zim",
+        "opensource.stackexchange.com_en_all_2023-10.zim", "tor.stackexchange.com_en_all_2023-05.zim",
+        "devops.stackexchange.com_en_all_2023-10.zim", "patents.stackexchange.com_en_all_2023-10.zim",
+        "stackapps.com_en_all_2023-05.zim", "hardwarerecs.stackexchange.com_en_all_2023-05.zim",
+        "hsm.stackexchange.com_en_all_2023-05.zim", "expatriates.stackexchange.com_en_all_2023-11.zim",
+        "opendata.stackexchange.com_en_all_2023-10.zim", "sports.stackexchange.com_en_all_2023-05.zim",
+        "wikinews_de_all_nopic_2023-10.zim", "computergraphics.stackexchange.com_en_all_2023-10.zim",
+        "tridion.stackexchange.com_en_all_2023-10.zim", "bioinformatics.stackexchange.com_en_all_2023-10.zim",
+        "expressionengine.stackexchange.com_en_all_2023-11.zim", "elementaryos.stackexchange.com_en_all_2023-10.zim",
+        "cstheory.stackexchange.com_en_all_2023-10.zim", "chess.stackexchange.com_en_all_2023-05.zim",
+        "vi.stackexchange.com_en_all_2023-05.zim", "fitness.stackexchange.com_en_all_2023-10.zim",
+        "pets.stackexchange.com_en_all_2023-05.zim", "french.stackexchange.com_fr_all_2023-10.zim",
+        "sqa.stackexchange.com_en_all_2023-05.zim", "islam.stackexchange.com_en_all_2023-05.zim",
+        "scicomp.stackexchange.com_en_all_2023-05.zim", "wikinews_en_all_nopic_2023-09.zim",
+        "ai.stackexchange.com_en_all_2023-10.zim", "boardgames.stackexchange.com_en_all_2023-05.zim",
+        "economics.stackexchange.com_en_all_2023-05.zim", "3dprinting.stackexchange.com_en_all_2023-07.zim",
+        "earthscience.stackexchange.com_en_all_2023-05.zim", "emacs.stackexchange.com_en_all_2023-10.zim",
+        "bitcoin.stackexchange.com_en_all_2023-05.zim", "philosophy.stackexchange.com_en_all_2023-05.zim",
+        "law.stackexchange.com_en_all_2023-05.zim", "astronomy.stackexchange.com_en_all_2023-05.zim",
+        "artofproblemsolving_en_all_nopic_2021-03.zim", "engineering.stackexchange.com_en_all_2023-05.zim",
+        "ja.stackoverflow.com_ja_all_2023-06.zim", "webmasters.stackexchange.com_en_all_2023-05.zim",
+        "anime.stackexchange.com_en_all_2023-10.zim", "cooking.stackexchange.com_en_all_2023-05.zim",
+        "arduino.stackexchange.com_en_all_2023-05.zim", "money.stackexchange.com_en_all_2023-05.zim",
+        "judaism.stackexchange.com_en_all_2023-05.zim", "ethereum.stackexchange.com_en_all_2023-05.zim",
+        "datascience.stackexchange.com_en_all_2023-10.zim", "academia.stackexchange.com_en_all_2023-10.zim",
+        "music.stackexchange.com_en_all_2023-05.zim", "cs.stackexchange.com_en_all_2023-03.zim",
+        "dsp.stackexchange.com_en_all_2023-05.zim", "biology.stackexchange.com_en_all_2023-05.zim",
+        "android.stackexchange.com_en_all_2023-10.zim", "bicycles.stackexchange.com_en_all_2023-05.zim",
+        "puzzling.stackexchange.com_en_all_2023-05.zim", "photo.stackexchange.com_en_all_2023-05.zim",
+        "aviation.stackexchange.com_en_all_2023-05.zim", "drupal.stackexchange.com_en_all_2023-05.zim",
+        "ux.stackexchange.com_en_all_2023-05.zim", "ell.stackexchange.com_en_all_2023-10.zim",
+        "openstreetmap-wiki_en_all_nopic_2023-05.zim", "softwareengineering.stackexchange.com_en_all_2023-05.zim",
+        "gaming.stackexchange.com_en_all_2023-10.zim", "mathematica.stackexchange.com_en_all_2023-10.zim",
+        "pt.stackoverflow.com_pt_all_2023-06.zim", "apple.stackexchange.com_en_all_2023-05.zim",
+        "diy.stackexchange.com_en_all_2023-08.zim", "es.stackoverflow.com_es_all_2023-06.zim",
+        "gis.stackexchange.com_en_all_2023-05.zim", "stats.stackexchange.com_en_all_2023-05.zim",
+        "physics.stackexchange.com_en_all_2023-05.zim", "serverfault.com_en_all_2023-05.zim",
+        "electronics.stackexchange.com_en_all_2023-05.zim", "tex.stackexchange.com_en_all_2023-05.zim",
+        "wikibooks_de_all_nopic_2021-03.zim", "askubuntu.com_en_all_2023-05.zim",
+        "superuser.com_en_all_2023-05.zim", "lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim",
+        "wikibooks_en_all_nopic_2021-03.zim", "courses.lumenlearning.com_en_all_2021-03.zim",
+        "wikipedia_de_all_nopic_2023-10.zim", "wikipedia_en_all_nopic_2023-10.zim",
+        "stackoverflow.com_en_all_nopic_2022-07.zim", "stackoverflow.com_en_all_2023-05.zim",
+        "armypubs_en_all_2023-08.zim", "vikidia_en_all_nopic_2023-09.zim",
+        "wikiquote_de_all_nopic_2023-10.zim", "wikiquote_en_all_nopic_2023-09.zim",
+        "wiktionary_de_all_nopic_2023-10.zim", "wiktionary_en_all_nopic_2023-10.zim",
+        "wikihow_de_maxi_2023-10.zim", "wikivoyage_de_all_nopic_2023-09.zim",
+        "wikiversity_de_all_nopic_2021-03.zim", "wikiversity_en_all_nopic_2021-03.zim",
+        "wikisource_de_all_nopic_2023-09.zim", "wikisource_en_all_nopic_2023-08.zim",
+        "ted_countdown_global_2023-09.zim", "ted_en_design_2023-09.zim",
+        "ted_en_business_2023-09.zim", "ted_en_global_issues_2023-09.zim",

+        // 302
+        "moderators.stackexchange.com_en_all_2023-05.zim", "beer.stackexchange.com_en_all_2023-05.zim",
+        "health.stackexchange.com_en_all_2023-05.zim", "avp.stackexchange.com_en_all_2023-05.zim",
+        "lowtechmagazine.com_en_all_2023-08.zim", "ifixit_de_all_2023-07.zim",
+        "ifixit_en_all_2023-10.zim", "der-postillon.com_de_all_2020-12.zim",
+        "wikihow_en_maxi_2023-03.zim",
+    };
+
     public static void main(String[] args) {
+        Set<String> skip = new HashSet<>();
+        for (String s: skip_files) skip.add(s);
         // zim file import test
         // will test mostly if domain names are included in zim file urls
         String zimFilesPath = args[0];
@@ -298,7 +470,10 @@ public class ZimImporter extends Thread implements Importer {
         }
         Collection<File> orderedFiles = orderedFileMap.values();
+        Set<String> files_ok = new LinkedHashSet<>();
+        Set<String> files_nok = new LinkedHashSet<>();
         for (File f: orderedFiles) {
+            if (skip.contains(f.getName())) continue;
             try {
                 ZIMFile z = new ZIMFile(f.getAbsolutePath());
                 ZIMReader r = new ZIMReader(z);
@@ -308,16 +483,20 @@
                 System.out.println("Namespace: " + de.namespace);
                 System.out.println("Title: " + de.title);
                 System.out.println("URL: " + de.url);
-                System.out.println("guessed domain: " + guessDomainName(f.getName()));
+                System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduce a source from the file name
                 String source = getSource(r);
-                System.out.println("guessed Source: " + source);
-                String mainURL = guessURL(source, de);
+                System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file
+                DigestURL mainURL = guessURL(source, de);
                 System.out.println("guessed main article: " + mainURL);
-                System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent));
+                boolean ok = mainURL.exists(ClientIdentification.browserAgent);
+                System.out.println("main article exists: " + ok);
+                if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName());
                 System.out.println();
             } catch (IOException e) {
                 e.printStackTrace();
             }
         }
+        System.out.println("ok files: " + files_ok.toString());
+        System.out.println("not-ok files: " + files_nok.toString());
     }
 }

diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 0ad6b2248..f02577244 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -53,7 +53,6 @@
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.text.PDFTextStripper;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
@@ -69,9 +68,6 @@
 public class pdfParser extends AbstractParser implements Parser {

-    public static boolean individualPages = false;
-    public static String individualPagePropertyname = "page";
-
     public pdfParser() {
         super("Acrobat Portable Document Parser");
         this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser {
         // get the links
         final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);

-        // get the fulltext (either per document or for each page)
-        final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);
-
-        if (individualPages) {
-            // this is a hack which stores individual pages of the source pdf into individual index documents
-            // the new documents will get a virtual link with a post argument page=X appended to the
original url - - // collect text - final int pagecount = pdfDoc.getNumberOfPages(); - final String[] pages = new String[pagecount]; - for (int page = 1; page <= pagecount; page++) { - stripper.setStartPage(page); - stripper.setEndPage(page); - pages[page - 1] = stripper.getText(pdfDoc); - //System.out.println("PAGE " + page + ": " + pages[page - 1]); - } - - // create individual documents for each page - assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); - result = new Document[Math.min(pages.length, pdflinks.size())]; - final String loc = location.toNormalform(true); - for (int page = 0; page < result.length; page++) { - result[page] = new Document( - new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), - pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), - null, - null, - false, - docDate); - } - } else { - // collect the whole text at once - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; - stripper.setEndPage(3); // get first 3 pages (always) - writer.append(stripper.getText(pdfDoc)); - contentBytes = writer.getBytes(); // remember text in case of interrupting thread - - if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read - stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) - stripper.setEndPage(Integer.MAX_VALUE); // set to default - // we start the pdf parsing in a separate thread to ensure that it can be terminated - final PDDocument pdfDocC = pdfDoc; - final Thread t = new Thread("pdfParser.getText:" + location) { - @Override - public void run() { - try { - writer.append(stripper.getText(pdfDocC)); - } catch (final Throwable e) {} - } - }; - t.start(); - t.join(3000); // pdfbox likes to forget to terminate ... 
 (quite often)
-                if (t.isAlive()) t.interrupt();
-                contentBytes = writer.getBytes(); // get final text before closing writer
-                writer.close(); // free writer resources
-            }
-
-            final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
-            for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
-            result = new Document[]{new Document(
-                    location,
-                    mimeType,
-                    StandardCharsets.UTF_8.name(),
-                    this,
-                    null,
-                    docKeywords,
-                    singleList(docTitle),
-                    docAuthor,
-                    docPublisher,
-                    null,
-                    null,
-                    0.0d, 0.0d,
-                    contentBytes,
-                    pdflinksCombined,
-                    null,
-                    null,
-                    false,
-                    docDate)};
-        }
+        // collect the whole text at once
+        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
+        byte[] contentBytes = new byte[0];
+        final PDFTextStripper stripper = new PDFTextStripper();
+        stripper.setEndPage(Integer.MAX_VALUE);
+        writer.append(stripper.getText(pdfDoc));
+        contentBytes = writer.getBytes(); // remember text in case of interrupting thread
+        writer.close(); // free writer resources
+
+        final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
+        for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
+        result = new Document[]{new Document(
+                location,
+                mimeType,
+                StandardCharsets.UTF_8.name(),
+                this,
+                null,
+                docKeywords,
+                singleList(docTitle),
+                docAuthor,
+                docPublisher,
+                null,
+                null,
+                0.0d, 0.0d,
+                contentBytes,
+                pdflinksCombined,
+                null,
+                null,
+                false,
+                docDate)};
     } catch (final Throwable e) {
         //throw new Parser.Failure(e.getMessage(), location);
     } finally {

diff --git a/source/net/yacy/htroot/ConfigParser_p.java b/source/net/yacy/htroot/ConfigParser_p.java
index e466d783b..943279382 100644
--- a/source/net/yacy/htroot/ConfigParser_p.java
+++ b/source/net/yacy/htroot/ConfigParser_p.java
@@ -61,13 +61,6 @@ public class ConfigParser_p {
             env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime());
             env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension());
         }
-
-        if (post.containsKey("pdfSettings")) {
-            env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages"));
-            env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page"));
-            pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
-            pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
-        }
     }

     int i = 0;
@@ -94,9 +87,6 @@ public class ConfigParser_p {
     prop.put("parser", i);

-    prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false));
-    prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"));
-
     // return rewrite properties
     return prop;
 }

diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java
index e95562713..8c898f558 100644
--- a/source/net/yacy/htroot/Crawler_p.java
+++ b/source/net/yacy/htroot/Crawler_p.java
@@ -774,7 +774,7 @@ public class Crawler_p {
     }
     /*
-     * PPM
+     * PPM LF MH
@@ -784,19 +784,19 @@
     if (post != null && post.containsKey("crawlingPerformance")) {
         final String crawlingPerformance = post.get("crawlingPerformance", "custom");
         final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
-        int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1);
+        int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1);
         try {
             wantedPPM = post.getInt("customPPM", wantedPPM);
         } catch (final NumberFormatException e) {}
         if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
-        if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000;
+        if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000;

         int wPPM = wantedPPM;
         if ( wPPM <= 0 ) {
             wPPM = 1;
         }
-        if ( wPPM >= 30000 ) {
-            wPPM = 30000;
+        if ( wPPM >= 60000 ) {
+            wPPM = 60000;
         }
         final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60

diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 33b797524..2d93ec8b7 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable

 * public static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

 *
 * Name of the setting how long indexing should pause after the last time the proxy was used, in milliseconds
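The PPM change in Crawler_p.java above is a straight inversion between pages per minute and the per-fetch busy-sleep pause. A minimal sketch of that round trip (constants as in the diff; method names illustrative):

    // 60000 ms per minute / PPM = pause in ms between fetches; the new
    // 60000 PPM ceiling therefore means a 1 ms pause, i.e. up to 1000
    // documents per second, matching the commit message.
    static int busySleepForPPM(int wantedPPM) {
        int wPPM = Math.max(1, Math.min(60000, wantedPPM)); // clamp to 1..60000
        return 60000 / wPPM;
    }
    static int ppmForBusySleep(long busySleepMs) {
        return busySleepMs == 0 ? 60000 : (int) (60000L / busySleepMs);
    }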

diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java
index 906bf30a9..a241507ab 100644
--- a/source/org/openzim/ZIMFile.java
+++ b/source/org/openzim/ZIMFile.java
@@ -113,20 +113,24 @@ public class ZIMFile extends File {
         }
         this.mimeTypeList = mList.toArray(new String[mList.size()]);

-        // Initialize the Url Pointer List
-        this.urlPtrListBlob = new byte[this.header_entryCount * 8];
-        mReader.seek(this.header_urlPtrPos);
-        RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
-
-        // Initialize the Title Pointer List
-        this.titlePtrListBlob = new byte[this.header_entryCount * 4];
-        mReader.seek(this.header_titlePtrPos);
-        RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
-
-        // Initialize the Cluster Pointer List
-        this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
-        mReader.seek(this.header_clusterPtrPos);
-        RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
+        try {
+            // Initialize the Url Pointer List
+            this.urlPtrListBlob = new byte[this.header_entryCount * 8];
+            mReader.seek(this.header_urlPtrPos);
+            RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
+
+            // Initialize the Title Pointer List
+            this.titlePtrListBlob = new byte[this.header_entryCount * 4];
+            mReader.seek(this.header_titlePtrPos);
+            RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
+
+            // Initialize the Cluster Pointer List
+            this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
+            mReader.seek(this.header_clusterPtrPos);
+            RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
+        } catch (IndexOutOfBoundsException e) {
+            throw new IOException(e.getMessage());
+        }
     }

     public final String getMimeType(int idx) {
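A note on the three blobs read above: the ZIM format stores one 8-byte little-endian offset per directory entry in the URL pointer list, one 4-byte index per entry in the title pointer list, and one 8-byte offset per cluster, which is why the blobs are sized entryCount * 8, entryCount * 4 and clusterCount * 8; the new catch clause turns a corrupt or oversized header count into a clean IOException instead of an uncaught IndexOutOfBoundsException. A minimal sketch of decoding one pointer from such a blob (method name illustrative, not part of ZIMFile; assumes the little-endian layout of the ZIM spec):

    // Decode the idx-th 8-byte little-endian pointer from the url pointer blob.
    static long urlPointerAt(byte[] urlPtrListBlob, int idx) {
        long v = 0;
        for (int i = 7; i >= 0; i--) {
            v = (v << 8) | (urlPtrListBlob[idx * 8 + i] & 0xFFL); // byte 0 is least significant
        }
        return v;
    }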