Added a zim parser to the surrogate import option.

You can now import zim files into YaCy by simply moving them
to the DATA/SURROGATE/IN folder. They will be picked up automatically
and, after parsing, moved to DATA/SURROGATE/OUT.
There are exceptions where the parser cannot identify the
original URL of the documents in the zim file; in that case the file
is simply ignored.
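
For example, an import is nothing more than a file move into the IN folder; a
minimal sketch in Java (assuming the YaCy application directory as working
directory; the zim file name is only an example):

    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.StandardCopyOption;

    public class DropZimIntoSurrogateIn {
        public static void main(String[] args) throws Exception {
            // YaCy polls this folder and imports any supported surrogate file
            Path in = Path.of("DATA/SURROGATE/IN");
            Files.createDirectories(in);
            // after a successful import YaCy moves the file to DATA/SURROGATE/OUT
            Files.move(Path.of("wikipedia_en_all_nopic_2023-10.zim"),
                    in.resolve("wikipedia_en_all_nopic_2023-10.zim"),
                    StandardCopyOption.REPLACE_EXISTING);
        }
    }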
This commit also carries an important fix to the pdf parser and an
increase of the maximum parsing speed to 60000 PPM (pages per minute),
which should make it possible to index up to 60000 / 60 = 1000 pages per second.
pull/610/head
Michael Peter Christen 1 year ago
parent 70e29937ef
commit 7db0534d8a

@@ -51,27 +51,6 @@
</tr>
</table>
</fieldset>
<fieldset><legend id="parser">PDF Parser Attributes</legend>
<p>
This is an experimental setting which makes it possible to split PDF documents into individual index entries.
Every page will become a single index hit and the url is artificially extended with a post/get attribute value containing
the page number as value. When such an url is displayed within a search result, the post/get attribute is transformed into an anchor hash link.
This makes it possible to view the individual page directly in the pdf.js viewer built into Firefox;
for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options
</p>
<table border="0">
<tr class="TableCellLight">
<td class="small" align="right" width="90">Split PDF</td>
<td class="small" align="left" width="300"><input type="checkbox" name="individualPages" #(individualPages)#::checked="checked" #(/individualPages)#/></td>
</tr>
<tr class="TableCellLight">
<td class="small" align="right">Property Name</td>
<td class="small" align="left"><input type="text" name="individualPagePropertyname" value="#[individualPagePropertyname]#"/></td>
</tr>
<tr class="TableCellDark">
<td colspan="3" class="small" ><input type="submit" name="pdfSettings" value="Submit" class="btn btn-primary"/></td>
</tr>
</table>
</form>
#%env/templates/footer.template%#
</body>

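For reference, this is the url rewrite the removed Split PDF feature performed
(mirroring the urlstring() code deleted further below; all values are examples):

    public class PdfPageAnchorRewrite {
        public static void main(String[] args) {
            String u = "https://example.com/paper.pdf?page=5"; // example value
            String pageprop = "page";                          // individualPagePropertyname
            int p = u.lastIndexOf(pageprop + "=");
            if (p > 0) {
                // "?page=5" becomes "#page=5" so pdf.js opens the pdf at page 5
                u = u.substring(0, p - 1) + "#page=" + u.substring(p + pageprop.length() + 1);
            }
            System.out.println(u); // https://example.com/paper.pdf#page=5
        }
    }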
@@ -134,7 +134,7 @@
<tr class="TableCellLight">
<td align="left">Speed / PPM<br/>(Pages Per Minute)</td>
<td align="left" colspan="4">
<input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label>
<input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label>
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:3.5em" value="#[latencyFactorDefault]#" />
<label for="latencyFactor"><abbr title="Latency Factor">LF</abbr></label>
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:3em" value="#[MaxSameHostInQueueDefault]#" />
@@ -147,7 +147,7 @@
<td align="left">Crawler PPM</td>
<td align="left" width="60"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left" width="260px" colspan="3">
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/>
<progress id="ppmbar" max="60000" value="0" style="width:94%;"/>
</td>
</tr>
<tr class="TableCellLight">

@@ -28,6 +28,7 @@
<dependency org="io.opentracing" name="opentracing-noop" rev="0.33.0"/>
<dependency org="io.opentracing" name="opentracing-util" rev="0.33.0"/>
<dependency org="javax.servlet" name="javax.servlet-api" rev="3.1.0"/>
<dependency org="javainetlocator" name="inetaddresslocator" rev="2.18" />
<dependency org="jcifs" name="jcifs" rev="1.3.17" conf="compile->master" />
<dependency org="net.arnx" name="jsonic" rev="1.3.10"/>
<dependency org="net.jthink" name="jaudiotagger" rev="2.2.5"/>

@@ -2593,14 +2593,18 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return client.fileSize(path) > 0;
}
if (isHTTP() || isHTTPS()) {
try (final HTTPClient client = new HTTPClient(agent)) {
client.setHost(getHost());
org.apache.http.HttpResponse response = client.HEADResponse(this, true);
return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301);
}
final HTTPClient client = new HTTPClient(agent);
client.setHost(getHost());
org.apache.http.HttpResponse response = client.HEADResponse(this, true);
client.close();
if (response == null) return false;
int status = response.getStatusLine().getStatusCode();
return status == 200 || status == 301 || status == 302;
}
return false;
} catch (IOException e) {
if (e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts
//e.printStackTrace();
return false;
}
}
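
The rewritten check now counts 200, 301 and 302 as proof of existence, and a
"Circular redirect" error is accepted as an implicit 302. A JDK-only sketch
with the same status semantics (a hypothetical helper, not the YaCy API):

    public class HeadProbe {
        // hedged sketch: HEAD-based existence probe using only the JDK
        static boolean headExists(String url) {
            try {
                java.net.HttpURLConnection c = (java.net.HttpURLConnection)
                        java.net.URI.create(url).toURL().openConnection();
                c.setRequestMethod("HEAD");
                c.setInstanceFollowRedirects(false); // see 301/302 directly
                int status = c.getResponseCode();
                c.disconnect();
                return status == 200 || status == 301 || status == 302;
            } catch (java.io.IOException e) {
                return false;
            }
        }
    }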

@@ -25,12 +25,20 @@ package net.yacy.document.importer;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
@@ -81,14 +89,18 @@ public class ZimImporter extends Thread implements Importer {
public void run() {
job = this;
this.startTime = System.currentTimeMillis();
Switchboard sb = Switchboard.getSwitchboard();
try {
this.reader = new ZIMReader(this.file);
this.guessedSource = getSource(this.reader);
// verify the source
DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
DigestURL url = new DigestURL(mainEntry.url);
if (!url.exists(ClientIdentification.browserAgent)) return;
DigestURL mainURL = guessURL(this.guessedSource, mainEntry);
if (!mainURL.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL);
return;
}
// read all documents
for (int i = 0; i < this.file.header_entryCount; i++) {
@@ -98,8 +110,14 @@ public class ZimImporter extends Thread implements Importer {
ArticleEntry ae = (ArticleEntry) de;
// check url
String guessedUrl = guessURL(this.guessedSource, de);
assert guessedUrl.startsWith("http");
DigestURL guessedUrl = guessURL(this.guessedSource, de);
if (recordCnt < 10) {
// critical test for the first 10 urls
if (!guessedUrl.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
return;
}
}
// check availability of text parser
String mimeType = ae.getMimeType();
@@ -111,7 +129,17 @@ public class ZimImporter extends Thread implements Importer {
// create artificial request and response headers for the indexer
RequestHeader requestHeader = new RequestHeader();
ResponseHeader responseHeader = new ResponseHeader(200);
final Request request = new Request(new DigestURL(guessedUrl), null);
responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
final Request request = new Request(
ASCII.getBytes(sb.peers.mySeed().hash),
guessedUrl,
null, // referrerhash the hash of the referrer URL
de.title, // name the name of the document to crawl
null, // appdate the time when the url was first time appeared
sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null!
0, // depth the crawling depth of the entry
sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
);
final Response response = new Response(
request,
requestHeader,
@@ -122,7 +150,7 @@ public class ZimImporter extends Thread implements Importer {
);
// throw this to the indexer
String error = Switchboard.getSwitchboard().toIndexer(response);
String error = sb.toIndexer(response);
if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
this.recordCnt++;
}
@@ -203,7 +231,7 @@ public class ZimImporter extends Thread implements Importer {
case "fonts":
return "fonts.google.com";
case "gutenberg":
return "gutenberg.org";
return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03";
case "ifixit":
return "ifixit.com";
case "lesfondamentaux":
@@ -223,11 +251,23 @@ public class ZimImporter extends Thread implements Importer {
case "rapsberry_pi_docs":
return "raspberrypi.org";
case "ted":
return "ted.com";
return "www.ted.com/search?q=";
case "vikidia":
return "vikidia.org";
return parts[1] + ".vikidia.org/wiki";
case "westeros":
return "westeros.org";
case "wikihow":
return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
case "wikisource":
return parts[1] + ".wikisource.org/wiki";
case "wikiversity":
return parts[1] + ".wikiversity.org/wiki";
case "wikivoyage":
return parts[1] + ".wikivoyage.org/wiki";
case "wiktionary":
return parts[1] + ".wiktionary.org/wiki";
case "wikiquote":
return parts[1] + ".wikiquote.org/wiki";
case "wikibooks":
return parts[1] + ".wikibooks.org/wiki";
case "wikinews":
@@ -273,16 +313,148 @@ public class ZimImporter extends Thread implements Importer {
return source;
}
public static String guessURL(String guessedSource, DirectoryEntry de) {
public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
String url = de.url;
if (url.equals("Main_Page")) url = "";
if (guessedSource != null) return guessedSource + url;
if (url.startsWith("A/")) return "https://" + url.substring(2);
if (url.startsWith("H/")) return "https://" + url.substring(2);
return guessedSource + url;
if (guessedSource != null) return new DigestURL(guessedSource + url);
if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
return new DigestURL(guessedSource + url);
}
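
For illustration, two traces through guessURL with assumed example values
(not taken from the commit):

    // 1) a source was guessed: source and entry url are simply concatenated
    //    guessedSource = "en.wikipedia.org/wiki/", de.url = "YaCy"
    //    -> new DigestURL("en.wikipedia.org/wiki/YaCy")
    // 2) no source, but the entry url carries a zim namespace prefix:
    //    guessedSource = null, de.url = "A/en.wikipedia.org/wiki/YaCy"
    //    -> new DigestURL("https://en.wikipedia.org/wiki/YaCy")
    // note: if guessedSource is null and there is no "A/" or "H/" prefix,
    // the final fallback concatenates the literal string "null" with the url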
private final static String[] skip_files = {
"iota.stackexchange.com_en_all_2023-05.zim",
"stellar.stackexchange.com_en_all_2023-10.zim",
"vegetarianism.stackexchange.com_en_all_2023-05.zim",
"esperanto.stackexchange.com_eo_all_2023-10.zim",
"tezos.stackexchange.com_en_all_2023-10.zim",
"eosio.stackexchange.com_en_all_2023-10.zim",
"ebooks.stackexchange.com_en_all_2023-10.zim",
"poker.stackexchange.com_en_all_2023-05.zim",
"cseducators.stackexchange.com_en_all_2023-10.zim",
"iot.stackexchange.com_en_all_2023-05.zim",
"portuguese.stackexchange.com_pt_all_2023-04.zim",
"portuguese.stackexchange.com_pt_all_2023-10.zim",
"italian.stackexchange.com_it_all_2023-05.zim",
"monero.stackexchange.com_en_all_2022-11.zim",
"sustainability.stackexchange.com_en_all_2023-05.zim",
"westeros_en_all_nopic_2021-03.zim",
"opensource.stackexchange.com_en_all_2023-10.zim",
"tor.stackexchange.com_en_all_2023-05.zim",
"devops.stackexchange.com_en_all_2023-10.zim",
"patents.stackexchange.com_en_all_2023-10.zim",
"stackapps.com_en_all_2023-05.zim",
"hardwarerecs.stackexchange.com_en_all_2023-05.zim",
"hsm.stackexchange.com_en_all_2023-05.zim",
"expatriates.stackexchange.com_en_all_2023-11.zim",
"opendata.stackexchange.com_en_all_2023-10.zim",
"sports.stackexchange.com_en_all_2023-05.zim",
"wikinews_de_all_nopic_2023-10.zim",
"computergraphics.stackexchange.com_en_all_2023-10.zim",
"tridion.stackexchange.com_en_all_2023-10.zim",
"bioinformatics.stackexchange.com_en_all_2023-10.zim",
"expressionengine.stackexchange.com_en_all_2023-11.zim",
"elementaryos.stackexchange.com_en_all_2023-10.zim",
"cstheory.stackexchange.com_en_all_2023-10.zim",
"chess.stackexchange.com_en_all_2023-05.zim",
"vi.stackexchange.com_en_all_2023-05.zim",
"fitness.stackexchange.com_en_all_2023-10.zim",
"pets.stackexchange.com_en_all_2023-05.zim",
"french.stackexchange.com_fr_all_2023-10.zim",
"sqa.stackexchange.com_en_all_2023-05.zim",
"islam.stackexchange.com_en_all_2023-05.zim",
"scicomp.stackexchange.com_en_all_2023-05.zim",
"wikinews_en_all_nopic_2023-09.zim",
"ai.stackexchange.com_en_all_2023-10.zim",
"boardgames.stackexchange.com_en_all_2023-05.zim",
"economics.stackexchange.com_en_all_2023-05.zim",
"3dprinting.stackexchange.com_en_all_2023-07.zim",
"earthscience.stackexchange.com_en_all_2023-05.zim",
"emacs.stackexchange.com_en_all_2023-10.zim",
"bitcoin.stackexchange.com_en_all_2023-05.zim",
"philosophy.stackexchange.com_en_all_2023-05.zim",
"law.stackexchange.com_en_all_2023-05.zim",
"astronomy.stackexchange.com_en_all_2023-05.zim",
"artofproblemsolving_en_all_nopic_2021-03.zim",
"engineering.stackexchange.com_en_all_2023-05.zim",
"ja.stackoverflow.com_ja_all_2023-06.zim",
"webmasters.stackexchange.com_en_all_2023-05.zim",
"anime.stackexchange.com_en_all_2023-10.zim",
"cooking.stackexchange.com_en_all_2023-05.zim",
"arduino.stackexchange.com_en_all_2023-05.zim",
"money.stackexchange.com_en_all_2023-05.zim",
"judaism.stackexchange.com_en_all_2023-05.zim",
"ethereum.stackexchange.com_en_all_2023-05.zim",
"datascience.stackexchange.com_en_all_2023-10.zim",
"academia.stackexchange.com_en_all_2023-10.zim",
"music.stackexchange.com_en_all_2023-05.zim",
"cs.stackexchange.com_en_all_2023-03.zim",
"dsp.stackexchange.com_en_all_2023-05.zim",
"biology.stackexchange.com_en_all_2023-05.zim",
"android.stackexchange.com_en_all_2023-10.zim",
"bicycles.stackexchange.com_en_all_2023-05.zim",
"puzzling.stackexchange.com_en_all_2023-05.zim",
"photo.stackexchange.com_en_all_2023-05.zim",
"aviation.stackexchange.com_en_all_2023-05.zim",
"drupal.stackexchange.com_en_all_2023-05.zim",
"ux.stackexchange.com_en_all_2023-05.zim",
"ell.stackexchange.com_en_all_2023-10.zim",
"openstreetmap-wiki_en_all_nopic_2023-05.zim",
"softwareengineering.stackexchange.com_en_all_2023-05.zim",
"gaming.stackexchange.com_en_all_2023-10.zim",
"mathematica.stackexchange.com_en_all_2023-10.zim",
"pt.stackoverflow.com_pt_all_2023-06.zim",
"apple.stackexchange.com_en_all_2023-05.zim",
"diy.stackexchange.com_en_all_2023-08.zim",
"es.stackoverflow.com_es_all_2023-06.zim",
"gis.stackexchange.com_en_all_2023-05.zim",
"stats.stackexchange.com_en_all_2023-05.zim",
"physics.stackexchange.com_en_all_2023-05.zim",
"serverfault.com_en_all_2023-05.zim",
"electronics.stackexchange.com_en_all_2023-05.zim",
"tex.stackexchange.com_en_all_2023-05.zim",
"wikibooks_de_all_nopic_2021-03.zim",
"askubuntu.com_en_all_2023-05.zim",
"superuser.com_en_all_2023-05.zim",
"lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim",
"wikibooks_en_all_nopic_2021-03.zim",
"courses.lumenlearning.com_en_all_2021-03.zim",
"wikipedia_de_all_nopic_2023-10.zim",
"wikipedia_en_all_nopic_2023-10.zim",
"stackoverflow.com_en_all_nopic_2022-07.zim",
"stackoverflow.com_en_all_2023-05.zim",
"armypubs_en_all_2023-08.zim",
"vikidia_en_all_nopic_2023-09.zim",
"wikiquote_de_all_nopic_2023-10.zim",
"wikiquote_en_all_nopic_2023-09.zim",
"wiktionary_de_all_nopic_2023-10.zim",
"wiktionary_en_all_nopic_2023-10.zim",
"wikihow_de_maxi_2023-10.zim",
"wikivoyage_de_all_nopic_2023-09.zim",
"wikiversity_de_all_nopic_2021-03.zim",
"wikiversity_en_all_nopic_2021-03.zim",
"wikisource_de_all_nopic_2023-09.zim",
"wikisource_en_all_nopic_2023-08.zim",
"ted_countdown_global_2023-09.zim",
"ted_en_design_2023-09.zim",
"ted_en_business_2023-09.zim",
"ted_en_global_issues_2023-09.zim",
// 302
"moderators.stackexchange.com_en_all_2023-05.zim",
"beer.stackexchange.com_en_all_2023-05.zim",
"health.stackexchange.com_en_all_2023-05.zim",
"avp.stackexchange.com_en_all_2023-05.zim",
"lowtechmagazine.com_en_all_2023-08.zim",
"ifixit_de_all_2023-07.zim",
"ifixit_en_all_2023-10.zim",
"der-postillon.com_de_all_2020-12.zim",
"wikihow_en_maxi_2023-03.zim",
};
public static void main(String[] args) {
Set<String> skip = new HashSet<>();
for (String s: skip_files) skip.add(s);
// zim file import test
// will test mostly if domain names are included in zim file urls
String zimFilesPath = args[0];
@@ -298,7 +470,10 @@ public class ZimImporter extends Thread implements Importer {
}
Collection<File> orderedFiles = orderedFileMap.values();
Set<String> files_ok = new LinkedHashSet<>();
Set<String> files_nok = new LinkedHashSet<>();
for (File f: orderedFiles) {
if (skip.contains(f.getName())) continue;
try {
ZIMFile z = new ZIMFile(f.getAbsolutePath());
ZIMReader r = new ZIMReader(z);
@@ -308,16 +483,20 @@ public class ZimImporter extends Thread implements Importer {
System.out.println("Namespace: " + de.namespace);
System.out.println("Title: " + de.title);
System.out.println("URL: " + de.url);
System.out.println("guessed domain: " + guessDomainName(f.getName()));
System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name
String source = getSource(r);
System.out.println("guessed Source: " + source);
String mainURL = guessURL(source, de);
System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file
DigestURL mainURL = guessURL(source, de);
System.out.println("guessed main article: " + mainURL);
System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent));
boolean ok = mainURL.exists(ClientIdentification.browserAgent);
System.out.println("main article exists: " + ok);
if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName());
System.out.println();
} catch (IOException e) {
e.printStackTrace();
}
}
System.out.println("ok files: " + files_ok.toString());
System.out.println("not-ok files: " + files_nok.toString());
}
}
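
An assumed invocation of this test harness (class path details omitted);
args[0] is the directory holding the zim files:

    // java -cp ... net.yacy.document.importer.ZimImporter /path/to/zim/files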

@@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.text.PDFTextStripper;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
@@ -69,9 +68,6 @@ import net.yacy.kelondro.util.MemoryControl;
public class pdfParser extends AbstractParser implements Parser {
public static boolean individualPages = false;
public static String individualPagePropertyname = "page";
public pdfParser() {
super("Acrobat Portable Document Parser");
this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser {
// get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);
if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text
final int pagecount = pdfDoc.getNumberOfPages();
final String[] pages = new String[pagecount];
for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]);
}
// create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
result = new Document[Math.min(pages.length, pdflinks.size())];
final String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
result[page] = new Document(
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page),
null,
null,
false,
docDate);
}
} else {
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread("pdfParser.getText:" + location) {
@Override
public void run() {
try {
writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {}
}
};
t.start();
t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
if (t.isAlive()) t.interrupt();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); // free writer resources
}
final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,
null,
false,
docDate)};
}
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
final PDFTextStripper stripper = new PDFTextStripper();
stripper.setEndPage(Integer.MAX_VALUE);
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
writer.close(); // free writer resources
final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,
null,
false,
docDate)};
} catch (final Throwable e) {
//throw new Parser.Failure(e.getMessage(), location);
} finally {

@@ -61,13 +61,6 @@ public class ConfigParser_p {
env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime());
env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension());
}
if (post.containsKey("pdfSettings")) {
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages"));
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page"));
pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
}
}
int i = 0;
@@ -94,9 +87,6 @@ public class ConfigParser_p {
prop.put("parser", i);
prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false));
prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"));
// return rewrite properties
return prop;
}

@@ -774,7 +774,7 @@ public class Crawler_p {
}
/*
* <input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:46px" value="#[customPPMdefault]#" />PPM
* <input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:46px" value="#[customPPMdefault]#" />PPM
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH
<input type="submit" name="crawlingPerformance" value="set" />
@@ -784,19 +784,19 @@ public class Crawler_p {
if (post != null && post.containsKey("crawlingPerformance")) {
final String crawlingPerformance = post.get("crawlingPerformance", "custom");
final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1);
int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1);
try {
wantedPPM = post.getInt("customPPM", wantedPPM);
} catch (final NumberFormatException e) {}
if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000;
int wPPM = wantedPPM;
if ( wPPM <= 0 ) {
wPPM = 1;
}
if ( wPPM >= 30000 ) {
wPPM = 30000;
if ( wPPM >= 60000 ) {
wPPM = 60000;
}
final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
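
Worked through the formula above, the wanted PPM maps to the busy-sleep
between two fetches as follows; doubling the cap halves the minimum sleep:

    // newBusySleep = 60000 / wantedPPM   (milliseconds between two fetches)
    //   wantedPPM =    10  ->  busySleep = 6000 ms  (one page every 6 s)
    //   wantedPPM =  1000  ->  busySleep =   60 ms
    //   wantedPPM = 30000  ->  busySleep =    2 ms  (old maximum)
    //   wantedPPM = 60000  ->  busySleep =    1 ms  (new maximum, 1000 pages/s)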

@@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable<URIMe
public String urlstring() {
if (this.alternative_urlstring != null) return this.alternative_urlstring;
if (!pdfParser.individualPages) return this.url().toNormalform(true);
if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase(Locale.ROOT))) return this.url().toNormalform(true);
// for pdf links we rewrite the url
// this is a special treatment of pdf files which can be splitted into subpages
String pageprop = pdfParser.individualPagePropertyname;
String resultUrlstring = this.url().toNormalform(true);
int p = resultUrlstring.lastIndexOf(pageprop + "=");
if (p > 0) {
return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1);
}
return resultUrlstring;
return this.url().toNormalform(true);
}
/**
* used for search result entry

@@ -176,6 +176,7 @@ import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.JsonListImporter;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.WarcImporter;
import net.yacy.document.importer.ZimImporter;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation;
@@ -906,8 +907,6 @@ public final class Switchboard extends serverSwitch {
TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
pdfParser.individualPages = this.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = this.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
// start a loader
this.log.config("Starting Crawl Loader");
@@ -2153,6 +2152,20 @@ public final class Switchboard extends serverSwitch {
this.log.warn("IO Error processing warc file " + infile);
}
return moved;
} else if (s.endsWith(".zim")) {
try {
final ZimImporter wri = new ZimImporter(infile.getAbsolutePath());
wri.start();
try {
wri.join();
} catch (final InterruptedException ex) {
return moved;
}
moved = infile.renameTo(outfile);
} catch (final IOException ex) {
this.log.warn("IO Error processing zim file " + infile);
}
return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) {
return this.processSurrogateJson(infile, outfile);
}
@@ -2349,6 +2362,7 @@ public final class Switchboard extends serverSwitch {
if ( surrogate.endsWith(".xml")
|| surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".zim")
|| surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz")
|| surrogate.endsWith(".jsonlist")

@@ -220,8 +220,6 @@ public final class SwitchboardConstants {
public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody";
public static final String PARSER_MIME_DENY = "parser.mime.deny";
public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny";
public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages";
public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key";
/**
* <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p>
* <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p>

@@ -113,20 +113,24 @@ public class ZIMFile extends File {
}
this.mimeTypeList = mList.toArray(new String[mList.size()]);
// Initialize the Url Pointer List
this.urlPtrListBlob = new byte[this.header_entryCount * 8];
mReader.seek(this.header_urlPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
// Initialize the Title Pointer List
this.titlePtrListBlob = new byte[this.header_entryCount * 4];
mReader.seek(this.header_titlePtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
// Initialize the Cluster Pointer List
this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
mReader.seek(this.header_clusterPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
try {
// Initialize the Url Pointer List
this.urlPtrListBlob = new byte[this.header_entryCount * 8];
mReader.seek(this.header_urlPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
// Initialize the Title Pointer List
this.titlePtrListBlob = new byte[this.header_entryCount * 4];
mReader.seek(this.header_titlePtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
// Initialize the Cluster Pointer List
this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
mReader.seek(this.header_clusterPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
} catch (IndexOutOfBoundsException e) {
throw new IOException(e.getMessage());
}
}
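
The blob sizes follow directly from the ZIM header: 8 bytes per url pointer,
4 bytes per title pointer and 8 bytes per cluster pointer. With a corrupt or
truncated header the allocations or reads above can fail with a runtime
exception, which the new catch turns into a regular IOException. A defensive
variant could validate the size before allocating (a hypothetical helper, not
part of the commit):

    // hedged sketch: compute the blob size in long arithmetic and fail early,
    // instead of catching the runtime exception after the fact
    static byte[] allocBlob(long entryCount, int bytesPerEntry) throws java.io.IOException {
        long size = entryCount * bytesPerEntry;
        if (size < 0 || size > Integer.MAX_VALUE) {
            throw new java.io.IOException("pointer list too large: " + size);
        }
        return new byte[(int) size];
    }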
public final String getMimeType(int idx) {
