Merge branch 'master' of https://github.com/yacy/yacy_search_server.git

1 year ago · 53b01dbf2e
parent 41856e9f34 1c0df28bfb
commit 53b01dbf2e
3 changed files with 348 additions and 16 deletions
--- a/source/net/yacy/document/importer/ZimImporter.java
+++ b/source/net/yacy/document/importer/ZimImporter.java
@ -0,0 +1,306 @@
 /**
 * ZimImporter.java
 * (C) 2023 by Michael Peter Christen @orbiter
 *
 * This is a part of YaCy, a peer-to-peer based web search engine
 *
 * LICENSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 */
 package net.yacy.document.importer;
 import java.io.File;
 import java.io.IOException;
 import java.util.Collection;
 import java.util.Map;
 import java.util.TreeMap;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.TextParser;
 import net.yacy.search.Switchboard;
 import org.openzim.ZIMFile;
 import org.openzim.ZIMReader;
 import org.openzim.ZIMReader.ArticleEntry;
 import org.openzim.ZIMReader.DirectoryEntry;
 /**
 * ZIM importer
 * can import ZIM file i.e. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
 * These files contains identifiers named "URL" which are not actually full URLs but just paths inside a well-known domains.
 * These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
 * For that we have a guessing function, but we must check if the guessing was correct by testing some of the given
 * URLs against the actual internet-hosted document. Only if that check succeeds we should import the files.
 * In all other cases the import should work as well but should also only be done in a non-p2p environment to prevent
 * that such links are shared.
 */
 public class ZimImporter extends Thread implements Importer {
    static public ZimImporter job;
    private ZIMFile file;
    private ZIMReader reader;
    private String path; 
    private String guessedSource;
    private int recordCnt;
    private long startTime;
    private final long sourceSize;
    private long consumed;
    private boolean abort = false;
    public ZimImporter(String path) throws IOException {
       super("ZimImporter - from file " + path);
       this.path = path;
       this.file = new ZIMFile(this.path); // this will read already some of the metadata and could consume some time
       this.sourceSize = this.file.length();
    }
    @Override
    public void run() {
        job = this;
        this.startTime = System.currentTimeMillis();
        try {
            this.reader = new ZIMReader(this.file);
            this.guessedSource = getSource(this.reader);
            for (int i = 0; i < this.file.header_entryCount; i++) {
                if (this.abort) break;
                DirectoryEntry de = this.reader.getDirectoryInfo(i);
                if (!(de instanceof ZIMReader.ArticleEntry)) continue;
                ArticleEntry ae = (ArticleEntry) de;
                // check url
                String guessedUrl = guessURL(this.guessedSource, de);
                assert guessedUrl.startsWith("http");
                // check availability of text parser
                String mimeType = ae.getMimeType();
                if (TextParser.supportsMime(mimeType) != null) continue;
                // read the content
                byte[] b = this.reader.getArticleData(ae);
                // create artificial request and response headers for the indexer
                RequestHeader requestHeader = new RequestHeader();
                ResponseHeader responseHeader = new ResponseHeader(200);
                final Request request = new Request(new DigestURL(guessedUrl), null);
                final Response response = new Response(
                        request,
                        requestHeader,
                        responseHeader,
                        Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
                        false,
                        b
                );
                // throw this to the indexer
                String error = Switchboard.getSwitchboard().toIndexer(response);
                if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
                this.recordCnt++;
            }
        } catch (IOException e) {
            ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
        }
        ConcurrentLog.info("ZimImporter", "Indexed " + this.recordCnt + " documents");
        job = null;
    }
    public void quit() {
        this.abort = true;
    }
    @Override
    public String source() {
        return this.path;
    }
    @Override
    public int count() {
        return this.recordCnt;
    }
    @Override
    public int speed() {
        if (this.recordCnt == 0) return 0;
        return (int) (this.recordCnt / Math.max(0L, runningTime() ));
    }
    @Override
    public long runningTime() {
        return (System.currentTimeMillis() - this.startTime) / 1000L;
    }
    @Override
    public long remainingTime() {
        if (this.consumed == 0) {
            return 0;
        }
        long speed = this.consumed / runningTime();
        return (this.sourceSize - this.consumed) / speed;
    }
    @Override
    public String status() {
        return "";
    }
    public static String guessDomainName(String fileName) {
        if (fileName == null || fileName.isEmpty()) {
            return null; // Handle null or empty input
        }
        String[] parts = fileName.split("_");
        if (parts.length == 0) {
            return null;
        }
        String firstPart = parts[0];
        // Handling special cases where the domain name might not be obvious
        // These are based on your provided list and can be expanded as needed
        switch (firstPart) {
            case "100r-off-the-grid":
                return "100resilientcities.org";
            case "armypubs":
                return "armypubs.army.mil";
            case "artofproblemsolving":
                return "artofproblemsolving.com";
            case "based":
                return "based.cooking";
            case "booksdash":
                return "booksdash.com";
            case "coopmaths":
                return "coopmaths.fr";
            case "fas-military-medicine":
                return "fas.org";
            case "fonts":
                return "fonts.google.com";
            case "gutenberg":
                return "gutenberg.org";
            case "ifixit":
                return "ifixit.com";
            case "lesfondamentaux":
                return "reseau-canope.fr";
            case "lowtechmagazine":
                return "lowtechmagazine.com";
            case "mutopiaproject":
                return "mutopiaproject.org";
            case "openstreetmap-wiki":
                return "wiki.openstreetmap.org";
            case "opentextbooks":
                return "opentextbooks.org";
            case "phet":
                return "phet.colorado.edu";
            case "practical_action":
                return "practicalaction.org";
            case "rapsberry_pi_docs":
                return "raspberrypi.org";
            case "ted":
                return "ted.com";
            case "vikidia":
                return "vikidia.org";
            case "westeros":
                return "westeros.org";
            case "wikipedia":
                return parts[1] + ".wikipedia.org/wiki";
            case "www.ready.gov":
                return "ready.gov";
        }
        // Handling domain patterns
        if (firstPart.contains(".stackexchange.com")) {
            return firstPart;
        } else if (firstPart.endsWith(".com") || firstPart.endsWith(".org") || firstPart.endsWith(".de") || 
                   firstPart.endsWith(".fr") || firstPart.endsWith(".pt") || firstPart.endsWith(".it") || 
                   firstPart.endsWith(".ja") || firstPart.endsWith(".es") || firstPart.endsWith(".eo")) {
            return firstPart;
        } else if (firstPart.contains("-")) {
            return firstPart.substring(0, firstPart.indexOf("-"));
        }
        // Additional general domain extraction logic
        if (firstPart.contains(".")) {
            int lastDotIndex = firstPart.lastIndexOf('.');
            if (lastDotIndex > 0 && lastDotIndex < firstPart.length() - 1) {
                // Extract up to the next character beyond the TLD, to support TLDs of variable length
                int endIndex = firstPart.indexOf('.', lastDotIndex + 1);
                if (endIndex == -1) {
                    endIndex = firstPart.length();
                }
                return firstPart.substring(0, endIndex);
            }
        }
        // Default return if none of the above conditions meet
        return null;
    }
    public static String getSource(ZIMReader r) throws IOException {
        String source = r.getMetadata("Source");
        if (source != null) return source;
        source = "https://" + guessDomainName(r.getZIMFile().getName()) + "/";
        return source;
    }
    public static String guessURL(String guessedSource, DirectoryEntry de) {
        String url = de.url;
        if (url.equals("Main_Page")) url = "";
        return guessedSource + url;
    }
    public static void main(String[] args) {
        // zim file import test
        // will test mostly if domain names are included in zim file urls
        String zimFilesPath = args[0];
        File zimFiles = new File(zimFilesPath);
        // make ordered file list; order by file size (start with smallest)
        String[] filelist = zimFiles.list();
        Map<Long, File> orderedFileMap = new TreeMap<>();
        for (int i = 0; i < filelist.length; i++) {
            if (!filelist[i].endsWith(".zim")) continue;
            File f = new File(zimFiles, filelist[i]);
            orderedFileMap.put(f.length() * 1000 + i, f);
        }
        Collection<File> orderedFiles = orderedFileMap.values();
        for (File f: orderedFiles) {
            try {
                ZIMFile z = new ZIMFile(f.getAbsolutePath());
                ZIMReader r = new ZIMReader(z);
                DirectoryEntry de = r.getMainDirectoryEntry();
                System.out.println("ZIM file:  " + f.getAbsolutePath());
                for (String key: ZIMReader.METADATA_KEYS) {String s = r.getMetadata(key); if (s != null) System.out.println("Metadata " + key + ": " + s);};
                System.out.println("Namespace: " + de.namespace);
                System.out.println("Title:     " + de.title);
                System.out.println("URL:       " + de.url);
                System.out.println("guessed domain: " + guessDomainName(f.getName()));
                String source = getSource(r);
                System.out.println("guessed Source: " + source);
                System.out.println("guessed main article: " + guessURL(source, de));
                System.out.println();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
 }
--- a/source/org/openzim/ZIMFile.java
+++ b/source/org/openzim/ZIMFile.java
@ -108,7 +108,7 @@ public class ZIMFile extends File {
                break;
            }
            String mimeType = mimeBuffer.toString();
-            System.out.println(mimeType);
+            //System.out.println(mimeType);
            mList.add(mimeType);
        }
        this.mimeTypeList = mList.toArray(new String[mList.size()]);
--- a/source/org/openzim/ZIMReader.java
+++ b/source/org/openzim/ZIMReader.java
@ -25,6 +25,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.nio.charset.StandardCharsets;
 import org.tukaani.xz.SingleXZInputStream;
 import com.github.luben.zstd.ZstdInputStream;
@ -45,6 +46,11 @@ import com.github.luben.zstd.ZstdInputStream;
 */
 public class ZIMReader {
    /**
     * A list of metadata key names; each of them can be passed as key to getMetadata(String)
     * which returns null for every key that has no (or an empty) value in the ZIM file.
     */
    public final static String[] METADATA_KEYS = new String[] {
            "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
            "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
    };
    private final ZIMFile mFile;
    public class DirectoryEntry {
@ -53,10 +59,10 @@ public class ZIMReader {
        public final char namespace;
        public final String url;
        public final String title;
-        public final long urlListindex;
+        public final int urlListindex;
        public DirectoryEntry(
-                final long urlListindex,
+                final int urlListindex,
                final char namespace, final String url, final String title, final int mimeType) {
            assert url != null;
            assert title != null;
@ -79,7 +85,7 @@ public class ZIMReader {
        public final int blob_number;
        public ArticleEntry(
-                final long urlListindex,
+                final int urlListindex,
                final char namespace, final String url, final String title, final int mimeType,
                final int cluster_number, final int blob_number) {
            super(urlListindex, namespace, url, title, mimeType);
@ -91,12 +97,12 @@ public class ZIMReader {
    public class RedirectEntry extends DirectoryEntry {
-        public final long redirect_index;
+        public final int redirect_index;
        public RedirectEntry(
-                final long urlListindex,
+                final int urlListindex,
                final char namespace, final String url, final String title, final int mimeType,
-                final long redirect_index) {
+                final int redirect_index) {
            super(urlListindex, namespace, url, title, mimeType);
            this.redirect_index = redirect_index;
        }
@ -199,6 +205,25 @@ public class ZIMReader {
        }
    }
    /**
     * Read one metadata value: the directory entry with the given key is looked up
     * in namespace 'M' and its article data is decoded as UTF-8.
     * @param key the name of the metadata entry
     * @return the metadata value, or null if the entry does not exist, has no data or has empty data
     * @throws IOException if reading from the ZIM file fails
     */
    public final String getMetadata(String key) throws IOException {
        final DirectoryEntry entry = getDirectoryInfo('M', key);
        if (entry == null) {
            // metadata not found; that would be normal
            return null;
        }
        final byte[] raw = getArticleData(entry);
        // missing article data is not normal; an empty value is proper but not usable for a client
        final boolean unusable = raw == null || raw.length == 0;
        return unusable ? null : new String(raw, StandardCharsets.UTF_8);
    }
    /**
     * Get the directory entry which the header field header_mainPage points to.
     * If that entry is a redirect with namespace 'W' and url "mainPage", the redirect
     * is followed once and the redirect target is returned instead.
     * @return the directory entry of the main page
     * @throws IOException if reading from the ZIM file fails
     */
    public DirectoryEntry getMainDirectoryEntry() throws IOException {
        final DirectoryEntry mainEntry = getDirectoryInfo(this.mFile.header_mainPage);
        final boolean isMainPageRedirect =
                mainEntry.namespace == 'W' && mainEntry.url.equals("mainPage") && mainEntry instanceof RedirectEntry;
        if (!isMainPageRedirect) {
            return mainEntry;
        }
        // resolve redirect to get the actual main page
        final RedirectEntry redirectEntry = (RedirectEntry) mainEntry;
        return getDirectoryInfo(redirectEntry.redirect_index);
    }
    public String getURLByURLOrder(final int entryNumber) throws IOException {
        // The position of URL i
@ -422,6 +447,7 @@ public class ZIMReader {
        is.read(buffer);
        long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
        long blob_size = offset2 - offset1;
        if (blob_size == 0) return new byte[0]; // skip the skipping to get to a zero-length object (they exist!)
        byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
        // we must do two skip steps: first to the end of the offset list and second to the start of the blob
        // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset