diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java new file mode 100644 index 000000000..a1917f0bc --- /dev/null +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -0,0 +1,306 @@ +/** + * ZimImporter.java + * (C) 2023 by Michael Peter Christen @orbiter + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * If not, see . + */ + +package net.yacy.document.importer; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.TreeMap; + +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.crawler.retrieval.Request; +import net.yacy.crawler.retrieval.Response; +import net.yacy.document.TextParser; +import net.yacy.search.Switchboard; + +import org.openzim.ZIMFile; +import org.openzim.ZIMReader; +import org.openzim.ZIMReader.ArticleEntry; +import org.openzim.ZIMReader.DirectoryEntry; + +/** + * ZIM importer + * can import ZIM file i.e. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/ + * These files contains identifiers named "URL" which are not actually full URLs but just paths inside a well-known domains. + * These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them. + * For that we have a guessing function, but we must check if the guessing was correct by testing some of the given + * URLs against the actual internet-hosted document. Only if that check succeeds we should import the files. + * In all other cases the import should work as well but should also only be done in a non-p2p environment to prevent + * that such links are shared. + */ +public class ZimImporter extends Thread implements Importer { + + static public ZimImporter job; + + private ZIMFile file; + private ZIMReader reader; + private String path; + private String guessedSource; + + private int recordCnt; + private long startTime; + private final long sourceSize; + private long consumed; + private boolean abort = false; + + public ZimImporter(String path) throws IOException { + super("ZimImporter - from file " + path); + this.path = path; + this.file = new ZIMFile(this.path); // this will read already some of the metadata and could consume some time + this.sourceSize = this.file.length(); + } + + @Override + public void run() { + job = this; + this.startTime = System.currentTimeMillis(); + try { + this.reader = new ZIMReader(this.file); + this.guessedSource = getSource(this.reader); + + for (int i = 0; i < this.file.header_entryCount; i++) { + if (this.abort) break; + DirectoryEntry de = this.reader.getDirectoryInfo(i); + if (!(de instanceof ZIMReader.ArticleEntry)) continue; + ArticleEntry ae = (ArticleEntry) de; + + // check url + String guessedUrl = guessURL(this.guessedSource, de); + assert guessedUrl.startsWith("http"); + + // check availability of text parser + String mimeType = ae.getMimeType(); + if (TextParser.supportsMime(mimeType) != null) continue; + + // read the content + byte[] b = this.reader.getArticleData(ae); + + // create artificial request and response headers for the indexer + RequestHeader requestHeader = new RequestHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); + final Request request = new Request(new DigestURL(guessedUrl), null); + final Response response = new Response( + request, + requestHeader, + responseHeader, + Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, + false, + b + ); + + // throw this to the indexer + String error = Switchboard.getSwitchboard().toIndexer(response); + if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); + this.recordCnt++; + } + } catch (IOException e) { + ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage()); + } + ConcurrentLog.info("ZimImporter", "Indexed " + this.recordCnt + " documents"); + job = null; + } + + public void quit() { + this.abort = true; + } + + @Override + public String source() { + return this.path; + } + + @Override + public int count() { + return this.recordCnt; + } + + @Override + public int speed() { + if (this.recordCnt == 0) return 0; + return (int) (this.recordCnt / Math.max(0L, runningTime() )); + } + + @Override + public long runningTime() { + return (System.currentTimeMillis() - this.startTime) / 1000L; + } + + @Override + public long remainingTime() { + if (this.consumed == 0) { + return 0; + } + long speed = this.consumed / runningTime(); + return (this.sourceSize - this.consumed) / speed; + } + + @Override + public String status() { + return ""; + } + + public static String guessDomainName(String fileName) { + if (fileName == null || fileName.isEmpty()) { + return null; // Handle null or empty input + } + + String[] parts = fileName.split("_"); + if (parts.length == 0) { + return null; + } + String firstPart = parts[0]; + + // Handling special cases where the domain name might not be obvious + // These are based on your provided list and can be expanded as needed + switch (firstPart) { + case "100r-off-the-grid": + return "100resilientcities.org"; + case "armypubs": + return "armypubs.army.mil"; + case "artofproblemsolving": + return "artofproblemsolving.com"; + case "based": + return "based.cooking"; + case "booksdash": + return "booksdash.com"; + case "coopmaths": + return "coopmaths.fr"; + case "fas-military-medicine": + return "fas.org"; + case "fonts": + return "fonts.google.com"; + case "gutenberg": + return "gutenberg.org"; + case "ifixit": + return "ifixit.com"; + case "lesfondamentaux": + return "reseau-canope.fr"; + case "lowtechmagazine": + return "lowtechmagazine.com"; + case "mutopiaproject": + return "mutopiaproject.org"; + case "openstreetmap-wiki": + return "wiki.openstreetmap.org"; + case "opentextbooks": + return "opentextbooks.org"; + case "phet": + return "phet.colorado.edu"; + case "practical_action": + return "practicalaction.org"; + case "rapsberry_pi_docs": + return "raspberrypi.org"; + case "ted": + return "ted.com"; + case "vikidia": + return "vikidia.org"; + case "westeros": + return "westeros.org"; + case "wikipedia": + return parts[1] + ".wikipedia.org/wiki"; + case "www.ready.gov": + return "ready.gov"; + } + + // Handling domain patterns + if (firstPart.contains(".stackexchange.com")) { + return firstPart; + } else if (firstPart.endsWith(".com") || firstPart.endsWith(".org") || firstPart.endsWith(".de") || + firstPart.endsWith(".fr") || firstPart.endsWith(".pt") || firstPart.endsWith(".it") || + firstPart.endsWith(".ja") || firstPart.endsWith(".es") || firstPart.endsWith(".eo")) { + return firstPart; + } else if (firstPart.contains("-")) { + return firstPart.substring(0, firstPart.indexOf("-")); + } + + // Additional general domain extraction logic + if (firstPart.contains(".")) { + int lastDotIndex = firstPart.lastIndexOf('.'); + if (lastDotIndex > 0 && lastDotIndex < firstPart.length() - 1) { + // Extract up to the next character beyond the TLD, to support TLDs of variable length + int endIndex = firstPart.indexOf('.', lastDotIndex + 1); + if (endIndex == -1) { + endIndex = firstPart.length(); + } + return firstPart.substring(0, endIndex); + } + } + + // Default return if none of the above conditions meet + return null; + } + + public static String getSource(ZIMReader r) throws IOException { + String source = r.getMetadata("Source"); + if (source != null) return source; + source = "https://" + guessDomainName(r.getZIMFile().getName()) + "/"; + return source; + } + + public static String guessURL(String guessedSource, DirectoryEntry de) { + String url = de.url; + if (url.equals("Main_Page")) url = ""; + return guessedSource + url; + } + + public static void main(String[] args) { + // zim file import test + // will test mostly if domain names are included in zim file urls + String zimFilesPath = args[0]; + File zimFiles = new File(zimFilesPath); + + // make ordered file list; order by file size (start with smallest) + String[] filelist = zimFiles.list(); + Map orderedFileMap = new TreeMap<>(); + for (int i = 0; i < filelist.length; i++) { + if (!filelist[i].endsWith(".zim")) continue; + File f = new File(zimFiles, filelist[i]); + orderedFileMap.put(f.length() * 1000 + i, f); + } + + Collection orderedFiles = orderedFileMap.values(); + for (File f: orderedFiles) { + try { + ZIMFile z = new ZIMFile(f.getAbsolutePath()); + ZIMReader r = new ZIMReader(z); + DirectoryEntry de = r.getMainDirectoryEntry(); + System.out.println("ZIM file: " + f.getAbsolutePath()); + for (String key: ZIMReader.METADATA_KEYS) {String s = r.getMetadata(key); if (s != null) System.out.println("Metadata " + key + ": " + s);}; + System.out.println("Namespace: " + de.namespace); + System.out.println("Title: " + de.title); + System.out.println("URL: " + de.url); + System.out.println("guessed domain: " + guessDomainName(f.getName())); + String source = getSource(r); + System.out.println("guessed Source: " + source); + System.out.println("guessed main article: " + guessURL(source, de)); + System.out.println(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } +} diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index 32ce15539..45f1e1789 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -108,7 +108,7 @@ public class ZIMFile extends File { break; } String mimeType = mimeBuffer.toString(); - System.out.println(mimeType); + //System.out.println(mimeType); mList.add(mimeType); } this.mimeTypeList = mList.toArray(new String[mList.size()]); diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index 82a86b479..9a1f77c5d 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -19,12 +19,13 @@ package org.openzim; import java.io.IOException; -import java.io.InputStream; +import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; -import java.util.Map; +import java.util.Map; +import java.nio.charset.StandardCharsets; import org.tukaani.xz.SingleXZInputStream; import com.github.luben.zstd.ZstdInputStream; @@ -45,6 +46,11 @@ import com.github.luben.zstd.ZstdInputStream; */ public class ZIMReader { + public final static String[] METADATA_KEYS = new String[] { + "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription", + "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper" + }; + private final ZIMFile mFile; public class DirectoryEntry { @@ -53,13 +59,13 @@ public class ZIMReader { public final char namespace; public final String url; public final String title; - public final long urlListindex; + public final int urlListindex; - public DirectoryEntry( - final long urlListindex, + public DirectoryEntry( + final int urlListindex, final char namespace, final String url, final String title, final int mimeType) { assert url != null; - assert title != null; + assert title != null; this.mimetype = mimeType; this.namespace = namespace; this.url = url; @@ -78,11 +84,11 @@ public class ZIMReader { public final int cluster_number; public final int blob_number; - public ArticleEntry( - final long urlListindex, + public ArticleEntry( + final int urlListindex, final char namespace, final String url, final String title, final int mimeType, final int cluster_number, final int blob_number) { - super(urlListindex, namespace, url, title, mimeType); + super(urlListindex, namespace, url, title, mimeType); this.cluster_number = cluster_number; this.blob_number = blob_number; } @@ -91,13 +97,13 @@ public class ZIMReader { public class RedirectEntry extends DirectoryEntry { - public final long redirect_index; - + public final int redirect_index; + public RedirectEntry( - final long urlListindex, + final int urlListindex, final char namespace, final String url, final String title, final int mimeType, - final long redirect_index) { - super(urlListindex, namespace, url, title, mimeType); + final int redirect_index) { + super(urlListindex, namespace, url, title, mimeType); this.redirect_index = redirect_index; } @@ -124,7 +130,7 @@ public class ZIMReader { public ZIMFile getZIMFile() { return this.mFile; } - + public List getAllArticles() throws IOException { List list = new ArrayList<>(); for (int i = 0; i < this.mFile.header_entryCount; i++) { @@ -198,6 +204,25 @@ public class ZIMReader { return abe; } } + + public final String getMetadata(String key) throws IOException { + DirectoryEntry de = getDirectoryInfo('M', key); + if (de == null) return null; // metadata not found; that would be normal + byte[] val = getArticleData(de); + if (val == null) return null; // article data not found: that is not normal + if (val.length == 0) return null; // that empty string is a proper value, however, not usable for a client + return new String(val, StandardCharsets.UTF_8); + } + + public DirectoryEntry getMainDirectoryEntry() throws IOException { + DirectoryEntry de = getDirectoryInfo(this.mFile.header_mainPage); + if (de.namespace == 'W' && de.url.equals("mainPage") && de instanceof RedirectEntry) { + // resolve redirect to get the actual main page + int redirect = ((RedirectEntry) de).redirect_index; + de = getDirectoryInfo(redirect); + } + return de; + } public String getURLByURLOrder(final int entryNumber) throws IOException { @@ -422,6 +447,7 @@ public class ZIMReader { is.read(buffer); long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); long blob_size = offset2 - offset1; + if (blob_size == 0) return new byte[0]; // skip the skipping to get to a zero-length object (they exist!) byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT // we must do two skip steps: first to the end of the offset list and second to the start of the blob // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset