Michael Peter Christen 1 year ago
commit 53b01dbf2e

@ -0,0 +1,306 @@
/**
* ZimImporter.java
* (C) 2023 by Michael Peter Christen @orbiter
*
* This is a part of YaCy, a peer-to-peer based web search engine
*
* LICENSE
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.importer;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.TreeMap;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.TextParser;
import net.yacy.search.Switchboard;
import org.openzim.ZIMFile;
import org.openzim.ZIMReader;
import org.openzim.ZIMReader.ArticleEntry;
import org.openzim.ZIMReader.DirectoryEntry;
/**
* ZIM importer
* can import ZIM files, e.g. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/
* These files contain identifiers named "URL" which are not actually full URLs but just paths inside well-known domains.
* These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them.
* For that we have a guessing function, but we must check if the guessing was correct by testing some of the given
* URLs against the actual internet-hosted document. Only if that check succeeds we should import the files.
* In all other cases the import should work as well but should also only be done in a non-p2p environment to prevent
* that such links are shared.
*/
public class ZimImporter extends Thread implements Importer {
/** The importer whose run() is currently executing; set at the start of run() and reset to null when it ends. */
static public ZimImporter job;

/** The ZIM file opened in the constructor from {@link #path}. */
private ZIMFile file;

/** Reader on {@link #file}; created at the start of run(). */
private ZIMReader reader;

/** Path of the ZIM file as passed to the constructor; reported by source(). */
private String path;

/** Base URL computed by getSource() in run(); prepended to every entry url. */
private String guessedSource;

/** Number of entries handed to the indexer so far. */
private int recordCnt;

/** Time stamp (milliseconds) taken at the start of run(). */
private long startTime;

/** Length of the ZIM file in bytes, taken once in the constructor. */
private final long sourceSize;

/** Byte counter read by remainingTime(); NOTE(review): nothing in this class appears to update it. */
private long consumed;

/**
 * Stop flag: set by quit() and polled by the loop in run().
 * It is volatile because quit() is called from a different thread than the one executing run();
 * without it the write is not guaranteed to become visible to the import loop.
 */
private volatile boolean abort = false;
public ZimImporter(String path) throws IOException {
super("ZimImporter - from file " + path);
this.path = path;
this.file = new ZIMFile(this.path); // this will read already some of the metadata and could consume some time
this.sourceSize = this.file.length();
}
/**
 * Import loop: walks over all directory entries of the ZIM file, wraps every article entry with a
 * parseable mime type into an artificial crawler response and hands it to the indexer.
 * The static {@code job} reference points to this importer while the loop is running and is
 * always cleared at the end, even if the loop dies with an unexpected exception.
 */
@Override
public void run() {
    job = this;
    this.startTime = System.currentTimeMillis();
    try {
        this.reader = new ZIMReader(this.file);
        this.guessedSource = getSource(this.reader);

        for (int i = 0; i < this.file.header_entryCount; i++) {
            if (this.abort) break;

            DirectoryEntry de = this.reader.getDirectoryInfo(i);
            if (!(de instanceof ZIMReader.ArticleEntry)) continue;
            ArticleEntry ae = (ArticleEntry) de;

            // check url
            String guessedUrl = guessURL(this.guessedSource, de);
            assert guessedUrl.startsWith("http");

            // check availability of text parser; a non-null result means there is no parser for that mime type
            String mimeType = ae.getMimeType();
            if (TextParser.supportsMime(mimeType) != null) continue;

            // read the content; entries without retrievable data cannot be indexed
            byte[] b = this.reader.getArticleData(ae);
            if (b == null) continue;

            // create artificial request and response headers for the indexer
            RequestHeader requestHeader = new RequestHeader();
            ResponseHeader responseHeader = new ResponseHeader(200);
            final Request request = new Request(new DigestURL(guessedUrl), null);
            final Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
                    false,
                    b
            );

            // throw this to the indexer
            String error = Switchboard.getSwitchboard().toIndexer(response);
            if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
            this.recordCnt++;
        }
    } catch (IOException e) {
        ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
    } finally {
        // runs on every exit path so that a crashed import does not stay registered as the active job
        ConcurrentLog.info("ZimImporter", "Indexed " + this.recordCnt + " documents");
        job = null;
    }
}
/**
 * Ask the import loop to stop; run() checks the flag before it processes the next entry.
 */
public void quit() {
    abort = true;
}
/**
 * @return the path of the ZIM file as given to the constructor
 */
@Override
public String source() {
    return path;
}
/**
 * @return the number of entries handed to the indexer so far
 */
@Override
public int count() {
    return recordCnt;
}
/**
 * Average import speed.
 *
 * @return processed records per second of running time; 0 while nothing has been processed yet
 */
@Override
public int speed() {
    if (this.recordCnt == 0) return 0;
    // runningTime() is 0 during the first second; use at least 1 to avoid a division by zero
    return (int) (this.recordCnt / Math.max(1L, runningTime()));
}
/**
 * @return the whole seconds elapsed since the start time recorded in run()
 */
@Override
public long runningTime() {
    final long elapsedMillis = System.currentTimeMillis() - this.startTime;
    return elapsedMillis / 1000L;
}
/**
 * Estimate the remaining import time by extrapolating the consumed byte count against the
 * size of the source file.
 *
 * @return estimated remaining seconds; 0 if nothing was consumed yet or the estimate would be negative
 */
@Override
public long remainingTime() {
    if (this.consumed == 0) {
        return 0;
    }
    // at least one second, otherwise the first second of the import divides by zero
    final long elapsed = Math.max(1L, runningTime());
    final long remainingBytes = this.sourceSize - this.consumed;
    // remaining / (consumed / elapsed), written as one division so that a throughput
    // below one byte per second cannot truncate to a zero divisor
    return Math.max(0L, remainingBytes * elapsed / this.consumed);
}
/**
 * This importer does not provide a textual status.
 *
 * @return always the empty string
 */
@Override
public String status() {
return "";
}
/**
 * Guess the host name (for wikipedia: host name plus path prefix) that the content of a ZIM
 * file belongs to, using only the file name. ZIM file names consist of parts separated by '_'.
 * The leading part(s) are first looked up in a table of known names, then a few generic
 * patterns are applied to the first part.
 *
 * @param fileName name of the ZIM file, i.e. "wikipedia_en_all.zim"
 * @return the guessed domain without scheme and without trailing slash,
 *         or null if the name is null, empty or no rule applies
 */
public static String guessDomainName(String fileName) {
    if (fileName == null || fileName.isEmpty()) {
        return null; // Handle null or empty input
    }
    final String[] parts = fileName.split("_");
    if (parts.length == 0) {
        return null; // the name consists of underscores only
    }
    final String firstPart = parts[0];

    // wikipedia files carry the language in the second part; without it no host can be derived
    if (firstPart.equals("wikipedia")) {
        if (parts.length < 2 || parts[1].isEmpty()) {
            return null;
        }
        return parts[1] + ".wikipedia.org/wiki";
    }

    // Special cases where the domain name is not obvious. Some known names contain underscores
    // themselves (up to three parts), therefore the longest candidate key is tried first.
    for (int n = Math.min(3, parts.length); n >= 1; n--) {
        final StringBuilder key = new StringBuilder(parts[0]);
        for (int i = 1; i < n; i++) {
            key.append('_').append(parts[i]);
        }
        final String known = knownDomain(key.toString());
        if (known != null) {
            return known;
        }
    }

    // Handling domain patterns: the first part already looks like a host name
    if (firstPart.contains(".stackexchange.com")) {
        return firstPart;
    }
    final String[] hostSuffixes = {".com", ".org", ".de", ".fr", ".pt", ".it", ".ja", ".es", ".eo"};
    for (final String suffix : hostSuffixes) {
        if (firstPart.endsWith(suffix)) {
            return firstPart;
        }
    }
    if (firstPart.contains("-")) {
        return firstPart.substring(0, firstPart.indexOf("-"));
    }

    // Any other name with a dot that is neither the first nor the last character is taken as host name
    final int lastDotIndex = firstPart.lastIndexOf('.');
    if (lastDotIndex > 0 && lastDotIndex < firstPart.length() - 1) {
        return firstPart;
    }

    // Default return if none of the above conditions meet
    return null;
}

/**
 * Table of ZIM file name prefixes whose domain cannot be derived by a pattern.
 *
 * @param key the leading part(s) of a ZIM file name, joined with '_'
 * @return the domain for that key or null if the key is not in the table
 */
private static String knownDomain(final String key) {
    switch (key) {
        case "100r-off-the-grid":
            return "100resilientcities.org";
        case "armypubs":
            return "armypubs.army.mil";
        case "artofproblemsolving":
            return "artofproblemsolving.com";
        case "based":
            return "based.cooking";
        case "booksdash":
            return "booksdash.com";
        case "coopmaths":
            return "coopmaths.fr";
        case "fas-military-medicine":
            return "fas.org";
        case "fonts":
            return "fonts.google.com";
        case "gutenberg":
            return "gutenberg.org";
        case "ifixit":
            return "ifixit.com";
        case "lesfondamentaux":
            return "reseau-canope.fr";
        case "lowtechmagazine":
            return "lowtechmagazine.com";
        case "mutopiaproject":
            return "mutopiaproject.org";
        case "openstreetmap-wiki":
            return "wiki.openstreetmap.org";
        case "opentextbooks":
            return "opentextbooks.org";
        case "phet":
            return "phet.colorado.edu";
        case "practical_action":
            return "practicalaction.org";
        case "rapsberry_pi_docs": // NOTE(review): spelling looks like a typo of "raspberry" - verify against real file names
            return "raspberrypi.org";
        case "ted":
            return "ted.com";
        case "vikidia":
            return "vikidia.org";
        case "westeros":
            return "westeros.org";
        case "www.ready.gov":
            return "ready.gov";
        default:
            return null;
    }
}
/**
 * Compute the base URL that is prepended to the entry urls of a ZIM file.
 * The "Source" metadata entry is preferred; if it is missing, the base URL is guessed from
 * the file name. The result always carries a scheme and ends with a slash because
 * guessURL() appends the entry url without any separator.
 *
 * @param r reader of the ZIM file
 * @return the base URL, ending with "/"
 * @throws IOException if reading the metadata fails or no domain can be guessed from the file name
 */
public static String getSource(ZIMReader r) throws IOException {
    String source = r.getMetadata("Source");
    if (source != null) {
        source = source.trim();
        if (!source.isEmpty()) {
            // the metadata value may be a bare host name and may lack the trailing slash
            if (!source.contains("://")) source = "https://" + source;
            if (!source.endsWith("/")) source = source + "/";
            return source;
        }
    }
    final String fileName = r.getZIMFile().getName();
    final String domain = guessDomainName(fileName);
    if (domain == null) {
        // do not fabricate "https://null/" urls
        throw new IOException("cannot guess a source domain for ZIM file " + fileName);
    }
    return "https://" + domain + "/";
}
/**
 * Build the URL of a directory entry by appending the entry url to the base URL.
 * The entry url "Main_Page" is mapped to the base URL itself.
 *
 * @param guessedSource base URL as computed by getSource()
 * @param de the directory entry
 * @return the concatenation of base URL and entry url
 */
public static String guessURL(String guessedSource, DirectoryEntry de) {
    final String entryPath = de.url.equals("Main_Page") ? "" : de.url;
    return guessedSource + entryPath;
}
/**
 * zim file import test:
 * prints metadata, the main directory entry and the guessed domain/source/main article URL
 * for every .zim file in a directory, smallest file first.
 * Will test mostly if domain names are included in zim file urls.
 *
 * @param args args[0] is the path of a directory containing .zim files
 */
public static void main(String[] args) {
    if (args.length == 0) {
        System.err.println("usage: ZimImporter <directory with .zim files>");
        return;
    }
    String zimFilesPath = args[0];
    File zimFiles = new File(zimFilesPath);

    // File.list() returns null if the path is not a directory or cannot be read
    String[] filelist = zimFiles.list();
    if (filelist == null) {
        System.err.println("not a readable directory: " + zimFiles.getAbsolutePath());
        return;
    }

    // make ordered file list; order by file size (start with smallest)
    // the key is size * (number of files) + index: unique per file because index < number of files
    Map<Long, File> orderedFileMap = new TreeMap<>();
    for (int i = 0; i < filelist.length; i++) {
        if (!filelist[i].endsWith(".zim")) continue;
        File f = new File(zimFiles, filelist[i]);
        orderedFileMap.put(f.length() * filelist.length + i, f);
    }

    Collection<File> orderedFiles = orderedFileMap.values();
    for (File f: orderedFiles) {
        try {
            ZIMFile z = new ZIMFile(f.getAbsolutePath());
            ZIMReader r = new ZIMReader(z);
            DirectoryEntry de = r.getMainDirectoryEntry();
            System.out.println("ZIM file: " + f.getAbsolutePath());
            for (String key: ZIMReader.METADATA_KEYS) {
                String s = r.getMetadata(key);
                if (s != null) System.out.println("Metadata " + key + ": " + s);
            }
            System.out.println("Namespace: " + de.namespace);
            System.out.println("Title: " + de.title);
            System.out.println("URL: " + de.url);
            System.out.println("guessed domain: " + guessDomainName(f.getName()));
            String source = getSource(r);
            System.out.println("guessed Source: " + source);
            System.out.println("guessed main article: " + guessURL(source, de));
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
}

@ -108,7 +108,7 @@ public class ZIMFile extends File {
break; break;
} }
String mimeType = mimeBuffer.toString(); String mimeType = mimeBuffer.toString();
System.out.println(mimeType); //System.out.println(mimeType);
mList.add(mimeType); mList.add(mimeType);
} }
this.mimeTypeList = mList.toArray(new String[mList.size()]); this.mimeTypeList = mList.toArray(new String[mList.size()]);

@ -19,12 +19,13 @@
package org.openzim; package org.openzim;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.nio.charset.StandardCharsets;
import org.tukaani.xz.SingleXZInputStream; import org.tukaani.xz.SingleXZInputStream;
import com.github.luben.zstd.ZstdInputStream; import com.github.luben.zstd.ZstdInputStream;
@ -45,6 +46,11 @@ import com.github.luben.zstd.ZstdInputStream;
*/ */
public class ZIMReader { public class ZIMReader {
/**
 * Names of metadata entries that can be requested with getMetadata(String);
 * a ZIM file does not necessarily contain all of them (getMetadata returns null for missing ones).
 */
public final static String[] METADATA_KEYS = new String[] {
"Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription",
"Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper"
};
private final ZIMFile mFile; private final ZIMFile mFile;
public class DirectoryEntry { public class DirectoryEntry {
@ -53,13 +59,13 @@ public class ZIMReader {
public final char namespace; public final char namespace;
public final String url; public final String url;
public final String title; public final String title;
public final long urlListindex; public final int urlListindex;
public DirectoryEntry( public DirectoryEntry(
final long urlListindex, final int urlListindex,
final char namespace, final String url, final String title, final int mimeType) { final char namespace, final String url, final String title, final int mimeType) {
assert url != null; assert url != null;
assert title != null; assert title != null;
this.mimetype = mimeType; this.mimetype = mimeType;
this.namespace = namespace; this.namespace = namespace;
this.url = url; this.url = url;
@ -78,11 +84,11 @@ public class ZIMReader {
public final int cluster_number; public final int cluster_number;
public final int blob_number; public final int blob_number;
public ArticleEntry( public ArticleEntry(
final long urlListindex, final int urlListindex,
final char namespace, final String url, final String title, final int mimeType, final char namespace, final String url, final String title, final int mimeType,
final int cluster_number, final int blob_number) { final int cluster_number, final int blob_number) {
super(urlListindex, namespace, url, title, mimeType); super(urlListindex, namespace, url, title, mimeType);
this.cluster_number = cluster_number; this.cluster_number = cluster_number;
this.blob_number = blob_number; this.blob_number = blob_number;
} }
@ -91,13 +97,13 @@ public class ZIMReader {
public class RedirectEntry extends DirectoryEntry { public class RedirectEntry extends DirectoryEntry {
public final long redirect_index; public final int redirect_index;
public RedirectEntry( public RedirectEntry(
final long urlListindex, final int urlListindex,
final char namespace, final String url, final String title, final int mimeType, final char namespace, final String url, final String title, final int mimeType,
final long redirect_index) { final int redirect_index) {
super(urlListindex, namespace, url, title, mimeType); super(urlListindex, namespace, url, title, mimeType);
this.redirect_index = redirect_index; this.redirect_index = redirect_index;
} }
@ -124,7 +130,7 @@ public class ZIMReader {
public ZIMFile getZIMFile() { public ZIMFile getZIMFile() {
return this.mFile; return this.mFile;
} }
public List<ArticleEntry> getAllArticles() throws IOException { public List<ArticleEntry> getAllArticles() throws IOException {
List<ArticleEntry> list = new ArrayList<>(); List<ArticleEntry> list = new ArrayList<>();
for (int i = 0; i < this.mFile.header_entryCount; i++) { for (int i = 0; i < this.mFile.header_entryCount; i++) {
@ -198,6 +204,25 @@ public class ZIMReader {
return abe; return abe;
} }
} }
/**
 * Read a metadata value, which is stored as the content of an entry in namespace 'M'.
 *
 * @param key the metadata name, i.e. "Source"
 * @return the value decoded as UTF-8, or null if the entry does not exist, has no data or is empty
 * @throws IOException if reading from the ZIM file fails
 */
public final String getMetadata(String key) throws IOException {
    final DirectoryEntry entry = getDirectoryInfo('M', key);
    if (entry == null) {
        return null; // metadata not found; that would be normal
    }
    final byte[] raw = getArticleData(entry);
    // null means the article data was not found (not normal); an empty value is proper
    // but not usable for a client - both are reported as "no value"
    final boolean unusable = raw == null || raw.length == 0;
    return unusable ? null : new String(raw, StandardCharsets.UTF_8);
}
/**
 * Get the directory entry of the main page as referenced by the file header.
 * If that entry is the redirect "mainPage" in namespace 'W', the redirect is followed once
 * and its target is returned instead.
 *
 * @return the main page entry
 * @throws IOException if reading from the ZIM file fails
 */
public DirectoryEntry getMainDirectoryEntry() throws IOException {
    final DirectoryEntry headerEntry = getDirectoryInfo(this.mFile.header_mainPage);
    if (headerEntry.namespace != 'W' || !headerEntry.url.equals("mainPage") || !(headerEntry instanceof RedirectEntry)) {
        return headerEntry;
    }
    // resolve redirect to get the actual main page
    final RedirectEntry redirectEntry = (RedirectEntry) headerEntry;
    return getDirectoryInfo(redirectEntry.redirect_index);
}
public String getURLByURLOrder(final int entryNumber) throws IOException { public String getURLByURLOrder(final int entryNumber) throws IOException {
@ -422,6 +447,7 @@ public class ZIMReader {
is.read(buffer); is.read(buffer);
long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer);
long blob_size = offset2 - offset1; long blob_size = offset2 - offset1;
if (blob_size == 0) return new byte[0]; // skip the skipping to get to a zero-length object (they exist!)
byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT
// we must do two skip steps: first to the end of the offset list and second to the start of the blob // we must do two skip steps: first to the end of the offset list and second to the start of the blob
// - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset

Loading…
Cancel
Save