From b9912ff50d1af3a20bb0c5c1d2b19e35442f7db1 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 29 Oct 2023 22:09:24 +0000 Subject: [PATCH 1/2] repaired dockerfiles for aarch64 and armv7 --- docker/Dockerfile.aarch64 | 52 +++++++++------------------------ docker/Dockerfile.armv7 | 60 +++++++++++---------------------------- 2 files changed, 30 insertions(+), 82 deletions(-) diff --git a/docker/Dockerfile.aarch64 b/docker/Dockerfile.aarch64 index 0f536c9a1..2adafcd04 100644 --- a/docker/Dockerfile.aarch64 +++ b/docker/Dockerfile.aarch64 @@ -1,56 +1,30 @@ -# Build a docker image from latest YaCy sources +# Docker image for YaCy -# Base image : latest Debian stable official jdk 17 image from Docker -FROM arm64v8/openjdk:17-buster +# build with +# docker build -t yacy/yacy_search_server:aarch64-latest -f Dockerfile.aarch64 ../ + +# run with +# docker run -d --name yacy -p 8090:8090 -p 8443:8443 -v yacy_data:/opt/yacy_search_server/DATA --log-opt max-size=200m --log-opt max-file=2 yacy/yacy_search_server:aarch64-latest -# Install needed packages not in base image -# (curl for sh scripts in /bin, and wkhtmltopdf,imagemagick,xvfb and ghostscript to enable PDF and image snapshot generation) -RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript && \ - rm -rf /var/lib/apt/lists/* +FROM arm64v8/openjdk:17-buster +LABEL maintainer="Michael Peter Christen " -# trace java version +RUN apt-get update && \ + apt-get install -yq ca-certificates ca-certificates-java curl wkhtmltopdf imagemagick xvfb ghostscript ant git RUN java -version -# set current working dir WORKDIR /opt - -# All in one step to reduce image size growth : -# - install ant package -# - Compile with ant -# - remove unnecessary and size consuming .git directory -# - remove ant package - -# copy sources COPY . 
/opt/yacy_search_server/ - RUN rm -rf /opt/yacy_search_server/.git && \ - apt-get update && \ - apt-get install -yq ant && \ ant compile -f /opt/yacy_search_server/build.xml && \ apt-get purge -yq --auto-remove ant && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN \ -# Set initial admin password: "yacy" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex()) -# > java -classpath classes net.yacy.cora.order.Digest -strfhex "admin:The YaCy access is limited to administrators. If you don't know the password, you can change it using /bin/passwd.sh :docker" - sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a0987a4aba1ec46d63c" /opt/yacy_search_server/defaults/yacy.init && \ - sed -i "/adminAccountForLocalhost=/c\adminAccountForLocalhost=false" /opt/yacy_search_server/defaults/yacy.init && \ -# Intially enable HTTPS: this is the most secure option for remote administrator authentication - sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \ -# Create user and group yacy: this user will be used to run YaCy main process - adduser --system --group --no-create-home --disabled-password yacy && \ -# Set ownership of yacy install directory to yacy user/group - chown yacy:yacy -R /opt/yacy_search_server - -# Expose HTTP and HTTPS default ports EXPOSE 8090 8443 - -# Set data volume: yacy data and configuration will persist even after container stop or destruction VOLUME ["/opt/yacy_search_server/DATA"] - -# Next commands run as yacy as non-root user for improved security +RUN sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \ + adduser --system --group --no-create-home --disabled-password yacy && \ + chown -R yacy:yacy /opt/yacy_search_server USER yacy - -# Start yacy as a foreground process (-f) to display console logs and to wait for yacy process CMD ["/bin/sh","/opt/yacy_search_server/startYACY.sh","-f"] diff --git 
a/docker/Dockerfile.armv7 b/docker/Dockerfile.armv7 index 7a3134eac..b66212753 100644 --- a/docker/Dockerfile.armv7 +++ b/docker/Dockerfile.armv7 @@ -1,56 +1,30 @@ -# Build a docker image from latest YaCy sources +# Docker image for YaCy -# Base image : latest Debian stable official jdk 11 image from Docker -FROM arm32v7/openjdk:11-jdk +# build with +# docker build -t yacy/yacy_search_server:armv7-latest -f Dockerfile.armv7 ../ -# Install needed packages not in base image -# (curl for sh scripts in /bin, and wkhtmltopdf,imagemagick,xvfb and ghostscript to enable PDF and image snapshot generation) -RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript && \ - rm -rf /var/lib/apt/lists/* +# run with +# docker run -d --name yacy -p 8090:8090 -p 8443:8443 -v yacy_data:/opt/yacy_search_server/DATA --log-opt max-size=200m --log-opt max-file=2 yacy/yacy_search_server:armv7-latest -# trace java version +FROM arm32v7/debian +LABEL maintainer="Michael Peter Christen " + +RUN apt-get update && \ + apt-get install -yq ca-certificates ca-certificates-java curl wkhtmltopdf imagemagick xvfb ghostscript default-jdk ant git RUN java -version -# set current working dir WORKDIR /opt - -# All in one step to reduce image size growth : -# - install ant package -# - Compile with ant -# - remove unnecessary and size consuming .git directory -# - remove ant package - -# copy sources COPY . 
/opt/yacy_search_server/ - RUN rm -rf /opt/yacy_search_server/.git && \ - apt-get update && \ - apt-get install -yq ant && \ - ant compile -f /opt/yacy_search_server/build.xml && \ - apt-get purge -yq --auto-remove ant && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* + ant compile -f /opt/yacy_search_server/build.xml && \ + apt-get purge -yq --auto-remove ant && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* -RUN \ -# Set initial admin password: "yacy" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex()) -# > java -classpath classes net.yacy.cora.order.Digest -strfhex "admin:The YaCy access is limited to administrators. If you don't know the password, you can change it using /bin/passwd.sh :docker" - sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a0987a4aba1ec46d63c" /opt/yacy_search_server/defaults/yacy.init && \ - sed -i "/adminAccountForLocalhost=/c\adminAccountForLocalhost=false" /opt/yacy_search_server/defaults/yacy.init && \ -# Intially enable HTTPS: this is the most secure option for remote administrator authentication - sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \ -# Create user and group yacy: this user will be used to run YaCy main process - adduser --system --group --no-create-home --disabled-password yacy && \ -# Set ownership of yacy install directory to yacy user/group - chown yacy:yacy -R /opt/yacy_search_server - -# Expose HTTP and HTTPS default ports EXPOSE 8090 8443 - -# Set data volume: yacy data and configuration will persist even after container stop or destruction VOLUME ["/opt/yacy_search_server/DATA"] - -# Next commands run as yacy as non-root user for improved security +RUN sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init && \ + adduser --system --group --no-create-home --disabled-password yacy && \ + chown -R yacy:yacy /opt/yacy_search_server USER yacy - -# Start yacy as a 
foreground process (-f) to display console logs and to wait for yacy process CMD ["/bin/sh","/opt/yacy_search_server/startYACY.sh","-f"] From 1c0df28bfbf65603219b9b32a2924f97378877c1 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 1 Nov 2023 18:48:40 +0100 Subject: [PATCH 2/2] added a zim importer that can be used for surrogate imports. Can not be used yet because it requires some security additions to verify that the given urls actually work. --- .../yacy/document/importer/ZimImporter.java | 306 ++++++++++++++++++ source/org/openzim/ZIMFile.java | 2 +- source/org/openzim/ZIMReader.java | 38 ++- 3 files changed, 339 insertions(+), 7 deletions(-) create mode 100644 source/net/yacy/document/importer/ZimImporter.java diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java new file mode 100644 index 000000000..2b1f5551c --- /dev/null +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -0,0 +1,306 @@ +/** + * ZimImporter.java + * (C) 2023 by Michael Peter Christen @orbiter + * + * This is a part of YaCy, a peer-to-peer based web search engine + * + * LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * If not, see <http://www.gnu.org/licenses/>. 
+ */ + +package net.yacy.document.importer; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.TreeMap; + +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.crawler.retrieval.Request; +import net.yacy.crawler.retrieval.Response; +import net.yacy.document.TextParser; +import net.yacy.search.Switchboard; + +import org.openzim.ZIMFile; +import org.openzim.ZIMReader; +import org.openzim.ZIMReader.ArticleEntry; +import org.openzim.ZIMReader.DirectoryEntry; + +/** + * ZIM importer + * can import ZIM files, e.g. from https://download.kiwix.org/zim/ or mirrors like https://ftp.fau.de/kiwix/zim/ + * These files contain identifiers named "URL" which are not actually full URLs but just paths inside well-known domains. + * These domains are sometimes given by a "Source" metadata field, but that is rare - we have to guess them. + * For that we have a guessing function, but we must check if the guessing was correct by testing some of the given + * URLs against the actual internet-hosted document. Only if that check succeeds we should import the files. + * In all other cases the import should work as well but should also only be done in a non-p2p environment to prevent + * that such links are shared. 
+ */ +public class ZimImporter extends Thread implements Importer { + + static public ZimImporter job; + + private ZIMFile file; + private ZIMReader reader; + private String path; + private String guessedSource; + + private int recordCnt; + private long startTime; + private final long sourceSize; + private long consumed; + private boolean abort = false; + + public ZimImporter(String path) throws IOException { + super("ZimImporter - from file " + path); + this.path = path; + this.file = new ZIMFile(this.path); // this will read already some of the metadata and could consume some time + this.sourceSize = this.file.length(); + } + + @Override + public void run() { + job = this; + this.startTime = System.currentTimeMillis(); + try { + this.reader = new ZIMReader(this.file); + this.guessedSource = getSource(this.reader); + + for (int i = 0; i < this.file.header_entryCount; i++) { + if (this.abort) break; + DirectoryEntry de = this.reader.getDirectoryInfo(i); + if (!(de instanceof ZIMReader.ArticleEntry)) continue; + ArticleEntry ae = (ArticleEntry) de; + + // check url + String guessedUrl = guessURL(this.guessedSource, de); + assert guessedUrl.startsWith("http"); + + // check availability of text parser + String mimeType = this.file.getMimeType(ae.mimetype); + if (TextParser.supportsMime(mimeType) != null) continue; + + // read the content + byte[] b = this.reader.getArticleData(ae); + + // create artificial request and response headers for the indexer + RequestHeader requestHeader = new RequestHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); + final Request request = new Request(new DigestURL(guessedUrl), null); + final Response response = new Response( + request, + requestHeader, + responseHeader, + Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, + false, + b + ); + + // throw this to the indexer + String error = Switchboard.getSwitchboard().toIndexer(response); + if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " 
+ error); + this.recordCnt++; + } + } catch (IOException e) { + ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage()); + } + ConcurrentLog.info("ZimImporter", "Indexed " + this.recordCnt + " documents"); + job = null; + } + + public void quit() { + this.abort = true; + } + + @Override + public String source() { + return this.path; + } + + @Override + public int count() { + return this.recordCnt; + } + + @Override + public int speed() { + if (this.recordCnt == 0) return 0; + return (int) (this.recordCnt / Math.max(0L, runningTime() )); + } + + @Override + public long runningTime() { + return (System.currentTimeMillis() - this.startTime) / 1000L; + } + + @Override + public long remainingTime() { + if (this.consumed == 0) { + return 0; + } + long speed = this.consumed / runningTime(); + return (this.sourceSize - this.consumed) / speed; + } + + @Override + public String status() { + return ""; + } + + public static String guessDomainName(String fileName) { + if (fileName == null || fileName.isEmpty()) { + return null; // Handle null or empty input + } + + String[] parts = fileName.split("_"); + if (parts.length == 0) { + return null; + } + String firstPart = parts[0]; + + // Handling special cases where the domain name might not be obvious + // These are based on your provided list and can be expanded as needed + switch (firstPart) { + case "100r-off-the-grid": + return "100resilientcities.org"; + case "armypubs": + return "armypubs.army.mil"; + case "artofproblemsolving": + return "artofproblemsolving.com"; + case "based": + return "based.cooking"; + case "booksdash": + return "booksdash.com"; + case "coopmaths": + return "coopmaths.fr"; + case "fas-military-medicine": + return "fas.org"; + case "fonts": + return "fonts.google.com"; + case "gutenberg": + return "gutenberg.org"; + case "ifixit": + return "ifixit.com"; + case "lesfondamentaux": + return "reseau-canope.fr"; + case "lowtechmagazine": + return "lowtechmagazine.com"; + case "mutopiaproject": 
+ return "mutopiaproject.org"; + case "openstreetmap-wiki": + return "wiki.openstreetmap.org"; + case "opentextbooks": + return "opentextbooks.org"; + case "phet": + return "phet.colorado.edu"; + case "practical_action": + return "practicalaction.org"; + case "rapsberry_pi_docs": + return "raspberrypi.org"; + case "ted": + return "ted.com"; + case "vikidia": + return "vikidia.org"; + case "westeros": + return "westeros.org"; + case "wikipedia": + return parts[1] + ".wikipedia.org/wiki"; + case "www.ready.gov": + return "ready.gov"; + } + + // Handling domain patterns + if (firstPart.contains(".stackexchange.com")) { + return firstPart; + } else if (firstPart.endsWith(".com") || firstPart.endsWith(".org") || firstPart.endsWith(".de") || + firstPart.endsWith(".fr") || firstPart.endsWith(".pt") || firstPart.endsWith(".it") || + firstPart.endsWith(".ja") || firstPart.endsWith(".es") || firstPart.endsWith(".eo")) { + return firstPart; + } else if (firstPart.contains("-")) { + return firstPart.substring(0, firstPart.indexOf("-")); + } + + // Additional general domain extraction logic + if (firstPart.contains(".")) { + int lastDotIndex = firstPart.lastIndexOf('.'); + if (lastDotIndex > 0 && lastDotIndex < firstPart.length() - 1) { + // Extract up to the next character beyond the TLD, to support TLDs of variable length + int endIndex = firstPart.indexOf('.', lastDotIndex + 1); + if (endIndex == -1) { + endIndex = firstPart.length(); + } + return firstPart.substring(0, endIndex); + } + } + + // Default return if none of the above conditions meet + return null; + } + + public static String getSource(ZIMReader r) throws IOException { + String source = r.getMetadata("Source"); + if (source != null) return source; + source = "https://" + guessDomainName(r.getZIMFile().getName()) + "/"; + return source; + } + + public static String guessURL(String guessedSource, DirectoryEntry de) { + String url = de.url; + if (url.equals("Main_Page")) url = ""; + return guessedSource + url; + } 
+ + public static void main(String[] args) { + // zim file import test + // will test mostly if domain names are included in zim file urls + String zimFilesPath = args[0]; + File zimFiles = new File(zimFilesPath); + + // make ordered file list; order by file size (start with smallest) + String[] filelist = zimFiles.list(); + Map orderedFileMap = new TreeMap<>(); + for (int i = 0; i < filelist.length; i++) { + if (!filelist[i].endsWith(".zim")) continue; + File f = new File(zimFiles, filelist[i]); + orderedFileMap.put(f.length() * 1000 + i, f); + } + + Collection orderedFiles = orderedFileMap.values(); + for (File f: orderedFiles) { + try { + ZIMFile z = new ZIMFile(f.getAbsolutePath()); + ZIMReader r = new ZIMReader(z); + DirectoryEntry de = r.getMainDirectoryEntry(); + System.out.println("ZIM file: " + f.getAbsolutePath()); + for (String key: ZIMReader.METADATA_KEYS) {String s = r.getMetadata(key); if (s != null) System.out.println("Metadata " + key + ": " + s);}; + System.out.println("Namespace: " + de.namespace); + System.out.println("Title: " + de.title); + System.out.println("URL: " + de.url); + System.out.println("guessed domain: " + guessDomainName(f.getName())); + String source = getSource(r); + System.out.println("guessed Source: " + source); + System.out.println("guessed main article: " + guessURL(source, de)); + System.out.println(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } +} diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index dd209b5e9..6381294df 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -106,7 +106,7 @@ public class ZIMFile extends File { break; } String mimeType = mimeBuffer.toString(); - System.out.println(mimeType); + //System.out.println(mimeType); mList.add(mimeType); } this.mimeTypeList = mList.toArray(new String[mList.size()]); diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index 8d773d473..70dde16bc 100644 --- 
a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -20,6 +20,7 @@ package org.openzim; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import org.tukaani.xz.SingleXZInputStream; import com.github.luben.zstd.ZstdInputStream; @@ -39,6 +40,11 @@ import com.github.luben.zstd.ZstdInputStream; */ public class ZIMReader { + public final static String[] METADATA_KEYS = new String[] { + "Name", "Title", "Creator", "Publisher", "Date", "Description", "LongDescription", + "Language", "License", "Tags", "Relation", "Flavour", "Source", "Counter", "Scraper" + }; + private final ZIMFile mFile; public static abstract class DirectoryEntry { @@ -48,13 +54,13 @@ public class ZIMReader { public final int cluster_number; public final String url; public final String title; - public final long urlListindex; + public final int urlListindex; public DirectoryEntry( final int mimeType, final char namespace, final int cluster_number, final String url, final String title, - final long index) { + final int index) { this.mimetype = mimeType; this.namespace = namespace; this.cluster_number = cluster_number; @@ -74,7 +80,7 @@ public class ZIMReader { final int mimeType, final char namespace, final int cluster_number, final int blob_number, final String url, final String title, - final long urlListindex) { + final int urlListindex) { super(mimeType, namespace, cluster_number, url, title, urlListindex); this.cluster_number = cluster_number; this.blob_number = blob_number; @@ -84,11 +90,11 @@ public class ZIMReader { public static class RedirectEntry extends DirectoryEntry { - public final long redirect_index; + public final int redirect_index; public RedirectEntry(final int mimeType, final char namespace, - final long redirect_index, final String url, final String title, - final long urlListindex) { + final int redirect_index, final String url, final String title, + final int urlListindex) { super(mimeType, namespace, 0, 
url, title, urlListindex); this.redirect_index = redirect_index; } @@ -103,6 +109,25 @@ public class ZIMReader { return this.mFile; } + public final String getMetadata(String key) throws IOException { + DirectoryEntry de = getDirectoryInfo('M', key); + if (de == null) return null; // metadata not found; that would be normal + byte[] val = getArticleData(de); + if (val == null) return null; // article data not found: that is not normal + if (val.length == 0) return null; // that empty string is a proper value, however, not usable for a client + return new String(val, StandardCharsets.UTF_8); + } + + public DirectoryEntry getMainDirectoryEntry() throws IOException { + DirectoryEntry de = getDirectoryInfo(this.mFile.header_mainPage); + if (de.namespace == 'W' && de.url.equals("mainPage") && de instanceof RedirectEntry) { + // resolve redirect to get the actual main page + int redirect = ((RedirectEntry) de).redirect_index; + de = getDirectoryInfo(redirect); + } + return de; + } + public String getURLByURLOrder(final int entryNumber) throws IOException { // The position of URL i @@ -283,6 +308,7 @@ public class ZIMReader { is.read(buffer); long offset2 = extended? RandomAccessFileZIMInputStream.toEightLittleEndianLong(buffer) : RandomAccessFileZIMInputStream.toFourLittleEndianInteger(buffer); long blob_size = offset2 - offset1; + if (blob_size == 0) return new byte[0]; // skip the skipping to get to a zero-length object (they exist!) byte[] entry = new byte[(int) blob_size]; // TODO: we should be able to read blobs larger than MAXINT // we must do two skip steps: first to the end of the offset list and second to the start of the blob // - the whole number of offset list entries is numberOfBlobs1, which includes the extra entry for the end offset