diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 1cac0dace..768ca0aa6 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -37,7 +37,6 @@ import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URLDecoder; -import java.net.http.HttpResponse; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index 118e27e40..bc7266e0a 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -26,11 +26,13 @@ package net.yacy.document.importer; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.net.URL; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.LinkedHashSet; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -93,6 +95,8 @@ public class ZimImporter extends Thread implements Importer { try { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); + Date guessedDate = getDate(this.reader); + String dates = HeaderFramework.newRfc1123Format().format(guessedDate); // verify the source DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); @@ -108,6 +112,7 @@ public class ZimImporter extends Thread implements Importer { DirectoryEntry de = this.reader.getDirectoryInfo(i); if (!(de instanceof ZIMReader.ArticleEntry)) continue; ArticleEntry ae = (ArticleEntry) de; + if (ae.namespace != 'C' && ae.namespace != 'A') continue; // check url DigestURL guessedUrl = guessURL(this.guessedSource, de); @@ -121,6 +126,7 @@ public class ZimImporter extends Thread implements Importer { // check availability of text parser String mimeType = ae.getMimeType(); + if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible if (TextParser.supportsMime(mimeType) != null) continue; // read the content @@ -130,6 +136,7 @@ public class ZimImporter extends Thread implements Importer { RequestHeader requestHeader = new RequestHeader(); ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content + responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessd date to have something that is not the current date final Request request = new Request( ASCII.getBytes(sb.peers.mySeed().hash), guessedUrl, @@ -230,8 +237,6 @@ public class ZimImporter extends Thread implements Importer { return "fas.org"; case "fonts": return "fonts.google.com"; - case "gutenberg": - return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03"; case "ifixit": return "ifixit.com"; case "lesfondamentaux": @@ -313,12 +318,22 @@ public class ZimImporter extends Thread implements Importer { return source; } + public static Date getDate(ZIMReader r) throws IOException { + String date = r.getMetadata("Date"); + if (date != null) try { + SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.US); + return format.parse(date); + } catch (ParseException e) {} + // failover situation: use file date + return new Date(r.getZIMFile().lastModified()); + } + public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException { String url = de.url; if (url.equals("Main_Page")) url = ""; - if (guessedSource != null) return new DigestURL(guessedSource + url); if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2)); if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2)); + if (guessedSource != null) return new DigestURL(guessedSource + url); return new DigestURL(guessedSource + url); } @@ -439,6 +454,22 @@ public class ZimImporter extends Thread implements Importer { "ted_en_design_2023-09.zim", "ted_en_business_2023-09.zim", "ted_en_global_issues_2023-09.zim", + "opentextbooks_en_all_2023-08.zim", + "bestedlessons.org_en_all_2023-08.zim", + "wikivoyage_en_all_nopic_2023-10.zim", + "based.cooking_en_all_2023-10.zim", + "wordnet_en_all_2023-04.zim", + "internet-encyclopedia-philosophy_en_all_2023-08.zim", + "100r-off-the-grid_en_2023-09.zim", + "coopmaths_2023-04.zim", + "birds-of-ladakh_en_all_2023-02.zim", + "storyweaver.org_en_2023-09.zim", + "developer.mozilla.org_en_all_2023-02.zim", + "www.ready.gov_es_2023-06.zim", + "teoria.com_en_2023-08.zim", + "theworldfactbook_en_all_2023-06.zim", + "mutopiaproject.org_en_2023-08.zim", + "dp.la_en_all_2023-08.zim", // 302 "moderators.stackexchange.com_en_all_2023-05.zim", @@ -483,6 +514,7 @@ public class ZimImporter extends Thread implements Importer { System.out.println("Namespace: " + de.namespace); System.out.println("Title: " + de.title); System.out.println("URL: " + de.url); + System.out.println("Mime Type " + de.getMimeType()); System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name String source = getSource(r); System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index a241507ab..cbde3a0a8 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -134,6 +134,7 @@ public class ZIMFile extends File { } public final String getMimeType(int idx) { + if (idx >= this.mimeTypeList.length) return ""; return this.mimeTypeList[idx]; } diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index 27d544e27..14bc47dfd 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -237,11 +237,25 @@ public class ZIMReader { public DirectoryEntry getMainDirectoryEntry() throws IOException { DirectoryEntry de = getDirectoryInfo(this.mFile.header_mainPage); - if (de.namespace == 'W' && de.url.equals("mainPage") && de instanceof RedirectEntry) { + if (de instanceof RedirectEntry) { // resolve redirect to get the actual main page int redirect = ((RedirectEntry) de).redirect_index; de = getDirectoryInfo(redirect); } + // For the main entry we demand a "text/html" mime type. + // Many zim files do not provide this as the main file, which is strange (maybe lazy/irresponsibe) + // Because the main entry is important for a validation, we seek for one entry which may + // be proper for indexing. + int entryNumner = 0; + while (!de.getMimeType().equals("text/html") && entryNumner < this.mFile.header_entryCount) { + de = getDirectoryInfo(entryNumner); + entryNumner++; + if (de.namespace != 'C' && de.namespace != 'A') continue; + if (!(de instanceof ArticleEntry)) continue; + if (!de.getMimeType().equals("text/html")) continue; + if (de.url.contains("404") || de.title.contains("404") || de.title.contains("301")) continue; // is a pain + return de; + } return de; }