From 496f768c4431544af1b1ebb1555716c47a05aa0e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 3 Nov 2023 18:20:10 +0100 Subject: [PATCH 01/11] modified cache strategy for zim clusters --- source/org/openzim/ZIMReader.java | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index bc39fd36b..27d544e27 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -337,10 +337,7 @@ public class ZIMReader { public Cluster getCluster(int clusterNumber) throws IOException { for (int i = 0; i < this.clusterCache.size(); i++) { Cluster c = clusterCache.get(i); - if (c.cluster_number == clusterNumber) { - c.incUsage(); // cache hit - return c; - } + if (c.cluster_number == clusterNumber) return c; } // cache miss @@ -348,17 +345,10 @@ public class ZIMReader { // check cache size if (clusterCache.size() >= MAX_CLUSTER_CACHE_SIZE) { - // remove one entry - double maxEntry = Double.MIN_VALUE; - int pos = -1; - for (int i = 0; i < clusterCache.size(); i++) { - double r = this.clusterCache.get(i).getUsageRatio(); - if (r > maxEntry) {maxEntry = r; pos = i;} - } - if (pos >= 0) this.clusterCache.remove(pos); + // remove one entry: the first entry is the oldest entry + this.clusterCache.remove(0); } - c.incUsage(); this.clusterCache.add(c); return c; } @@ -378,12 +368,10 @@ public class ZIMReader { private int cluster_number; // used to identify the correct cache entry private List blobs; - private int usageCounter; // used for efficient caching and cache stale detection private boolean extended; public Cluster(int cluster_number) throws IOException { this.cluster_number = cluster_number; - this.usageCounter = 0; // open the cluster and make a Input Stream with the proper decompression type final long clusterPos = mFile.geClusterPtr(cluster_number); @@ -444,21 +432,9 @@ public class ZIMReader { return this.blobs.get(i); } - public void incUsage() { - this.usageCounter++; - } - - public int getUsage() { - return this.usageCounter; - } - public int getSize() { return this.blobs.size(); } - - public double getUsageRatio() { - return ((double) this.usageCounter) / ((double) this.blobs.size()); - } } public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException { From 70e29937ef76b2f3c7f5061d71bb4b3ce740a361 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 4 Nov 2023 19:07:50 +0100 Subject: [PATCH 02/11] added a check in zim importer which tests if import URLs actually exist --- .../cora/document/id/MultiProtocolURL.java | 27 +++++++++++++++++++ .../yacy/document/importer/ZimImporter.java | 11 +++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index e9a46fc91..a3404bec0 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -37,6 +37,7 @@ import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URLDecoder; +import java.net.http.HttpResponse; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; @@ -2578,6 +2579,32 @@ public class MultiProtocolURL implements Serializable, Comparable 0; + } + if (isHTTP() || isHTTPS()) { + try (final HTTPClient client = new HTTPClient(agent)) { + 
client.setHost(getHost()); + org.apache.http.HttpResponse response = client.HEADResponse(this, true); + return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301); + } + } + return false; + } catch (IOException e) { + return false; + } + } + /** * Read fully the source, close it and return its content as a bytes array. * @param source the source to read diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index a96a79b18..26f36f787 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -30,6 +30,7 @@ import java.util.Map; import java.util.TreeMap; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; @@ -84,6 +85,12 @@ public class ZimImporter extends Thread implements Importer { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); + // verify the source + DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); + DigestURL url = new DigestURL(mainEntry.url); + if (!url.exists(ClientIdentification.browserAgent)) return; + + // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { if (this.abort) break; DirectoryEntry de = this.reader.getDirectoryInfo(i); @@ -304,7 +311,9 @@ public class ZimImporter extends Thread implements Importer { System.out.println("guessed domain: " + guessDomainName(f.getName())); String source = getSource(r); System.out.println("guessed Source: " + source); - System.out.println("guessed main article: " + guessURL(source, de)); + String mainURL = guessURL(source, de); + System.out.println("guessed main article: " + mainURL); + System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent)); System.out.println(); } catch (IOException e) { e.printStackTrace(); From 7db0534d8a0709a2903f1880e98aaa4657fbf462 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 5 Nov 2023 02:16:40 +0100 Subject: [PATCH 03/11] Added a zim parser to the surrogate import option. You can now import zim files into YaCy by simply moving them to the DATA/SURROGATE/IN folder. They will be fetched and after parsing moved to DATA/SURROGATE/OUT. There are exceptions where the parser is not able to identify the original URL of the documents in the zim file. In that case the file is simply ignored. This commit also carries an important fix to the pdf parser and an increase of the maximum parsing speed to 60000 PPM which should make it possible to index up to 1000 files in one second. 
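For illustration, the verification flow corresponds to this sketch (method and
class names as introduced in this and the previous commit; the zim file path
is the only hypothetical value, and exception handling is omitted):

    ZIMFile z = new ZIMFile("/path/to/some.zim");                   // hypothetical sample path
    ZIMReader r = new ZIMReader(z);
    DirectoryEntry de = r.getMainDirectoryEntry();                  // main page of the zim archive
    String source = ZimImporter.getSource(r);                       // source guessed from zim metadata
    DigestURL mainURL = ZimImporter.guessURL(source, de);           // guessed original web location
    boolean ok = mainURL.exists(ClientIdentification.browserAgent); // HEAD request; 200/301/302 pass

If this check fails for the main entry or for one of the first ten article
URLs, the zim file is skipped. The new speed limit follows from the crawler's
busy-sleep formula 60000 / PPM: at the maximum of 60000 PPM the pause between
documents is 60000 / 60000 = 1 ms, i.e. up to 1000 documents per second.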
--- htroot/ConfigParser_p.html | 21 -- htroot/Crawler_p.html | 4 +- ivy.xml | 1 + .../cora/document/id/MultiProtocolURL.java | 14 +- .../yacy/document/importer/ZimImporter.java | 215 ++++++++++++++++-- .../net/yacy/document/parser/pdfParser.java | 126 +++------- source/net/yacy/htroot/ConfigParser_p.java | 10 - source/net/yacy/htroot/Crawler_p.java | 10 +- .../kelondro/data/meta/URIMetadataNode.java | 12 +- source/net/yacy/search/Switchboard.java | 18 +- .../net/yacy/search/SwitchboardConstants.java | 2 - source/org/openzim/ZIMFile.java | 32 +-- 12 files changed, 279 insertions(+), 186 deletions(-) diff --git a/htroot/ConfigParser_p.html b/htroot/ConfigParser_p.html index a51ee1013..66a4665d3 100644 --- a/htroot/ConfigParser_p.html +++ b/htroot/ConfigParser_p.html @@ -51,27 +51,6 @@ -
PDF Parser Attributes -

- This is an experimental setting which makes it possible to split PDF documents into individual index entries. - Every page will become a single index hit and the url is artifically extended with a post/get attribute value containing - the page number as value. When such an url is displayed within a search result, then the post/get attribute is transformed into an anchor hash link. - This makes it possible to view the individual page directly in the pdf.js viewer built-in into firefox, - for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options -

- - - - - - - - - - - - -
Split PDF
Property Name
#%env/templates/footer.template%# diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index 79a0319c0..3b328a996 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -134,7 +134,7 @@ Speed / PPM
(Pages Per Minute) - + @@ -147,7 +147,7 @@ Crawler PPM     - + diff --git a/ivy.xml b/ivy.xml index 61f9ee127..8c072699d 100644 --- a/ivy.xml +++ b/ivy.xml @@ -28,6 +28,7 @@ + diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index a3404bec0..1cac0dace 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -2593,14 +2593,18 @@ public class MultiProtocolURL implements Serializable, Comparable 0; } if (isHTTP() || isHTTPS()) { - try (final HTTPClient client = new HTTPClient(agent)) { - client.setHost(getHost()); - org.apache.http.HttpResponse response = client.HEADResponse(this, true); - return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301); - } + final HTTPClient client = new HTTPClient(agent); + client.setHost(getHost()); + org.apache.http.HttpResponse response = client.HEADResponse(this, true); + client.close(); + if (response == null) return false; + int status = response.getStatusLine().getStatusCode(); + return status == 200 || status == 301 || status == 302; } return false; } catch (IOException e) { + if (e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts + //e.printStackTrace(); return false; } } diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index 26f36f787..118e27e40 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -25,12 +25,20 @@ package net.yacy.document.importer; import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.Map; +import java.util.Set; import java.util.TreeMap; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; @@ -81,14 +89,18 @@ public class ZimImporter extends Thread implements Importer { public void run() { job = this; this.startTime = System.currentTimeMillis(); + Switchboard sb = Switchboard.getSwitchboard(); try { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); // verify the source DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); - DigestURL url = new DigestURL(mainEntry.url); - if (!url.exists(ClientIdentification.browserAgent)) return; + DigestURL mainURL = guessURL(this.guessedSource, mainEntry); + if (!mainURL.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL); + return; + } // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { @@ -98,8 +110,14 @@ public class ZimImporter extends Thread implements Importer { ArticleEntry ae = (ArticleEntry) de; // check url - String guessedUrl = guessURL(this.guessedSource, de); - assert guessedUrl.startsWith("http"); + DigestURL guessedUrl = guessURL(this.guessedSource, de); + if (recordCnt < 10) { + // critical test for the first 10 urls + if 
(!guessedUrl.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); + return; + } + } // check availability of text parser String mimeType = ae.getMimeType(); @@ -111,7 +129,17 @@ public class ZimImporter extends Thread implements Importer { // create artificial request and response headers for the indexer RequestHeader requestHeader = new RequestHeader(); ResponseHeader responseHeader = new ResponseHeader(200); - final Request request = new Request(new DigestURL(guessedUrl), null); + responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content + final Request request = new Request( + ASCII.getBytes(sb.peers.mySeed().hash), + guessedUrl, + null, // referrerhash the hash of the referrer URL + de.title, // name the name of the document to crawl + null, // appdate the time when the url was first time appeared + sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! + 0, // depth the crawling depth of the entry + sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset + ); final Response response = new Response( request, requestHeader, @@ -122,7 +150,7 @@ public class ZimImporter extends Thread implements Importer { ); // throw this to the indexer - String error = Switchboard.getSwitchboard().toIndexer(response); + String error = sb.toIndexer(response); if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); this.recordCnt++; } @@ -203,7 +231,7 @@ public class ZimImporter extends Thread implements Importer { case "fonts": return "fonts.google.com"; case "gutenberg": - return "gutenberg.org"; + return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03"; case "ifixit": return "ifixit.com"; case "lesfondamentaux": @@ -223,11 +251,23 @@ public class ZimImporter extends Thread implements Importer { case "rapsberry_pi_docs": return "raspberrypi.org"; case "ted": - return "ted.com"; + return "www.ted.com/search?q="; case "vikidia": - return "vikidia.org"; + return parts[1] + ".vikidia.org/wiki"; case "westeros": return "westeros.org"; + case "wikihow": + return parts[1].equals("en") ? 
"wikihow.com" : parts[1] + ".wikihow.com"; + case "wikisource": + return parts[1] + ".wikisource.org/wiki"; + case "wikiversity": + return parts[1] + ".wikiversity.org/wiki"; + case "wikivoyage": + return parts[1] + ".wikivoyage.org/wiki"; + case "wiktionary": + return parts[1] + ".wiktionary.org/wiki"; + case "wikiquote": + return parts[1] + ".wikiquote.org/wiki"; case "wikibooks": return parts[1] + ".wikibooks.org/wiki"; case "wikinews": @@ -273,16 +313,148 @@ public class ZimImporter extends Thread implements Importer { return source; } - public static String guessURL(String guessedSource, DirectoryEntry de) { + public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException { String url = de.url; if (url.equals("Main_Page")) url = ""; - if (guessedSource != null) return guessedSource + url; - if (url.startsWith("A/")) return "https://" + url.substring(2); - if (url.startsWith("H/")) return "https://" + url.substring(2); - return guessedSource + url; + if (guessedSource != null) return new DigestURL(guessedSource + url); + if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2)); + if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2)); + return new DigestURL(guessedSource + url); } + private final static String[] skip_files = { + "iota.stackexchange.com_en_all_2023-05.zim", + "stellar.stackexchange.com_en_all_2023-10.zim", + "vegetarianism.stackexchange.com_en_all_2023-05.zim", + "esperanto.stackexchange.com_eo_all_2023-10.zim", + "tezos.stackexchange.com_en_all_2023-10.zim", + "eosio.stackexchange.com_en_all_2023-10.zim", + "ebooks.stackexchange.com_en_all_2023-10.zim", + "poker.stackexchange.com_en_all_2023-05.zim", + "cseducators.stackexchange.com_en_all_2023-10.zim", + "iot.stackexchange.com_en_all_2023-05.zim", + "portuguese.stackexchange.com_pt_all_2023-04.zim", + "portuguese.stackexchange.com_pt_all_2023-10.zim", + "italian.stackexchange.com_it_all_2023-05.zim", + "monero.stackexchange.com_en_all_2022-11.zim", + "sustainability.stackexchange.com_en_all_2023-05.zim", + "westeros_en_all_nopic_2021-03.zim", + "opensource.stackexchange.com_en_all_2023-10.zim", + "tor.stackexchange.com_en_all_2023-05.zim", + "devops.stackexchange.com_en_all_2023-10.zim", + "patents.stackexchange.com_en_all_2023-10.zim", + "stackapps.com_en_all_2023-05.zim", + "hardwarerecs.stackexchange.com_en_all_2023-05.zim", + "hsm.stackexchange.com_en_all_2023-05.zim", + "expatriates.stackexchange.com_en_all_2023-11.zim", + "opendata.stackexchange.com_en_all_2023-10.zim", + "sports.stackexchange.com_en_all_2023-05.zim", + "wikinews_de_all_nopic_2023-10.zim", + "computergraphics.stackexchange.com_en_all_2023-10.zim", + "tridion.stackexchange.com_en_all_2023-10.zim", + "bioinformatics.stackexchange.com_en_all_2023-10.zim", + "expressionengine.stackexchange.com_en_all_2023-11.zim", + "elementaryos.stackexchange.com_en_all_2023-10.zim", + "cstheory.stackexchange.com_en_all_2023-10.zim", + "chess.stackexchange.com_en_all_2023-05.zim", + "vi.stackexchange.com_en_all_2023-05.zim", + "fitness.stackexchange.com_en_all_2023-10.zim", + "pets.stackexchange.com_en_all_2023-05.zim", + "french.stackexchange.com_fr_all_2023-10.zim", + "sqa.stackexchange.com_en_all_2023-05.zim", + "islam.stackexchange.com_en_all_2023-05.zim", + "scicomp.stackexchange.com_en_all_2023-05.zim", + "wikinews_en_all_nopic_2023-09.zim", + "ai.stackexchange.com_en_all_2023-10.zim", + "boardgames.stackexchange.com_en_all_2023-05.zim", + 
"economics.stackexchange.com_en_all_2023-05.zim", + "3dprinting.stackexchange.com_en_all_2023-07.zim", + "earthscience.stackexchange.com_en_all_2023-05.zim", + "emacs.stackexchange.com_en_all_2023-10.zim", + "bitcoin.stackexchange.com_en_all_2023-05.zim", + "philosophy.stackexchange.com_en_all_2023-05.zim", + "law.stackexchange.com_en_all_2023-05.zim", + "astronomy.stackexchange.com_en_all_2023-05.zim", + "artofproblemsolving_en_all_nopic_2021-03.zim", + "engineering.stackexchange.com_en_all_2023-05.zim", + "ja.stackoverflow.com_ja_all_2023-06.zim", + "webmasters.stackexchange.com_en_all_2023-05.zim", + "anime.stackexchange.com_en_all_2023-10.zim", + "cooking.stackexchange.com_en_all_2023-05.zim", + "arduino.stackexchange.com_en_all_2023-05.zim", + "money.stackexchange.com_en_all_2023-05.zim", + "judaism.stackexchange.com_en_all_2023-05.zim", + "ethereum.stackexchange.com_en_all_2023-05.zim", + "datascience.stackexchange.com_en_all_2023-10.zim", + "academia.stackexchange.com_en_all_2023-10.zim", + "music.stackexchange.com_en_all_2023-05.zim", + "cs.stackexchange.com_en_all_2023-03.zim", + "dsp.stackexchange.com_en_all_2023-05.zim", + "biology.stackexchange.com_en_all_2023-05.zim", + "android.stackexchange.com_en_all_2023-10.zim", + "bicycles.stackexchange.com_en_all_2023-05.zim", + "puzzling.stackexchange.com_en_all_2023-05.zim", + "photo.stackexchange.com_en_all_2023-05.zim", + "aviation.stackexchange.com_en_all_2023-05.zim", + "drupal.stackexchange.com_en_all_2023-05.zim", + "ux.stackexchange.com_en_all_2023-05.zim", + "ell.stackexchange.com_en_all_2023-10.zim", + "openstreetmap-wiki_en_all_nopic_2023-05.zim", + "softwareengineering.stackexchange.com_en_all_2023-05.zim", + "gaming.stackexchange.com_en_all_2023-10.zim", + "mathematica.stackexchange.com_en_all_2023-10.zim", + "pt.stackoverflow.com_pt_all_2023-06.zim", + "apple.stackexchange.com_en_all_2023-05.zim", + "diy.stackexchange.com_en_all_2023-08.zim", + "es.stackoverflow.com_es_all_2023-06.zim", + "gis.stackexchange.com_en_all_2023-05.zim", + "stats.stackexchange.com_en_all_2023-05.zim", + "physics.stackexchange.com_en_all_2023-05.zim", + "serverfault.com_en_all_2023-05.zim", + "electronics.stackexchange.com_en_all_2023-05.zim", + "tex.stackexchange.com_en_all_2023-05.zim", + "wikibooks_de_all_nopic_2021-03.zim", + "askubuntu.com_en_all_2023-05.zim", + "superuser.com_en_all_2023-05.zim", + "lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim", + "wikibooks_en_all_nopic_2021-03.zim", + "courses.lumenlearning.com_en_all_2021-03.zim", + "wikipedia_de_all_nopic_2023-10.zim", + "wikipedia_en_all_nopic_2023-10.zim", + "stackoverflow.com_en_all_nopic_2022-07.zim", + "stackoverflow.com_en_all_2023-05.zim", + "armypubs_en_all_2023-08.zim", + "vikidia_en_all_nopic_2023-09.zim", + "wikiquote_de_all_nopic_2023-10.zim", + "wikiquote_en_all_nopic_2023-09.zim", + "wiktionary_de_all_nopic_2023-10.zim", + "wiktionary_en_all_nopic_2023-10.zim", + "wikihow_de_maxi_2023-10.zim", + "wikivoyage_de_all_nopic_2023-09.zim", + "wikiversity_de_all_nopic_2021-03.zim", + "wikiversity_en_all_nopic_2021-03.zim", + "wikisource_de_all_nopic_2023-09.zim", + "wikisource_en_all_nopic_2023-08.zim", + "ted_countdown_global_2023-09.zim", + "ted_en_design_2023-09.zim", + "ted_en_business_2023-09.zim", + "ted_en_global_issues_2023-09.zim", + + // 302 + "moderators.stackexchange.com_en_all_2023-05.zim", + "beer.stackexchange.com_en_all_2023-05.zim", + "health.stackexchange.com_en_all_2023-05.zim", + "avp.stackexchange.com_en_all_2023-05.zim", + 
"lowtechmagazine.com_en_all_2023-08.zim", + "ifixit_de_all_2023-07.zim", + "ifixit_en_all_2023-10.zim", + "der-postillon.com_de_all_2020-12.zim", + "wikihow_en_maxi_2023-03.zim", + }; + public static void main(String[] args) { + Set skip = new HashSet<>(); + for (String s: skip_files) skip.add(s); // zim file import test // will test mostly if domain names are included in zim file urls String zimFilesPath = args[0]; @@ -298,7 +470,10 @@ public class ZimImporter extends Thread implements Importer { } Collection orderedFiles = orderedFileMap.values(); + Set files_ok = new LinkedHashSet<>(); + Set files_nok = new LinkedHashSet<>(); for (File f: orderedFiles) { + if (skip.contains(f.getName())) continue; try { ZIMFile z = new ZIMFile(f.getAbsolutePath()); ZIMReader r = new ZIMReader(z); @@ -308,16 +483,20 @@ public class ZimImporter extends Thread implements Importer { System.out.println("Namespace: " + de.namespace); System.out.println("Title: " + de.title); System.out.println("URL: " + de.url); - System.out.println("guessed domain: " + guessDomainName(f.getName())); + System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name String source = getSource(r); - System.out.println("guessed Source: " + source); - String mainURL = guessURL(source, de); + System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file + DigestURL mainURL = guessURL(source, de); System.out.println("guessed main article: " + mainURL); - System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent)); + boolean ok = mainURL.exists(ClientIdentification.browserAgent); + System.out.println("main article exists: " + ok); + if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName()); System.out.println(); } catch (IOException e) { e.printStackTrace(); } } + System.out.println("ok files: " + files_ok.toString()); + System.out.println("not-ok files: " + files_nok.toString()); } } diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 0ad6b2248..f02577244 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.text.PDFTextStripper; -import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; @@ -69,9 +68,6 @@ import net.yacy.kelondro.util.MemoryControl; public class pdfParser extends AbstractParser implements Parser { - public static boolean individualPages = false; - public static String individualPagePropertyname = "page"; - public pdfParser() { super("Acrobat Portable Document Parser"); this.SUPPORTED_EXTENSIONS.add("pdf"); @@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser { // get the links final List> pdflinks = extractPdfLinks(pdfDoc); - // get the fulltext (either per document or for each page) - final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/); - - if (individualPages) { - // this is a hack which stores individual pages of the source pdf into individual index documents - // the new documents will get a virtual link with a post argument page=X appended to the 
original url - - // collect text - final int pagecount = pdfDoc.getNumberOfPages(); - final String[] pages = new String[pagecount]; - for (int page = 1; page <= pagecount; page++) { - stripper.setStartPage(page); - stripper.setEndPage(page); - pages[page - 1] = stripper.getText(pdfDoc); - //System.out.println("PAGE " + page + ": " + pages[page - 1]); - } - - // create individual documents for each page - assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); - result = new Document[Math.min(pages.length, pdflinks.size())]; - final String loc = location.toNormalform(true); - for (int page = 0; page < result.length; page++) { - result[page] = new Document( - new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), - pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), - null, - null, - false, - docDate); - } - } else { - // collect the whole text at once - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; - stripper.setEndPage(3); // get first 3 pages (always) - writer.append(stripper.getText(pdfDoc)); - contentBytes = writer.getBytes(); // remember text in case of interrupting thread - - if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read - stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) - stripper.setEndPage(Integer.MAX_VALUE); // set to default - // we start the pdf parsing in a separate thread to ensure that it can be terminated - final PDDocument pdfDocC = pdfDoc; - final Thread t = new Thread("pdfParser.getText:" + location) { - @Override - public void run() { - try { - writer.append(stripper.getText(pdfDocC)); - } catch (final Throwable e) {} - } - }; - t.start(); - t.join(3000); // pdfbox likes to forget to terminate ... 
(quite often) - if (t.isAlive()) t.interrupt(); - contentBytes = writer.getBytes(); // get final text before closing writer - writer.close(); // free writer resources - } - - final Collection pdflinksCombined = new HashSet<>(); - for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); - result = new Document[]{new Document( - location, - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - contentBytes, - pdflinksCombined, - null, - null, - false, - docDate)}; - } + // collect the whole text at once + final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); + byte[] contentBytes = new byte[0]; + final PDFTextStripper stripper = new PDFTextStripper(); + stripper.setEndPage(Integer.MAX_VALUE); + writer.append(stripper.getText(pdfDoc)); + contentBytes = writer.getBytes(); // remember text in case of interrupting thread + writer.close(); // free writer resources + + final Collection pdflinksCombined = new HashSet<>(); + for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); + result = new Document[]{new Document( + location, + mimeType, + StandardCharsets.UTF_8.name(), + this, + null, + docKeywords, + singleList(docTitle), + docAuthor, + docPublisher, + null, + null, + 0.0d, 0.0d, + contentBytes, + pdflinksCombined, + null, + null, + false, + docDate)}; } catch (final Throwable e) { //throw new Parser.Failure(e.getMessage(), location); } finally { diff --git a/source/net/yacy/htroot/ConfigParser_p.java b/source/net/yacy/htroot/ConfigParser_p.java index e466d783b..943279382 100644 --- a/source/net/yacy/htroot/ConfigParser_p.java +++ b/source/net/yacy/htroot/ConfigParser_p.java @@ -61,13 +61,6 @@ public class ConfigParser_p { env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime()); env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension()); } - - if (post.containsKey("pdfSettings")) { - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages")); - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page")); - pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); - pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); - } } int i = 0; @@ -94,9 +87,6 @@ public class ConfigParser_p { prop.put("parser", i); - prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false)); - prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page")); - // return rewrite properties return prop; } diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java index e95562713..8c898f558 100644 --- a/source/net/yacy/htroot/Crawler_p.java +++ b/source/net/yacy/htroot/Crawler_p.java @@ -774,7 +774,7 @@ public class Crawler_p { } /* - * PPM + * PPM LF MH @@ -784,19 +784,19 @@ public class Crawler_p { if (post != null && post.containsKey("crawlingPerformance")) { final String crawlingPerformance = post.get("crawlingPerformance", "custom"); final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); - int wantedPPM = (LCbusySleep1 == 0) ? 
30000 : (int) (60000L / LCbusySleep1); + int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1); try { wantedPPM = post.getInt("customPPM", wantedPPM); } catch (final NumberFormatException e) {} if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10; - if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000; + if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000; int wPPM = wantedPPM; if ( wPPM <= 0 ) { wPPM = 1; } - if ( wPPM >= 30000 ) { - wPPM = 30000; + if ( wPPM >= 60000 ) { + wPPM = 60000; } final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 33b797524..2d93ec8b7 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparablepublic static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds

diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index 906bf30a9..a241507ab 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -113,20 +113,24 @@ public class ZIMFile extends File { } this.mimeTypeList = mList.toArray(new String[mList.size()]); - // Initialize the Url Pointer List - this.urlPtrListBlob = new byte[this.header_entryCount * 8]; - mReader.seek(this.header_urlPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); - - // Initialize the Title Pointer List - this.titlePtrListBlob = new byte[this.header_entryCount * 4]; - mReader.seek(this.header_titlePtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); - - // Initialize the Cluster Pointer List - this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; - mReader.seek(this.header_clusterPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + try { + // Initialize the Url Pointer List + this.urlPtrListBlob = new byte[this.header_entryCount * 8]; + mReader.seek(this.header_urlPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); + + // Initialize the Title Pointer List + this.titlePtrListBlob = new byte[this.header_entryCount * 4]; + mReader.seek(this.header_titlePtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); + + // Initialize the Cluster Pointer List + this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; + mReader.seek(this.header_clusterPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + } catch (IndexOutOfBoundsException e) { + throw new IOException(e.getMessage()); + } } public final String getMimeType(int idx) { From 34a9fc1a076e89a67b351bc19bd1c2a67e730c60 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 5 Nov 2023 12:46:37 +0100 Subject: [PATCH 04/11] bugfixes to zim reader: --- .../cora/document/id/MultiProtocolURL.java | 1 - .../yacy/document/importer/ZimImporter.java | 40 +++++++++++++++++-- source/org/openzim/ZIMFile.java | 1 + source/org/openzim/ZIMReader.java | 16 +++++++- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 1cac0dace..768ca0aa6 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -37,7 +37,6 @@ import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URLDecoder; -import java.net.http.HttpResponse; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.BitSet; diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index 118e27e40..bc7266e0a 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -26,11 +26,13 @@ package net.yacy.document.importer; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.net.URL; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.LinkedHashSet; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; @@ -93,6 +95,8 @@ public class ZimImporter extends 
Thread implements Importer {
         try {
             this.reader = new ZIMReader(this.file);
             this.guessedSource = getSource(this.reader);
+            Date guessedDate = getDate(this.reader);
+            String dates = HeaderFramework.newRfc1123Format().format(guessedDate);
 
             // verify the source
             DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
@@ -108,6 +112,7 @@
             DirectoryEntry de = this.reader.getDirectoryInfo(i);
             if (!(de instanceof ZIMReader.ArticleEntry)) continue;
             ArticleEntry ae = (ArticleEntry) de;
+            if (ae.namespace != 'C' && ae.namespace != 'A') continue;
 
             // check url
             DigestURL guessedUrl = guessURL(this.guessedSource, de);
@@ -121,6 +126,7 @@
 
             // check availability of text parser
             String mimeType = ae.getMimeType();
+            if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible
             if (TextParser.supportsMime(mimeType) != null) continue;
 
             // read the content
@@ -130,6 +136,7 @@
             RequestHeader requestHeader = new RequestHeader();
             ResponseHeader responseHeader = new ResponseHeader(200);
             responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell the parser which kind of content
+            responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessed date to have something that is not the current date
             final Request request = new Request(
                     ASCII.getBytes(sb.peers.mySeed().hash),
                     guessedUrl,
@@ -230,8 +237,6 @@
                 return "fas.org";
             case "fonts":
                 return "fonts.google.com";
-            case "gutenberg":
-                return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03";
             case "ifixit":
                 return "ifixit.com";
             case "lesfondamentaux":
@@ -313,12 +318,22 @@
         return source;
     }
 
+    public static Date getDate(ZIMReader r) throws IOException {
+        String date = r.getMetadata("Date");
+        if (date != null) try {
+            SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
+            return format.parse(date);
+        } catch (ParseException e) {}
+        // failover situation: use the file date
+        return new Date(r.getZIMFile().lastModified());
+    }
+
     public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
         String url = de.url;
         if (url.equals("Main_Page")) url = "";
-        if (guessedSource != null) return new DigestURL(guessedSource + url);
         if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
         if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
+        if (guessedSource != null) return new DigestURL(guessedSource + url);
         return new DigestURL(guessedSource + url);
     }
 
@@ -439,6 +454,22 @@
             "ted_en_design_2023-09.zim",
             "ted_en_business_2023-09.zim",
             "ted_en_global_issues_2023-09.zim",
+            "opentextbooks_en_all_2023-08.zim",
+            "bestedlessons.org_en_all_2023-08.zim",
+            "wikivoyage_en_all_nopic_2023-10.zim",
+            "based.cooking_en_all_2023-10.zim",
+            "wordnet_en_all_2023-04.zim",
+            "internet-encyclopedia-philosophy_en_all_2023-08.zim",
+            "100r-off-the-grid_en_2023-09.zim",
+            "coopmaths_2023-04.zim",
+            "birds-of-ladakh_en_all_2023-02.zim",
+            "storyweaver.org_en_2023-09.zim",
+            "developer.mozilla.org_en_all_2023-02.zim",
+            "www.ready.gov_es_2023-06.zim",
+            "teoria.com_en_2023-08.zim",
+            "theworldfactbook_en_all_2023-06.zim",
"theworldfactbook_en_all_2023-06.zim", + "mutopiaproject.org_en_2023-08.zim", + "dp.la_en_all_2023-08.zim", // 302 "moderators.stackexchange.com_en_all_2023-05.zim", @@ -483,6 +514,7 @@ public class ZimImporter extends Thread implements Importer { System.out.println("Namespace: " + de.namespace); System.out.println("Title: " + de.title); System.out.println("URL: " + de.url); + System.out.println("Mime Type " + de.getMimeType()); System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name String source = getSource(r); System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index a241507ab..cbde3a0a8 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -134,6 +134,7 @@ public class ZIMFile extends File { } public final String getMimeType(int idx) { + if (idx >= this.mimeTypeList.length) return ""; return this.mimeTypeList[idx]; } diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index 27d544e27..14bc47dfd 100644 --- a/source/org/openzim/ZIMReader.java +++ b/source/org/openzim/ZIMReader.java @@ -237,11 +237,25 @@ public class ZIMReader { public DirectoryEntry getMainDirectoryEntry() throws IOException { DirectoryEntry de = getDirectoryInfo(this.mFile.header_mainPage); - if (de.namespace == 'W' && de.url.equals("mainPage") && de instanceof RedirectEntry) { + if (de instanceof RedirectEntry) { // resolve redirect to get the actual main page int redirect = ((RedirectEntry) de).redirect_index; de = getDirectoryInfo(redirect); } + // For the main entry we demand a "text/html" mime type. + // Many zim files do not provide this as the main file, which is strange (maybe lazy/irresponsibe) + // Because the main entry is important for a validation, we seek for one entry which may + // be proper for indexing. 
+        int entryNumber = 0;
+        while (!de.getMimeType().equals("text/html") && entryNumber < this.mFile.header_entryCount) {
+            de = getDirectoryInfo(entryNumber);
+            entryNumber++;
+            if (de.namespace != 'C' && de.namespace != 'A') continue;
+            if (!(de instanceof ArticleEntry)) continue;
+            if (!de.getMimeType().equals("text/html")) continue;
+            if (de.url.contains("404") || de.title.contains("404") || de.title.contains("301")) continue; // is a pain
+            return de;
+        }
         return de;
     }
 
From 24011dcbcc26f79f80a81a8dae59c6f65c1cee25 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 6 Nov 2023 22:44:18 +0100
Subject: [PATCH 05/11] more file name extensions for json list surrogate files

---
 source/net/yacy/search/Switchboard.java | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index bebd16cbd..bfadb1892 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2166,7 +2166,10 @@ public final class Switchboard extends serverSwitch {
                 this.log.warn("IO Error processing zim file " + infile);
             }
             return moved;
-        } else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) {
+        } else if (
+                s.endsWith(".jsonl") || s.endsWith(".jsonl.gz") ||
+                s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") ||
+                s.endsWith(".flatjson") || s.endsWith(".flatjson.gz")) {
             return this.processSurrogateJson(infile, outfile);
         }
         InputStream is = null;

From 655d8db80218312593f37b6026fd6dc0db01d23f Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 12 Nov 2023 15:26:18 +0100
Subject: [PATCH 06/11] detailed directions in index export to explain how the
 export can be imported again using elasticsearch/opensearch

---
 htroot/IndexExport_p.html | 51 +++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html
index 1aa992716..eb3e1f188 100644
--- a/htroot/IndexExport_p.html
+++ b/htroot/IndexExport_p.html
@@ -9,11 +9,10 @@
     #%env/templates/header.template%#
     #%env/templates/submenuIndexImport.template%#
 
-
-
+

Index Export

The local index currently contains #[ucount]# documents; only #[ucount200]# of these are exportable with status code 200 - the remaining are error documents.

- + #(lurlexport)#::
Loaded URL Export @@ -34,19 +33,45 @@
Full Data Records:
-
XML (Rich and full-text Solr data, one document per line in one large xml file, can be processed with shell tools, can be imported with DATA/SURROGATE/in/)
- JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file, can be bulk-imported to elasticsearch with the command "curl -XPOST localhost:9200/collection1/yacy/_bulk --data-binary @yacy_dump_XXX.flatjson")
- XML (RSS)
+
+ JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file, + can be bulk-imported to elasticsearch. Here is an example for opensearch, using docker:
+Start a docker container of opensearch:
+docker run --name opensearch -p 9200:9200 -d -e OPENSEARCH_JAVA_OPTS="-Xms2G -Xmx2G" -e discovery.type=single-node -e DISABLE_SECURITY_PLUGIN=true -v $(pwd)/opensearch_data:/usr/share/opensearch/data opensearchproject/opensearch:latest
+Unblock index creation:
+curl -X PUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d' +{ + "persistent": { + "cluster.blocks.create_index": null + } +}'
+Create the search index:
+curl -X PUT "http://localhost:9200/collection1/yacy"
+Bulk-upload the index file:
+curl -XPOST "http://localhost:9200/collection1/yacy/_bulk?filter_path=took,errors" -H "Content-Type: application/x-ndjson" --data-binary @yacy_dump_XXX.flatjson
+Run a search that returns 10 results, querying the fields text_t, title, and description with boosts:
+curl -X POST "http://localhost:9200/collection1/yacy/_search" -H 'Content-Type: application/json' -d' +{"size": 10, "query": {"multi_match": { + "query": "one two three", + "fields": ["text_t", "title^10", "description^3"], "fuzziness": "AUTO" +}}}'
+ + XML (Rich and full-text Solr data, one document per line in one large xml file, + can be processed with shell tools, can be imported with DATA/SURROGATE/in/) +
+ + XML (RSS) +
Full URL List:
Plain Text List (URLs only)
HTML (URLs with title)
Only Domain:
Plain Text List (domains only)
HTML (domains as URLs, no title)
-
Only Text:
+
Only Text:
Fulltext of Search Index Text
-
-
+ +
 
@@ -55,16 +80,16 @@ ::
Export to file #[exportfile]# is running .. #[urlcount]# Documents so far
:: #(/lurlexport)# - - #(lurlexportfinished)#:: + + #(lurlexportfinished)#::
Finished export of #[urlcount]# Documents to file #[exportfile]#
Import this file by moving it to DATA/SURROGATES/in
:: #(/lurlexportfinished)# - + #(lurlexporterror)#::
Export to file #[exportfile]# failed: #[exportfailmsg]#
:: #(/lurlexporterror)# - + #(dumprestore)#::
Dump and Restore of Solr Index

From c20c4b8a21364bf06d1f91c63650fcf3b434ba04 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 12 Nov 2023 22:11:55 +0100
Subject: [PATCH 07/11] modified export: added maximum number of docs per chunk

The export can now produce several files, called chunks. By default only one
chunk is exported. This function is required when the exported files shall be
imported into an elasticsearch/opensearch index, because the bulk import
function of elasticsearch/opensearch is limited to 100MB. To make such imports
possible, YaCy dumps must be split into chunks. Right now we cannot estimate
the chunk size in bytes, only as a number of documents, so the user must
experiment to find the optimum maximum chunk size; 50000 documents per chunk
is a reasonable first attempt.
---
 htroot/IndexExport_p.html                  |  11 +-
 source/net/yacy/htroot/IndexExport_p.java  |  64 +++----
 source/net/yacy/search/index/Fulltext.java | 197 ++++++++++++++-------
 3 files changed, 171 insertions(+), 101 deletions(-)

diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html
index eb3e1f188..87ee4b62d 100644
--- a/htroot/IndexExport_p.html
+++ b/htroot/IndexExport_p.html
@@ -21,13 +21,16 @@
URL Filter
-
+
 .*.* (default) is a catch-all; format: java regex
query
-
+
 *:* (default) is a catch-all; format: field:value
-
maximum age (seconds, -1 = unlimited)
-
+
maximum age (seconds)
+
 -1 = unlimited -> no document is too old +
+
maximum number of records per chunk
+
 if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)
Export Format
diff --git a/source/net/yacy/htroot/IndexExport_p.java b/source/net/yacy/htroot/IndexExport_p.java index 78cc94132..aa5fc6f09 100644 --- a/source/net/yacy/htroot/IndexExport_p.java +++ b/source/net/yacy/htroot/IndexExport_p.java @@ -64,8 +64,8 @@ public class IndexExport_p { prop.put("lurlexport", 0); prop.put("reload", 0); prop.put("dumprestore", 1); - prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, - SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); + prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, + SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); List dumpFiles = segment.fulltext().dumpFiles(); prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); prop.put("dumprestore_optimizemax", 10); @@ -80,7 +80,7 @@ public class IndexExport_p { prop.put("lurlexportfinished", 0); prop.put("lurlexporterror", 0); prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + prop.put("lurlexport_urlcount", export.docCount()); prop.put("reload", 1); } else { prop.put("lurlexport", 1); @@ -93,7 +93,7 @@ public class IndexExport_p { // an export was running but has finished prop.put("lurlexportfinished", 1); prop.put("lurlexportfinished_exportfile", export.file().toString()); - prop.put("lurlexportfinished_urlcount", export.count()); + prop.put("lurlexportfinished_urlcount", export.docCount()); if (export.failed() == null) { prop.put("lurlexporterror", 0); } else { @@ -123,6 +123,8 @@ public class IndexExport_p { final String filter = post.get("exportfilter", ".*"); final String query = post.get("exportquery", "*:*"); final int maxseconds = post.getInt("exportmaxseconds", -1); + long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE); + if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE; final String path = post.get("exportfilepath", ""); // store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail @@ -130,7 +132,7 @@ public class IndexExport_p { // start the export try { - export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text); + export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize); } catch (final IOException e) { prop.put("lurlexporterror", 1); prop.put("lurlexporterror_exportfile", "-no export-"); @@ -140,7 +142,7 @@ public class IndexExport_p { // show result prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + prop.put("lurlexport_urlcount", export.docCount()); if ((export != null) && (export.failed() == null)) { prop.put("lurlexport", 2); } @@ -148,34 +150,34 @@ public class IndexExport_p { } if (post.containsKey("indexdump")) { - try { - final File dump = segment.fulltext().dumpEmbeddedSolr(); - prop.put("indexdump", 1); - prop.put("indexdump_dumpfile", dump.getAbsolutePath()); - dumpFiles = segment.fulltext().dumpFiles(); - prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? 
"" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); - // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexdump", 2); - } else { - prop.put("indexdump", 3); - } - } + try { + final File dump = segment.fulltext().dumpEmbeddedSolr(); + prop.put("indexdump", 1); + prop.put("indexdump_dumpfile", dump.getAbsolutePath()); + dumpFiles = segment.fulltext().dumpFiles(); + prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); + // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexdump", 2); + } else { + prop.put("indexdump", 3); + } + } } if (post.containsKey("indexrestore")) { - try { - final File dump = new File(post.get("dumpfile", "")); - segment.fulltext().restoreEmbeddedSolr(dump); - prop.put("indexRestore", 1); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexRestore", 2); - } else { - prop.put("indexRestore", 3); - } - } + try { + final File dump = new File(post.get("dumpfile", "")); + segment.fulltext().restoreEmbeddedSolr(dump); + prop.put("indexRestore", 1); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexRestore", 2); + } else { + prop.put("indexRestore", 3); + } + } } // insert constants diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 718be0099..d8a1754a7 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -695,7 +695,10 @@ public final class Fulltext { } public final static String yacy_dump_prefix = "yacy_dump_"; - public Export export(Fulltext.ExportFormat format, String filter, String query, final int maxseconds, File path, boolean dom, boolean text) throws IOException { + public Export export( + Fulltext.ExportFormat format, String filter, String query, + final int maxseconds, File path, boolean dom, boolean text, + long maxChunkSize) throws IOException { // modify query according to maxseconds final long now = System.currentTimeMillis(); @@ -760,27 +763,26 @@ public final class Fulltext { } } - String s = new File(path, yacy_dump_prefix + + String filename = yacy_dump_prefix + "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" + "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" + "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" + - "c" + String.format("%1$012d", doccount)).getAbsolutePath() + "_tc"; // the name ends with the transaction token ('c' = 'created') + "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created') - // create export file name - if (s.indexOf('.',0) < 0) s += "." 
+ format.getExt(); - final File f = new File(s); - f.getParentFile().mkdirs(); - - return export(f, filter, query, format, dom, text); + return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize); } // export methods - public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + public Export export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + long maxChunkSize) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(f, filter, query, format, dom, text); + this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize); this.exportthread.start(); return this.exportthread; } @@ -795,69 +797,95 @@ public final class Fulltext { } public class Export extends Thread { - private final File f; + private final File path; + private final String filename, fileext; private final Pattern pattern; - private int count; private String failure; private final String query; private final ExportFormat format; private final boolean dom, text; - - private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + private int docCount, chunkSize, chunkCount; + private final long maxChunkSize; + + private Export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + long maxChunkSize) { super("Fulltext.Export"); // format: 0=text, 1=html, 2=rss/xml - this.f = f; + this.path = path; + this.filename = filename; + this.fileext = fileext; this.pattern = filter == null ? null : Pattern.compile(filter); this.query = query == null? 
AbstractSolrConnector.CATCHALL_QUERY : query; - this.count = 0; this.failure = null; this.format = format; this.dom = dom; this.text = text; + this.docCount = 0; // number of all documents exported so far + this.chunkSize = 0; // number of documents in the current chunk + this.chunkCount = 0; // number of chunks opened so far + this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk //if ((dom) && (format == 2)) dom = false; } + private void printHead(PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(""); + pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); + pw.println(""); + pw.println("http://yacy.net"); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(" "); + pw.println(" "); + pw.println(" " + this.query + ""); + pw.println(" "); + pw.println(""); + pw.println(""); + } + } + + private void printTail(PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + } + } + @Override public void run() { try { - final File parentf = this.f.getParentFile(); - if (parentf != null) { - parentf.mkdirs(); - } + if (this.path != null) this.path.mkdirs(); } catch(final Exception e) { ConcurrentLog.logException(e); this.failure = e.getMessage(); return; } - try (/* Resources automatically closed by this try-with-resources statement */ - final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f); - final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os; - final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream)); - ) { - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(""); - pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); - pw.println(""); - pw.println("http://yacy.net"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(" "); - pw.println(" "); - pw.println(" " + this.query + ""); - pw.println(" "); - pw.println(""); - pw.println(""); - } + try { + docCount = 0; + chunkSize = 0; + chunkCount = 0; + PrintWriter pw = getWriter(); + printHead(pw); if (this.dom) { final Map> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); final ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); @@ -865,7 +893,7 @@ public final class Fulltext { if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; if (this.format == ExportFormat.text) pw.println(host); if (this.format == ExportFormat.html) pw.println("" + host + "
"); - this.count++; + this.docCount++; this.chunkSize++; } } else { if (this.format == ExportFormat.solr || this.format == ExportFormat.elasticsearch || (this.text && this.format == ExportFormat.text)) { @@ -882,7 +910,14 @@ public final class Fulltext { if (this.format == ExportFormat.elasticsearch) pw.println("{\"index\":{}}"); final String d = sw.toString(); pw.println(d); - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + pw = getWriter(); // increases chunkCount as side-effect + printHead(pw); + this.chunkSize = 0; + } } } else { final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, @@ -918,21 +953,19 @@ public final class Fulltext { pw.println("" + hash + ""); pw.println(""); } - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + pw = getWriter(); // increases chunkCount as side-effect + printHead(pw); + this.chunkSize = 0; + } } } } - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println("
"); - pw.println("
"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - } + printTail(pw); + pw.close(); } catch (final Exception e) { /* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */ ConcurrentLog.logException(e); @@ -942,15 +975,47 @@ public final class Fulltext { } public File file() { - return this.f; + final File f = new File(this.path, this.filename + "_" + chunkcount(this.chunkCount) + "." + this.fileext); + return f; + } + + private PrintWriter getWriter() throws IOException { + File f = file(); + final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f); + final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os)); + this.chunkCount++; + return pw; + } + + private String chunkcount(int count) { + if (count < 10) return "000" + count; + if (count < 100) return "00" + count; + if (count < 1000) return "0" + count; + return "" + count; + } + + public File path() { + return this.path; + } + + public String filename() { + return this.filename; + } + + public String fileext() { + return this.fileext; } public String failed() { return this.failure; } - public int count() { - return this.count; + public int docCount() { + return this.docCount; + } + + public int chunkCount() { + return this.chunkCount; } @SuppressWarnings("unchecked") From 3268a93019aa00b49fdfb5925d7caac3c1a94274 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 10:27:50 +0100 Subject: [PATCH 08/11] added a 'minified' option to YaCy dumps --- htroot/IndexExport_p.html | 5 ++ source/net/yacy/htroot/IndexExport_p.java | 3 +- source/net/yacy/search/index/Fulltext.java | 87 ++++++++++++++-------- 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html index 87ee4b62d..df58837c2 100644 --- a/htroot/IndexExport_p.html +++ b/htroot/IndexExport_p.html @@ -32,6 +32,11 @@
         <dt>maximum number of records per chunk</dt>
         <dd>if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)</dd>
+        <dt>Export Size</dt>
+        <dd>
+          <input type="radio" name="minified" value="no" checked="checked" />full size, all fields:&nbsp;&nbsp;
+          <input type="radio" name="minified" value="yes" />minified; only fields sku, date, title, description, text_t
+        </dd>
         <dt>Export Format</dt>
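
Note on the mechanics behind these two options: patches 07 and 08 turn the exporter into a chunked dump writer. Every chunk is a complete document of its own (printHead, records, printTail), so each numbered file can be parsed independently, and the minified flag strips every Solr field except sku, last_modified, title, description_txt and text_t before a record is serialized. The sketch below shows only the rotation pattern; ChunkedDumpWriter, writeRecord and the "<result>" placeholder strings are hypothetical stand-ins for illustration, not YaCy code.

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

// Hypothetical sketch of the chunk rotation used by Fulltext.Export:
// each chunk is a self-contained file with its own head and tail, and
// the counter is bumped on rotation (the PATCH 08 fix), not inside the
// method that opens a file.
public class ChunkedDumpWriter {

    private final File path;         // target directory of the dump
    private final String filename;   // common prefix, e.g. a yacy_dump_* name
    private final String fileext;    // e.g. "xml"
    private final long maxChunkSize; // records per chunk; Long.MAX_VALUE = unlimited
    private int chunkCount = 0;      // number of chunks opened so far
    private int chunkSize = 0;       // records in the current chunk
    private PrintWriter pw;

    public ChunkedDumpWriter(final File path, final String filename,
            final String fileext, final long maxChunkSize) throws IOException {
        this.path = path;
        this.filename = filename;
        this.fileext = fileext;
        this.maxChunkSize = maxChunkSize;
        this.pw = openChunk();
    }

    // zero-padded counter, equivalent to the patch's chunkcount() helper
    private File chunkFile() {
        return new File(this.path, this.filename + "_" + String.format("%04d", this.chunkCount) + "." + this.fileext);
    }

    private PrintWriter openChunk() throws IOException {
        final PrintWriter w = new PrintWriter(new FileWriter(chunkFile()));
        w.println("<result>"); // stands in for printHead(pw)
        return w;
    }

    private void closeChunk() {
        this.pw.println("</result>"); // stands in for printTail(pw)
        this.pw.close();
    }

    public void writeRecord(final String record) throws IOException {
        this.pw.println(record);
        this.chunkSize++;
        if (this.chunkSize >= this.maxChunkSize) { // same condition as the patch
            closeChunk();
            this.chunkCount++; // rotate to the next numbered file
            this.chunkSize = 0;
            this.pw = openChunk();
        }
    }

    public void close() {
        closeChunk();
    }
}

The zero-padded counter mirrors the patch's chunkcount() helper, which could equally be written as String.format("%04d", count).
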
diff --git a/source/net/yacy/htroot/IndexExport_p.java b/source/net/yacy/htroot/IndexExport_p.java index aa5fc6f09..667ba5711 100644 --- a/source/net/yacy/htroot/IndexExport_p.java +++ b/source/net/yacy/htroot/IndexExport_p.java @@ -126,13 +126,14 @@ public class IndexExport_p { long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE); if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE; final String path = post.get("exportfilepath", ""); + final boolean minified = post.get("minified", "no").equals("yes"); // store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds); // start the export try { - export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize); + export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize, minified); } catch (final IOException e) { prop.put("lurlexporterror", 1); prop.put("lurlexporterror_exportfile", "-no export-"); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index d8a1754a7..cd9680b27 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -34,8 +34,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -118,7 +120,7 @@ public final class Fulltext { this.writeWebgraph = false; } - public void setUseWebgraph(boolean check) { + public void setUseWebgraph(final boolean check) { this.writeWebgraph = check; } @@ -142,8 +144,8 @@ public final class Fulltext { final File solrLocation = new File(this.segmentPath, SOLR_PATH); // migrate old solr to new - for (String oldVersion: SOLR_OLD_PATH) { - File oldLocation = new File(this.segmentPath, oldVersion); + for (final String oldVersion: SOLR_OLD_PATH) { + final File oldLocation = new File(this.segmentPath, oldVersion); if (oldLocation.exists()) { if (!oldLocation.renameTo(solrLocation)) { ConcurrentLog.severe("Fulltext", "Failed renaming old Solr location (" @@ -183,11 +185,11 @@ public final class Fulltext { return this.solrInstances.getDefaultEmbeddedConnector(); } - public EmbeddedSolrConnector getEmbeddedConnector(String corename) { + public EmbeddedSolrConnector getEmbeddedConnector(final String corename) { return this.solrInstances.getEmbeddedConnector(corename); } - public SolrConnector getConnectorForRead(String corename) { + public SolrConnector getConnectorForRead(final String corename) { if (this.solrInstances.isConnectedRemote()) return this.solrInstances.getRemoteConnector(corename); if (this.solrInstances.isConnectedEmbedded()) return this.solrInstances.getEmbeddedConnector(corename); return null; @@ -315,7 +317,7 @@ public final class Fulltext { } private long lastCommit = 0; - public void commit(boolean softCommit) { + public void commit(final boolean softCommit) { final long t = System.currentTimeMillis(); if (this.lastCommit + 10000 > t) return; this.lastCommit = t; @@ -423,7 +425,7 @@ public final class Fulltext { * @param freshdate either NULL or a date in the past which is the limit for 
deletion. Only documents older than this date are deleted * @throws IOException */ - public void deleteStaleDomainHashes(final Set hosthashes, Date freshdate) { + public void deleteStaleDomainHashes(final Set hosthashes, final Date freshdate) { // delete in solr final Date now = new Date(); deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, @@ -434,7 +436,7 @@ public final class Fulltext { (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); } - public void deleteStaleDomainNames(final Set hostnames, Date freshdate) { + public void deleteStaleDomainNames(final Set hostnames, final Date freshdate) { final Date now = new Date(); deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames, @@ -453,7 +455,7 @@ public final class Fulltext { deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); } - private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set hosthashes, String constraintQuery) { + private static void deleteDomainWithConstraint(final SolrConnector connector, final String fieldname, final Set hosthashes, final String constraintQuery) { if (hosthashes == null || hosthashes.size() == 0) return; final int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception int c = 0; @@ -492,7 +494,7 @@ public final class Fulltext { * @param basepath the left path of the url; at least until the end of the host * @param freshdate either NULL or a date in the past which is the limit for deletion. 
Only documents older than this date are deleted */ - public int remove(final String basepath, Date freshdate) { + public int remove(final String basepath, final Date freshdate) { DigestURL uri; try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;} final String host = uri.getHost(); @@ -690,15 +692,15 @@ public final class Fulltext { public static enum ExportFormat { text("txt"), html("html"), rss("rss"), solr("xml"), elasticsearch("flatjson"); private final String ext; - private ExportFormat(String ext) {this.ext = ext;} + private ExportFormat(final String ext) {this.ext = ext;} public String getExt() {return this.ext;} } public final static String yacy_dump_prefix = "yacy_dump_"; public Export export( - Fulltext.ExportFormat format, String filter, String query, - final int maxseconds, File path, boolean dom, boolean text, - long maxChunkSize) throws IOException { + final Fulltext.ExportFormat format, final String filter, String query, + final int maxseconds, final File path, final boolean dom, final boolean text, + final long maxChunkSize, final boolean minified) throws IOException { // modify query according to maxseconds final long now = System.currentTimeMillis(); @@ -763,13 +765,13 @@ public final class Fulltext { } } - String filename = yacy_dump_prefix + + final String filename = yacy_dump_prefix + "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" + "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" + "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" + "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created') - return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize); + return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize, minified); } // export methods @@ -777,17 +779,17 @@ public final class Fulltext { final File path, final String filename, final String fileext, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text, - long maxChunkSize) { + final long maxChunkSize, final boolean minified) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize); + this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize, minified); this.exportthread.start(); return this.exportthread; } - public static void main(String args[]) { + public static void main(final String args[]) { final Date firstdate = null; System.out.println(GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate)); } @@ -796,6 +798,18 @@ public final class Fulltext { return this.exportthread; } + private final static Set minified_keys = new HashSet<>(); + static { + //minified_keys.add(CollectionSchema.id.getSolrFieldName()); + minified_keys.add(CollectionSchema.sku.getSolrFieldName()); + minified_keys.add(CollectionSchema.title.getSolrFieldName()); + //minified_keys.add(CollectionSchema.author.getSolrFieldName()); + minified_keys.add(CollectionSchema.description_txt.getSolrFieldName()); + //minified_keys.add(CollectionSchema.size_i.getSolrFieldName()); + minified_keys.add(CollectionSchema.last_modified.getSolrFieldName()); + 
minified_keys.add(CollectionSchema.text_t.getSolrFieldName()); + } + public class Export extends Thread { private final File path; private final String filename, fileext; @@ -806,12 +820,13 @@ public final class Fulltext { private final boolean dom, text; private int docCount, chunkSize, chunkCount; private final long maxChunkSize; + private final boolean minified; private Export( final File path, final String filename, final String fileext, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text, - long maxChunkSize) { + final long maxChunkSize, final boolean minified) { super("Fulltext.Export"); // format: 0=text, 1=html, 2=rss/xml this.path = path; @@ -827,10 +842,11 @@ public final class Fulltext { this.chunkSize = 0; // number of documents in the current chunk this.chunkCount = 0; // number of chunks opened so far this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk + this.minified = minified; //if ((dom) && (format == 2)) dom = false; } - private void printHead(PrintWriter pw) { + private void printHead(final PrintWriter pw) { if (this.format == ExportFormat.html) { pw.println(""); } @@ -855,8 +871,8 @@ public final class Fulltext { pw.println(""); } } - - private void printTail(PrintWriter pw) { + + private void printTail(final PrintWriter pw) { if (this.format == ExportFormat.html) { pw.println(""); } @@ -869,7 +885,7 @@ public final class Fulltext { pw.println(""); } } - + @Override public void run() { try { @@ -881,9 +897,9 @@ public final class Fulltext { } try { - docCount = 0; - chunkSize = 0; - chunkCount = 0; + this.docCount = 0; + this.chunkSize = 0; + this.chunkCount = 0; PrintWriter pw = getWriter(); printHead(pw); if (this.dom) { @@ -902,6 +918,12 @@ public final class Fulltext { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { final String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + if (this.minified) { + final Iterator> i = doc.iterator(); + while (i.hasNext()) { + if (!minified_keys.contains(i.next().getKey())) i.remove(); + } + } final CRIgnoreWriter sw = new CRIgnoreWriter(); if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName())); if (this.format == ExportFormat.solr) EnhancedXMLResponseWriter.writeDoc(sw, doc); @@ -914,7 +936,8 @@ public final class Fulltext { if (this.chunkSize >= this.maxChunkSize) { printTail(pw); pw.close(); - pw = getWriter(); // increases chunkCount as side-effect + this.chunkCount++; + pw = getWriter(); printHead(pw); this.chunkSize = 0; } @@ -957,7 +980,8 @@ public final class Fulltext { if (this.chunkSize >= this.maxChunkSize) { printTail(pw); pw.close(); - pw = getWriter(); // increases chunkCount as side-effect + this.chunkCount++; + pw = getWriter(); printHead(pw); this.chunkSize = 0; } @@ -980,14 +1004,13 @@ public final class Fulltext { } private PrintWriter getWriter() throws IOException { - File f = file(); + final File f = file(); final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f); final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? 
new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os)); - this.chunkCount++; return pw; } - private String chunkcount(int count) { + private String chunkcount(final int count) { if (count < 10) return "000" + count; if (count < 100) return "00" + count; if (count < 1000) return "0" + count; From 656b3e3e771159791392b48a2866ace6ce12cf4b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 10:59:49 +0100 Subject: [PATCH 09/11] updated guava to latest and added missing library for failureaccess --- ivy.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ivy.xml b/ivy.xml index 8c072699d..85af1d3f3 100644 --- a/ivy.xml +++ b/ivy.xml @@ -14,7 +14,8 @@ - + + From ceb07a52186a5c8107bbd1fc73683cd9789f004b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 11:12:10 +0100 Subject: [PATCH 10/11] fixed problem with zim importer which crashed when non-valid urls appeared --- .../yacy/document/importer/ZimImporter.java | 109 +++++++++--------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index bc7266e0a..1b4095df6 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -108,58 +108,63 @@ public class ZimImporter extends Thread implements Importer { // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { - if (this.abort) break; - DirectoryEntry de = this.reader.getDirectoryInfo(i); - if (!(de instanceof ZIMReader.ArticleEntry)) continue; - ArticleEntry ae = (ArticleEntry) de; - if (ae.namespace != 'C' && ae.namespace != 'A') continue; - - // check url - DigestURL guessedUrl = guessURL(this.guessedSource, de); - if (recordCnt < 10) { - // critical test for the first 10 urls - if (!guessedUrl.exists(ClientIdentification.browserAgent)) { - sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); - return; - } - } - - // check availability of text parser - String mimeType = ae.getMimeType(); - if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible - if (TextParser.supportsMime(mimeType) != null) continue; - - // read the content - byte[] b = this.reader.getArticleData(ae); - - // create artificial request and response headers for the indexer - RequestHeader requestHeader = new RequestHeader(); - ResponseHeader responseHeader = new ResponseHeader(200); - responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content - responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessd date to have something that is not the current date - final Request request = new Request( - ASCII.getBytes(sb.peers.mySeed().hash), - guessedUrl, - null, // referrerhash the hash of the referrer URL - de.title, // name the name of the document to crawl - null, // appdate the time when the url was first time appeared - sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! 
- 0, // depth the crawling depth of the entry - sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset - ); - final Response response = new Response( - request, - requestHeader, - responseHeader, - Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, - false, - b - ); - - // throw this to the indexer - String error = sb.toIndexer(response); - if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); - this.recordCnt++; + try { + if (this.abort) break; + DirectoryEntry de = this.reader.getDirectoryInfo(i); + if (!(de instanceof ZIMReader.ArticleEntry)) continue; + ArticleEntry ae = (ArticleEntry) de; + if (ae.namespace != 'C' && ae.namespace != 'A') continue; + + // check url + DigestURL guessedUrl = guessURL(this.guessedSource, de); + if (recordCnt < 10) { + // critical test for the first 10 urls + if (!guessedUrl.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); + return; + } + } + + // check availability of text parser + String mimeType = ae.getMimeType(); + if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible + if (TextParser.supportsMime(mimeType) != null) continue; + + // read the content + byte[] b = this.reader.getArticleData(ae); + + // create artificial request and response headers for the indexer + RequestHeader requestHeader = new RequestHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); + responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content + responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessd date to have something that is not the current date + final Request request = new Request( + ASCII.getBytes(sb.peers.mySeed().hash), + guessedUrl, + null, // referrerhash the hash of the referrer URL + de.title, // name the name of the document to crawl + null, // appdate the time when the url was first time appeared + sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! 
+ 0, // depth the crawling depth of the entry + sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset + ); + final Response response = new Response( + request, + requestHeader, + responseHeader, + Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, + false, + b + ); + + // throw this to the indexer + String error = sb.toIndexer(response); + if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); + this.recordCnt++; + } catch (Exception e) { + // catch any error that could stop the importer + ConcurrentLog.info("ZimImporter", "error loading: " + e.getMessage()); + } } } catch (IOException e) { ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage()); From cff0991d850123dd5b9a7062b6df991c50fb26f5 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Nov 2023 16:41:19 +0100 Subject: [PATCH 11/11] test if this is helpful for https://github.com/yacy/yacy_search_server/issues/500 --- source/net/yacy/document/parser/sitemapParser.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index be52f72e7..2dd6ebdeb 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -49,7 +49,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; -import net.yacy.kelondro.io.ByteCountInputStream; import org.w3c.dom.CharacterData; import org.w3c.dom.Element; @@ -116,7 +115,8 @@ public class sitemapParser extends AbstractParser implements Parser { ConcurrentLog.info("SitemapReader", "loading sitemap from " + sitemapURL.toNormalform(true)); // client.setHeader(requestHeader.entrySet()); try (final HTTPClient client = new HTTPClient(agent)) { - client.GET(sitemapURL.toNormalform(false), false); + String url = sitemapURL.toNormalform(false); + client.GET(url, false); if (client.getStatusCode() != 200) { throw new IOException("Unable to download the sitemap file " + sitemapURL + "\nServer returned status: " + client.getHttpResponse().getStatusLine()); @@ -128,11 +128,10 @@ public class sitemapParser extends AbstractParser implements Parser { final String contentMimeType = header.mime(); InputStream contentStream = client.getContentstream(); - if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) { + if ((contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) || url.endsWith(".gz")) { contentStream = new GZIPInputStream(contentStream); } - final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null); - return new SitemapReader(counterStream, agent); + return new SitemapReader(contentStream, agent); } catch (final IOException e) { throw e; }
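
The sitemap change in this last patch reduces to a single decompression guard: the response stream is unwrapped with GZIPInputStream when either the Content-Type header or a ".gz" URL suffix signals gzip, and the ByteCountInputStream wrapper is dropped so the reader consumes the (possibly unwrapped) stream directly. A minimal sketch of that guard, with wrapIfGzip as a hypothetical helper name rather than an actual YaCy method:

import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

// Hypothetical helper isolating the gzip detection added to sitemapParser:
// unwrap when either the MIME type or a ".gz" URL suffix indicates gzip.
public final class SitemapStreams {

    public static InputStream wrapIfGzip(final InputStream in,
            final String contentMimeType, final String url) throws IOException {
        final boolean gzipMime = contentMimeType != null
                && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"));
        if (gzipMime || url.endsWith(".gz")) return new GZIPInputStream(in);
        return in;
    }
}
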