diff --git a/htroot/ConfigParser_p.html b/htroot/ConfigParser_p.html index a51ee1013..66a4665d3 100644 --- a/htroot/ConfigParser_p.html +++ b/htroot/ConfigParser_p.html @@ -51,27 +51,6 @@ -
PDF Parser Attributes -

- This is an experimental setting which makes it possible to split PDF documents into individual index entries. - Every page will become a single index hit and the url is artifically extended with a post/get attribute value containing - the page number as value. When such an url is displayed within a search result, then the post/get attribute is transformed into an anchor hash link. - This makes it possible to view the individual page directly in the pdf.js viewer built-in into firefox, - for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options -

- - - - - - - - - - - - -
Split PDF
Property Name
#%env/templates/footer.template%# diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index 79a0319c0..3b328a996 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -134,7 +134,7 @@ Speed / PPM
(Pages Per Minute) - + @@ -147,7 +147,7 @@ Crawler PPM     - + diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html index 1aa992716..df58837c2 100644 --- a/htroot/IndexExport_p.html +++ b/htroot/IndexExport_p.html @@ -9,11 +9,10 @@ #%env/templates/header.template%# #%env/templates/submenuIndexImport.template%# - - +

Index Export

The local index currently contains #[ucount]# documents; only #[ucount200]# of them are exportable with status code 200 - the remaining are error documents.

- + #(lurlexport)#::
Loaded URL Export @@ -22,31 +21,65 @@
URL Filter
-
+
 .*.* (default) is a catch-all; format: java regex
query
-
+
 *:* (default) is a catch-all; format: field:value (Solr query syntax)
-
maximum age (seconds, -1 = unlimited)
-
+
maximum age (seconds)
+
 -1 = unlimited -> no document is too old +
+
maximum number of records per chunk
+
 if exceeded, the export is split into several chunks; -1 = unlimited (writes a single chunk) +
+
Export Size
+
+ full size, all fields:  + minified; only fields sku, date, title, description, text_t
Export Format
Full Data Records:
-
XML (Rich and full-text Solr data, one document per line in one large xml file, can be processed with shell tools, can be imported with DATA/SURROGATE/in/)
- JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file, can be bulk-imported to elasticsearch with the command "curl -XPOST localhost:9200/collection1/yacy/_bulk --data-binary @yacy_dump_XXX.flatjson")
- XML (RSS)
+
+ JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file, + can be bulk-imported into Elasticsearch or OpenSearch. Here is an example for OpenSearch, using Docker:
+Start an OpenSearch Docker container:
+docker run --name opensearch -p 9200:9200 -d -e OPENSEARCH_JAVA_OPTS="-Xms2G -Xmx2G" -e discovery.type=single-node -e DISABLE_SECURITY_PLUGIN=true -v $(pwd)/opensearch_data:/usr/share/opensearch/data opensearchproject/opensearch:latest
+Unblock index creation:
+curl -X PUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d' +{ + "persistent": { + "cluster.blocks.create_index": null + } +}'
+Create the search index:
+curl -X PUT "http://localhost:9200/collection1/yacy"
+Bulk-upload the index file:
+curl -XPOST "http://localhost:9200/collection1/yacy/_bulk?filter_path=took,errors" -H "Content-Type: application/x-ndjson" --data-binary @yacy_dump_XXX.flatjson
+Run a search that returns 10 results, querying the fields text_t, title, and description with boosts:
+curl -X POST "http://localhost:9200/collection1/yacy/_search" -H 'Content-Type: application/json' -d' +{"size": 10, "query": {"multi_match": { + "query": "one two three", + "fields": ["text_t", "title^10", "description^3"], "fuzziness": "AUTO" +}}}'
+ + XML (Rich and full-text Solr data, one document per line in one large XML file, + can be processed with shell tools, can be imported with DATA/SURROGATES/in/) +
+ + XML (RSS) +
Full URL List:
Plain Text List (URLs only)
HTML (URLs with title)
Only Domain:
Plain Text List (domains only)
HTML (domains as URLs, no title)
-
Only Text:
+
Only Text:
Fulltext of Search Index Text
-
-
+ +
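The "maximum number of records per chunk" option above maps to the maxChunkSize parameter that this patch threads through IndexExport_p.java into Fulltext.Export (see the Fulltext.java hunks further down). The following is only a minimal sketch of the rollover pattern the exporter uses, not the actual implementation; ChunkedExportSketch and its openChunk helper are invented names for illustration.

import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.List;

public class ChunkedExportSketch {

    /** Write records into numbered chunk files, starting a new file whenever
     *  the per-chunk counter reaches maxChunkSize. */
    public static void export(final List<String> records, final File path,
                              final String filename, final String fileext,
                              final long maxChunkSize) throws FileNotFoundException {
        int chunkCount = 0; // chunks opened so far
        int chunkSize = 0;  // records written into the current chunk
        PrintWriter pw = openChunk(path, filename, fileext, chunkCount);
        for (final String record : records) {
            pw.println(record);
            chunkSize++;
            if (chunkSize >= maxChunkSize) {
                pw.close();                                          // finish the current chunk
                chunkCount++;
                pw = openChunk(path, filename, fileext, chunkCount); // open the next numbered file
                chunkSize = 0;
            }
        }
        pw.close();
    }

    // hypothetical helper: one file per chunk, numbered with a zero-padded suffix
    private static PrintWriter openChunk(final File path, final String filename,
            final String fileext, final int chunkCount) throws FileNotFoundException {
        return new PrintWriter(new File(path, String.format("%s_%04d.%s", filename, chunkCount, fileext)));
    }
}

In the servlet a submitted chunk size of -1 (or any value <= 0) is mapped to Long.MAX_VALUE, so the loop above never rolls over and only one chunk is written. For the flatjson format chosen above, each exported document is preceded by an {"index":{}} action line, which is what the _bulk endpoint in the curl example expects.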
 
@@ -55,16 +88,16 @@ ::
Export to file #[exportfile]# is running .. #[urlcount]# Documents so far
:: #(/lurlexport)# - - #(lurlexportfinished)#:: + + #(lurlexportfinished)#::
Finished export of #[urlcount]# Documents to file #[exportfile]#
Import this file by moving it to DATA/SURROGATES/in
:: #(/lurlexportfinished)# - + #(lurlexporterror)#::
Export to file #[exportfile]# failed: #[exportfailmsg]#
:: #(/lurlexporterror)# - + #(dumprestore)#::
Dump and Restore of Solr Index diff --git a/ivy.xml b/ivy.xml index 61f9ee127..85af1d3f3 100644 --- a/ivy.xml +++ b/ivy.xml @@ -14,7 +14,8 @@ - + + @@ -28,6 +29,7 @@ + diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index e9a46fc91..768ca0aa6 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -2578,6 +2578,36 @@ public class MultiProtocolURL implements Serializable, Comparable 0; + } + if (isHTTP() || isHTTPS()) { + final HTTPClient client = new HTTPClient(agent); + client.setHost(getHost()); + org.apache.http.HttpResponse response = client.HEADResponse(this, true); + client.close(); + if (response == null) return false; + int status = response.getStatusLine().getStatusCode(); + return status == 200 || status == 301 || status == 302; + } + return false; + } catch (IOException e) { + if (e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts + //e.printStackTrace(); + return false; + } + } + /** * Read fully the source, close it and return its content as a bytes array. * @param source the source to read diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java index a96a79b18..1b4095df6 100644 --- a/source/net/yacy/document/importer/ZimImporter.java +++ b/source/net/yacy/document/importer/ZimImporter.java @@ -25,11 +25,22 @@ package net.yacy.document.importer; import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; +import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.TreeMap; +import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ConcurrentLog; @@ -80,44 +91,80 @@ public class ZimImporter extends Thread implements Importer { public void run() { job = this; this.startTime = System.currentTimeMillis(); + Switchboard sb = Switchboard.getSwitchboard(); try { this.reader = new ZIMReader(this.file); this.guessedSource = getSource(this.reader); + Date guessedDate = getDate(this.reader); + String dates = HeaderFramework.newRfc1123Format().format(guessedDate); + + // verify the source + DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); + DigestURL mainURL = guessURL(this.guessedSource, mainEntry); + if (!mainURL.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL); + return; + } + // read all documents for (int i = 0; i < this.file.header_entryCount; i++) { - if (this.abort) break; - DirectoryEntry de = this.reader.getDirectoryInfo(i); - if (!(de instanceof ZIMReader.ArticleEntry)) continue; - ArticleEntry ae = (ArticleEntry) de; - - // check url - String guessedUrl = guessURL(this.guessedSource, de); - assert guessedUrl.startsWith("http"); - - // check availability of text parser - String mimeType = ae.getMimeType(); - if (TextParser.supportsMime(mimeType) != null) continue; - - // read the content - byte[] b = 
this.reader.getArticleData(ae); - - // create artificial request and response headers for the indexer - RequestHeader requestHeader = new RequestHeader(); - ResponseHeader responseHeader = new ResponseHeader(200); - final Request request = new Request(new DigestURL(guessedUrl), null); - final Response response = new Response( - request, - requestHeader, - responseHeader, - Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, - false, - b - ); - - // throw this to the indexer - String error = Switchboard.getSwitchboard().toIndexer(response); - if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); - this.recordCnt++; + try { + if (this.abort) break; + DirectoryEntry de = this.reader.getDirectoryInfo(i); + if (!(de instanceof ZIMReader.ArticleEntry)) continue; + ArticleEntry ae = (ArticleEntry) de; + if (ae.namespace != 'C' && ae.namespace != 'A') continue; + + // check url + DigestURL guessedUrl = guessURL(this.guessedSource, de); + if (recordCnt < 10) { + // critical test for the first 10 urls + if (!guessedUrl.exists(ClientIdentification.browserAgent)) { + sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl); + return; + } + } + + // check availability of text parser + String mimeType = ae.getMimeType(); + if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible + if (TextParser.supportsMime(mimeType) != null) continue; + + // read the content + byte[] b = this.reader.getArticleData(ae); + + // create artificial request and response headers for the indexer + RequestHeader requestHeader = new RequestHeader(); + ResponseHeader responseHeader = new ResponseHeader(200); + responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content + responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessd date to have something that is not the current date + final Request request = new Request( + ASCII.getBytes(sb.peers.mySeed().hash), + guessedUrl, + null, // referrerhash the hash of the referrer URL + de.title, // name the name of the document to crawl + null, // appdate the time when the url was first time appeared + sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null! 
+ 0, // depth the crawling depth of the entry + sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset + ); + final Response response = new Response( + request, + requestHeader, + responseHeader, + Switchboard.getSwitchboard().crawler.defaultSurrogateProfile, + false, + b + ); + + // throw this to the indexer + String error = sb.toIndexer(response); + if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); + this.recordCnt++; + } catch (Exception e) { + // catch any error that could stop the importer + ConcurrentLog.info("ZimImporter", "error loading: " + e.getMessage()); + } } } catch (IOException e) { ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage()); @@ -195,8 +242,6 @@ public class ZimImporter extends Thread implements Importer { return "fas.org"; case "fonts": return "fonts.google.com"; - case "gutenberg": - return "gutenberg.org"; case "ifixit": return "ifixit.com"; case "lesfondamentaux": @@ -216,11 +261,23 @@ public class ZimImporter extends Thread implements Importer { case "rapsberry_pi_docs": return "raspberrypi.org"; case "ted": - return "ted.com"; + return "www.ted.com/search?q="; case "vikidia": - return "vikidia.org"; + return parts[1] + ".vikidia.org/wiki"; case "westeros": return "westeros.org"; + case "wikihow": + return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com"; + case "wikisource": + return parts[1] + ".wikisource.org/wiki"; + case "wikiversity": + return parts[1] + ".wikiversity.org/wiki"; + case "wikivoyage": + return parts[1] + ".wikivoyage.org/wiki"; + case "wiktionary": + return parts[1] + ".wiktionary.org/wiki"; + case "wikiquote": + return parts[1] + ".wikiquote.org/wiki"; case "wikibooks": return parts[1] + ".wikibooks.org/wiki"; case "wikinews": @@ -266,16 +323,174 @@ public class ZimImporter extends Thread implements Importer { return source; } - public static String guessURL(String guessedSource, DirectoryEntry de) { + public static Date getDate(ZIMReader r) throws IOException { + String date = r.getMetadata("Date"); + if (date != null) try { + SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.US); + return format.parse(date); + } catch (ParseException e) {} + // failover situation: use file date + return new Date(r.getZIMFile().lastModified()); + } + + public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException { String url = de.url; if (url.equals("Main_Page")) url = ""; - if (guessedSource != null) return guessedSource + url; - if (url.startsWith("A/")) return "https://" + url.substring(2); - if (url.startsWith("H/")) return "https://" + url.substring(2); - return guessedSource + url; + if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2)); + if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2)); + if (guessedSource != null) return new DigestURL(guessedSource + url); + return new DigestURL(guessedSource + url); } + private final static String[] skip_files = { + "iota.stackexchange.com_en_all_2023-05.zim", + "stellar.stackexchange.com_en_all_2023-10.zim", + "vegetarianism.stackexchange.com_en_all_2023-05.zim", + "esperanto.stackexchange.com_eo_all_2023-10.zim", + "tezos.stackexchange.com_en_all_2023-10.zim", + "eosio.stackexchange.com_en_all_2023-10.zim", + "ebooks.stackexchange.com_en_all_2023-10.zim", + "poker.stackexchange.com_en_all_2023-05.zim", + "cseducators.stackexchange.com_en_all_2023-10.zim", + "iot.stackexchange.com_en_all_2023-05.zim", + 
"portuguese.stackexchange.com_pt_all_2023-04.zim", + "portuguese.stackexchange.com_pt_all_2023-10.zim", + "italian.stackexchange.com_it_all_2023-05.zim", + "monero.stackexchange.com_en_all_2022-11.zim", + "sustainability.stackexchange.com_en_all_2023-05.zim", + "westeros_en_all_nopic_2021-03.zim", + "opensource.stackexchange.com_en_all_2023-10.zim", + "tor.stackexchange.com_en_all_2023-05.zim", + "devops.stackexchange.com_en_all_2023-10.zim", + "patents.stackexchange.com_en_all_2023-10.zim", + "stackapps.com_en_all_2023-05.zim", + "hardwarerecs.stackexchange.com_en_all_2023-05.zim", + "hsm.stackexchange.com_en_all_2023-05.zim", + "expatriates.stackexchange.com_en_all_2023-11.zim", + "opendata.stackexchange.com_en_all_2023-10.zim", + "sports.stackexchange.com_en_all_2023-05.zim", + "wikinews_de_all_nopic_2023-10.zim", + "computergraphics.stackexchange.com_en_all_2023-10.zim", + "tridion.stackexchange.com_en_all_2023-10.zim", + "bioinformatics.stackexchange.com_en_all_2023-10.zim", + "expressionengine.stackexchange.com_en_all_2023-11.zim", + "elementaryos.stackexchange.com_en_all_2023-10.zim", + "cstheory.stackexchange.com_en_all_2023-10.zim", + "chess.stackexchange.com_en_all_2023-05.zim", + "vi.stackexchange.com_en_all_2023-05.zim", + "fitness.stackexchange.com_en_all_2023-10.zim", + "pets.stackexchange.com_en_all_2023-05.zim", + "french.stackexchange.com_fr_all_2023-10.zim", + "sqa.stackexchange.com_en_all_2023-05.zim", + "islam.stackexchange.com_en_all_2023-05.zim", + "scicomp.stackexchange.com_en_all_2023-05.zim", + "wikinews_en_all_nopic_2023-09.zim", + "ai.stackexchange.com_en_all_2023-10.zim", + "boardgames.stackexchange.com_en_all_2023-05.zim", + "economics.stackexchange.com_en_all_2023-05.zim", + "3dprinting.stackexchange.com_en_all_2023-07.zim", + "earthscience.stackexchange.com_en_all_2023-05.zim", + "emacs.stackexchange.com_en_all_2023-10.zim", + "bitcoin.stackexchange.com_en_all_2023-05.zim", + "philosophy.stackexchange.com_en_all_2023-05.zim", + "law.stackexchange.com_en_all_2023-05.zim", + "astronomy.stackexchange.com_en_all_2023-05.zim", + "artofproblemsolving_en_all_nopic_2021-03.zim", + "engineering.stackexchange.com_en_all_2023-05.zim", + "ja.stackoverflow.com_ja_all_2023-06.zim", + "webmasters.stackexchange.com_en_all_2023-05.zim", + "anime.stackexchange.com_en_all_2023-10.zim", + "cooking.stackexchange.com_en_all_2023-05.zim", + "arduino.stackexchange.com_en_all_2023-05.zim", + "money.stackexchange.com_en_all_2023-05.zim", + "judaism.stackexchange.com_en_all_2023-05.zim", + "ethereum.stackexchange.com_en_all_2023-05.zim", + "datascience.stackexchange.com_en_all_2023-10.zim", + "academia.stackexchange.com_en_all_2023-10.zim", + "music.stackexchange.com_en_all_2023-05.zim", + "cs.stackexchange.com_en_all_2023-03.zim", + "dsp.stackexchange.com_en_all_2023-05.zim", + "biology.stackexchange.com_en_all_2023-05.zim", + "android.stackexchange.com_en_all_2023-10.zim", + "bicycles.stackexchange.com_en_all_2023-05.zim", + "puzzling.stackexchange.com_en_all_2023-05.zim", + "photo.stackexchange.com_en_all_2023-05.zim", + "aviation.stackexchange.com_en_all_2023-05.zim", + "drupal.stackexchange.com_en_all_2023-05.zim", + "ux.stackexchange.com_en_all_2023-05.zim", + "ell.stackexchange.com_en_all_2023-10.zim", + "openstreetmap-wiki_en_all_nopic_2023-05.zim", + "softwareengineering.stackexchange.com_en_all_2023-05.zim", + "gaming.stackexchange.com_en_all_2023-10.zim", + "mathematica.stackexchange.com_en_all_2023-10.zim", + "pt.stackoverflow.com_pt_all_2023-06.zim", + 
"apple.stackexchange.com_en_all_2023-05.zim", + "diy.stackexchange.com_en_all_2023-08.zim", + "es.stackoverflow.com_es_all_2023-06.zim", + "gis.stackexchange.com_en_all_2023-05.zim", + "stats.stackexchange.com_en_all_2023-05.zim", + "physics.stackexchange.com_en_all_2023-05.zim", + "serverfault.com_en_all_2023-05.zim", + "electronics.stackexchange.com_en_all_2023-05.zim", + "tex.stackexchange.com_en_all_2023-05.zim", + "wikibooks_de_all_nopic_2021-03.zim", + "askubuntu.com_en_all_2023-05.zim", + "superuser.com_en_all_2023-05.zim", + "lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim", + "wikibooks_en_all_nopic_2021-03.zim", + "courses.lumenlearning.com_en_all_2021-03.zim", + "wikipedia_de_all_nopic_2023-10.zim", + "wikipedia_en_all_nopic_2023-10.zim", + "stackoverflow.com_en_all_nopic_2022-07.zim", + "stackoverflow.com_en_all_2023-05.zim", + "armypubs_en_all_2023-08.zim", + "vikidia_en_all_nopic_2023-09.zim", + "wikiquote_de_all_nopic_2023-10.zim", + "wikiquote_en_all_nopic_2023-09.zim", + "wiktionary_de_all_nopic_2023-10.zim", + "wiktionary_en_all_nopic_2023-10.zim", + "wikihow_de_maxi_2023-10.zim", + "wikivoyage_de_all_nopic_2023-09.zim", + "wikiversity_de_all_nopic_2021-03.zim", + "wikiversity_en_all_nopic_2021-03.zim", + "wikisource_de_all_nopic_2023-09.zim", + "wikisource_en_all_nopic_2023-08.zim", + "ted_countdown_global_2023-09.zim", + "ted_en_design_2023-09.zim", + "ted_en_business_2023-09.zim", + "ted_en_global_issues_2023-09.zim", + "opentextbooks_en_all_2023-08.zim", + "bestedlessons.org_en_all_2023-08.zim", + "wikivoyage_en_all_nopic_2023-10.zim", + "based.cooking_en_all_2023-10.zim", + "wordnet_en_all_2023-04.zim", + "internet-encyclopedia-philosophy_en_all_2023-08.zim", + "100r-off-the-grid_en_2023-09.zim", + "coopmaths_2023-04.zim", + "birds-of-ladakh_en_all_2023-02.zim", + "storyweaver.org_en_2023-09.zim", + "developer.mozilla.org_en_all_2023-02.zim", + "www.ready.gov_es_2023-06.zim", + "teoria.com_en_2023-08.zim", + "theworldfactbook_en_all_2023-06.zim", + "mutopiaproject.org_en_2023-08.zim", + "dp.la_en_all_2023-08.zim", + + // 302 + "moderators.stackexchange.com_en_all_2023-05.zim", + "beer.stackexchange.com_en_all_2023-05.zim", + "health.stackexchange.com_en_all_2023-05.zim", + "avp.stackexchange.com_en_all_2023-05.zim", + "lowtechmagazine.com_en_all_2023-08.zim", + "ifixit_de_all_2023-07.zim", + "ifixit_en_all_2023-10.zim", + "der-postillon.com_de_all_2020-12.zim", + "wikihow_en_maxi_2023-03.zim", + }; + public static void main(String[] args) { + Set skip = new HashSet<>(); + for (String s: skip_files) skip.add(s); // zim file import test // will test mostly if domain names are included in zim file urls String zimFilesPath = args[0]; @@ -291,7 +506,10 @@ public class ZimImporter extends Thread implements Importer { } Collection orderedFiles = orderedFileMap.values(); + Set files_ok = new LinkedHashSet<>(); + Set files_nok = new LinkedHashSet<>(); for (File f: orderedFiles) { + if (skip.contains(f.getName())) continue; try { ZIMFile z = new ZIMFile(f.getAbsolutePath()); ZIMReader r = new ZIMReader(z); @@ -301,14 +519,21 @@ public class ZimImporter extends Thread implements Importer { System.out.println("Namespace: " + de.namespace); System.out.println("Title: " + de.title); System.out.println("URL: " + de.url); - System.out.println("guessed domain: " + guessDomainName(f.getName())); + System.out.println("Mime Type " + de.getMimeType()); + System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the 
file name String source = getSource(r); - System.out.println("guessed Source: " + source); - System.out.println("guessed main article: " + guessURL(source, de)); + System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file + DigestURL mainURL = guessURL(source, de); + System.out.println("guessed main article: " + mainURL); + boolean ok = mainURL.exists(ClientIdentification.browserAgent); + System.out.println("main article exists: " + ok); + if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName()); System.out.println(); } catch (IOException e) { e.printStackTrace(); } } + System.out.println("ok files: " + files_ok.toString()); + System.out.println("not-ok files: " + files_nok.toString()); } } diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 0ad6b2248..f02577244 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.text.PDFTextStripper; -import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; @@ -69,9 +68,6 @@ import net.yacy.kelondro.util.MemoryControl; public class pdfParser extends AbstractParser implements Parser { - public static boolean individualPages = false; - public static String individualPagePropertyname = "page"; - public pdfParser() { super("Acrobat Portable Document Parser"); this.SUPPORTED_EXTENSIONS.add("pdf"); @@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser { // get the links final List> pdflinks = extractPdfLinks(pdfDoc); - // get the fulltext (either per document or for each page) - final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/); - - if (individualPages) { - // this is a hack which stores individual pages of the source pdf into individual index documents - // the new documents will get a virtual link with a post argument page=X appended to the original url - - // collect text - final int pagecount = pdfDoc.getNumberOfPages(); - final String[] pages = new String[pagecount]; - for (int page = 1; page <= pagecount; page++) { - stripper.setStartPage(page); - stripper.setEndPage(page); - pages[page - 1] = stripper.getText(pdfDoc); - //System.out.println("PAGE " + page + ": " + pages[page - 1]); - } - - // create individual documents for each page - assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); - result = new Document[Math.min(pages.length, pdflinks.size())]; - final String loc = location.toNormalform(true); - for (int page = 0; page < result.length; page++) { - result[page] = new Document( - new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), - pdflinks == null || page >= pdflinks.size() ? 
null : pdflinks.get(page), - null, - null, - false, - docDate); - } - } else { - // collect the whole text at once - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; - stripper.setEndPage(3); // get first 3 pages (always) - writer.append(stripper.getText(pdfDoc)); - contentBytes = writer.getBytes(); // remember text in case of interrupting thread - - if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read - stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) - stripper.setEndPage(Integer.MAX_VALUE); // set to default - // we start the pdf parsing in a separate thread to ensure that it can be terminated - final PDDocument pdfDocC = pdfDoc; - final Thread t = new Thread("pdfParser.getText:" + location) { - @Override - public void run() { - try { - writer.append(stripper.getText(pdfDocC)); - } catch (final Throwable e) {} - } - }; - t.start(); - t.join(3000); // pdfbox likes to forget to terminate ... (quite often) - if (t.isAlive()) t.interrupt(); - contentBytes = writer.getBytes(); // get final text before closing writer - writer.close(); // free writer resources - } - - final Collection pdflinksCombined = new HashSet<>(); - for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); - result = new Document[]{new Document( - location, - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - contentBytes, - pdflinksCombined, - null, - null, - false, - docDate)}; - } + // collect the whole text at once + final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); + byte[] contentBytes = new byte[0]; + final PDFTextStripper stripper = new PDFTextStripper(); + stripper.setEndPage(Integer.MAX_VALUE); + writer.append(stripper.getText(pdfDoc)); + contentBytes = writer.getBytes(); // remember text in case of interrupting thread + writer.close(); // free writer resources + + final Collection pdflinksCombined = new HashSet<>(); + for (final Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); + result = new Document[]{new Document( + location, + mimeType, + StandardCharsets.UTF_8.name(), + this, + null, + docKeywords, + singleList(docTitle), + docAuthor, + docPublisher, + null, + null, + 0.0d, 0.0d, + contentBytes, + pdflinksCombined, + null, + null, + false, + docDate)}; } catch (final Throwable e) { //throw new Parser.Failure(e.getMessage(), location); } finally { diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index be52f72e7..2dd6ebdeb 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -49,7 +49,6 @@ import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; -import net.yacy.kelondro.io.ByteCountInputStream; import org.w3c.dom.CharacterData; import org.w3c.dom.Element; @@ -116,7 +115,8 @@ public class sitemapParser extends AbstractParser implements Parser { ConcurrentLog.info("SitemapReader", "loading sitemap from " + sitemapURL.toNormalform(true)); // client.setHeader(requestHeader.entrySet()); try (final HTTPClient client = new HTTPClient(agent)) { - client.GET(sitemapURL.toNormalform(false), false); + String url = sitemapURL.toNormalform(false); + client.GET(url, 
false); if (client.getStatusCode() != 200) { throw new IOException("Unable to download the sitemap file " + sitemapURL + "\nServer returned status: " + client.getHttpResponse().getStatusLine()); @@ -128,11 +128,10 @@ public class sitemapParser extends AbstractParser implements Parser { final String contentMimeType = header.mime(); InputStream contentStream = client.getContentstream(); - if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) { + if ((contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) || url.endsWith(".gz")) { contentStream = new GZIPInputStream(contentStream); } - final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null); - return new SitemapReader(counterStream, agent); + return new SitemapReader(contentStream, agent); } catch (final IOException e) { throw e; } diff --git a/source/net/yacy/htroot/ConfigParser_p.java b/source/net/yacy/htroot/ConfigParser_p.java index e466d783b..943279382 100644 --- a/source/net/yacy/htroot/ConfigParser_p.java +++ b/source/net/yacy/htroot/ConfigParser_p.java @@ -61,13 +61,6 @@ public class ConfigParser_p { env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime()); env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension()); } - - if (post.containsKey("pdfSettings")) { - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages")); - env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page")); - pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false); - pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"); - } } int i = 0; @@ -94,9 +87,6 @@ public class ConfigParser_p { prop.put("parser", i); - prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false)); - prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page")); - // return rewrite properties return prop; } diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java index e95562713..8c898f558 100644 --- a/source/net/yacy/htroot/Crawler_p.java +++ b/source/net/yacy/htroot/Crawler_p.java @@ -774,7 +774,7 @@ public class Crawler_p { } /* - * PPM + * PPM LF MH @@ -784,19 +784,19 @@ public class Crawler_p { if (post != null && post.containsKey("crawlingPerformance")) { final String crawlingPerformance = post.get("crawlingPerformance", "custom"); final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); - int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1); + int wantedPPM = (LCbusySleep1 == 0) ? 
60000 : (int) (60000L / LCbusySleep1); try { wantedPPM = post.getInt("customPPM", wantedPPM); } catch (final NumberFormatException e) {} if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10; - if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000; + if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000; int wPPM = wantedPPM; if ( wPPM <= 0 ) { wPPM = 1; } - if ( wPPM >= 30000 ) { - wPPM = 30000; + if ( wPPM >= 60000 ) { + wPPM = 60000; } final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 diff --git a/source/net/yacy/htroot/IndexExport_p.java b/source/net/yacy/htroot/IndexExport_p.java index 78cc94132..667ba5711 100644 --- a/source/net/yacy/htroot/IndexExport_p.java +++ b/source/net/yacy/htroot/IndexExport_p.java @@ -64,8 +64,8 @@ public class IndexExport_p { prop.put("lurlexport", 0); prop.put("reload", 0); prop.put("dumprestore", 1); - prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, - SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); + prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, + SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); List dumpFiles = segment.fulltext().dumpFiles(); prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); prop.put("dumprestore_optimizemax", 10); @@ -80,7 +80,7 @@ public class IndexExport_p { prop.put("lurlexportfinished", 0); prop.put("lurlexporterror", 0); prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + prop.put("lurlexport_urlcount", export.docCount()); prop.put("reload", 1); } else { prop.put("lurlexport", 1); @@ -93,7 +93,7 @@ public class IndexExport_p { // an export was running but has finished prop.put("lurlexportfinished", 1); prop.put("lurlexportfinished_exportfile", export.file().toString()); - prop.put("lurlexportfinished_urlcount", export.count()); + prop.put("lurlexportfinished_urlcount", export.docCount()); if (export.failed() == null) { prop.put("lurlexporterror", 0); } else { @@ -123,14 +123,17 @@ public class IndexExport_p { final String filter = post.get("exportfilter", ".*"); final String query = post.get("exportquery", "*:*"); final int maxseconds = post.getInt("exportmaxseconds", -1); + long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE); + if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE; final String path = post.get("exportfilepath", ""); + final boolean minified = post.get("minified", "no").equals("yes"); // store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds); // start the export try { - export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text); + export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize, minified); } catch (final IOException e) { prop.put("lurlexporterror", 1); prop.put("lurlexporterror_exportfile", "-no export-"); @@ -140,7 +143,7 @@ public class IndexExport_p { // show result prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + 
prop.put("lurlexport_urlcount", export.docCount()); if ((export != null) && (export.failed() == null)) { prop.put("lurlexport", 2); } @@ -148,34 +151,34 @@ public class IndexExport_p { } if (post.containsKey("indexdump")) { - try { - final File dump = segment.fulltext().dumpEmbeddedSolr(); - prop.put("indexdump", 1); - prop.put("indexdump_dumpfile", dump.getAbsolutePath()); - dumpFiles = segment.fulltext().dumpFiles(); - prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); - // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexdump", 2); - } else { - prop.put("indexdump", 3); - } - } + try { + final File dump = segment.fulltext().dumpEmbeddedSolr(); + prop.put("indexdump", 1); + prop.put("indexdump_dumpfile", dump.getAbsolutePath()); + dumpFiles = segment.fulltext().dumpFiles(); + prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); + // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexdump", 2); + } else { + prop.put("indexdump", 3); + } + } } if (post.containsKey("indexrestore")) { - try { - final File dump = new File(post.get("dumpfile", "")); - segment.fulltext().restoreEmbeddedSolr(dump); - prop.put("indexRestore", 1); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexRestore", 2); - } else { - prop.put("indexRestore", 3); - } - } + try { + final File dump = new File(post.get("dumpfile", "")); + segment.fulltext().restoreEmbeddedSolr(dump); + prop.put("indexRestore", 1); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexRestore", 2); + } else { + prop.put("indexRestore", 3); + } + } } // insert constants diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 33b797524..2d93ec8b7 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparablepublic static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

*

Name of the setting that defines how long indexing should pause after the last time the proxy was used, in milliseconds
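The constant and javadoc above describe the "onlineCautionDelay" setting. Purely to illustrate the rule it states - indexing pauses while the proxy was used less than the configured number of milliseconds ago - here is a tiny self-contained sketch; it is not part of this patch and the class and method names are invented.

public final class OnlineCautionSketch {

    /**
     * @param proxyLastAccess epoch milliseconds of the last proxy access
     * @param onlineCautionDelayMillis configured value of onlineCautionDelay
     * @return true if indexing should still pause
     */
    public static boolean indexingShouldPause(final long proxyLastAccess, final long onlineCautionDelayMillis) {
        return System.currentTimeMillis() - proxyLastAccess < onlineCautionDelayMillis;
    }
}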

diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 718be0099..cd9680b27 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -34,8 +34,10 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -118,7 +120,7 @@ public final class Fulltext { this.writeWebgraph = false; } - public void setUseWebgraph(boolean check) { + public void setUseWebgraph(final boolean check) { this.writeWebgraph = check; } @@ -142,8 +144,8 @@ public final class Fulltext { final File solrLocation = new File(this.segmentPath, SOLR_PATH); // migrate old solr to new - for (String oldVersion: SOLR_OLD_PATH) { - File oldLocation = new File(this.segmentPath, oldVersion); + for (final String oldVersion: SOLR_OLD_PATH) { + final File oldLocation = new File(this.segmentPath, oldVersion); if (oldLocation.exists()) { if (!oldLocation.renameTo(solrLocation)) { ConcurrentLog.severe("Fulltext", "Failed renaming old Solr location (" @@ -183,11 +185,11 @@ public final class Fulltext { return this.solrInstances.getDefaultEmbeddedConnector(); } - public EmbeddedSolrConnector getEmbeddedConnector(String corename) { + public EmbeddedSolrConnector getEmbeddedConnector(final String corename) { return this.solrInstances.getEmbeddedConnector(corename); } - public SolrConnector getConnectorForRead(String corename) { + public SolrConnector getConnectorForRead(final String corename) { if (this.solrInstances.isConnectedRemote()) return this.solrInstances.getRemoteConnector(corename); if (this.solrInstances.isConnectedEmbedded()) return this.solrInstances.getEmbeddedConnector(corename); return null; @@ -315,7 +317,7 @@ public final class Fulltext { } private long lastCommit = 0; - public void commit(boolean softCommit) { + public void commit(final boolean softCommit) { final long t = System.currentTimeMillis(); if (this.lastCommit + 10000 > t) return; this.lastCommit = t; @@ -423,7 +425,7 @@ public final class Fulltext { * @param freshdate either NULL or a date in the past which is the limit for deletion. 
Only documents older than this date are deleted * @throws IOException */ - public void deleteStaleDomainHashes(final Set hosthashes, Date freshdate) { + public void deleteStaleDomainHashes(final Set hosthashes, final Date freshdate) { // delete in solr final Date now = new Date(); deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, @@ -434,7 +436,7 @@ public final class Fulltext { (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); } - public void deleteStaleDomainNames(final Set hostnames, Date freshdate) { + public void deleteStaleDomainNames(final Set hostnames, final Date freshdate) { final Date now = new Date(); deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames, @@ -453,7 +455,7 @@ public final class Fulltext { deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); } - private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set hosthashes, String constraintQuery) { + private static void deleteDomainWithConstraint(final SolrConnector connector, final String fieldname, final Set hosthashes, final String constraintQuery) { if (hosthashes == null || hosthashes.size() == 0) return; final int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception int c = 0; @@ -492,7 +494,7 @@ public final class Fulltext { * @param basepath the left path of the url; at least until the end of the host * @param freshdate either NULL or a date in the past which is the limit for deletion. 
Only documents older than this date are deleted */ - public int remove(final String basepath, Date freshdate) { + public int remove(final String basepath, final Date freshdate) { DigestURL uri; try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;} final String host = uri.getHost(); @@ -690,12 +692,15 @@ public final class Fulltext { public static enum ExportFormat { text("txt"), html("html"), rss("rss"), solr("xml"), elasticsearch("flatjson"); private final String ext; - private ExportFormat(String ext) {this.ext = ext;} + private ExportFormat(final String ext) {this.ext = ext;} public String getExt() {return this.ext;} } public final static String yacy_dump_prefix = "yacy_dump_"; - public Export export(Fulltext.ExportFormat format, String filter, String query, final int maxseconds, File path, boolean dom, boolean text) throws IOException { + public Export export( + final Fulltext.ExportFormat format, final String filter, String query, + final int maxseconds, final File path, final boolean dom, final boolean text, + final long maxChunkSize, final boolean minified) throws IOException { // modify query according to maxseconds final long now = System.currentTimeMillis(); @@ -760,32 +765,31 @@ public final class Fulltext { } } - String s = new File(path, yacy_dump_prefix + + final String filename = yacy_dump_prefix + "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" + "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" + "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" + - "c" + String.format("%1$012d", doccount)).getAbsolutePath() + "_tc"; // the name ends with the transaction token ('c' = 'created') + "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created') - // create export file name - if (s.indexOf('.',0) < 0) s += "." 
+ format.getExt(); - final File f = new File(s); - f.getParentFile().mkdirs(); - - return export(f, filter, query, format, dom, text); + return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize, minified); } // export methods - public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + public Export export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + final long maxChunkSize, final boolean minified) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(f, filter, query, format, dom, text); + this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize, minified); this.exportthread.start(); return this.exportthread; } - public static void main(String args[]) { + public static void main(final String args[]) { final Date firstdate = null; System.out.println(GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate)); } @@ -794,70 +798,110 @@ public final class Fulltext { return this.exportthread; } + private final static Set minified_keys = new HashSet<>(); + static { + //minified_keys.add(CollectionSchema.id.getSolrFieldName()); + minified_keys.add(CollectionSchema.sku.getSolrFieldName()); + minified_keys.add(CollectionSchema.title.getSolrFieldName()); + //minified_keys.add(CollectionSchema.author.getSolrFieldName()); + minified_keys.add(CollectionSchema.description_txt.getSolrFieldName()); + //minified_keys.add(CollectionSchema.size_i.getSolrFieldName()); + minified_keys.add(CollectionSchema.last_modified.getSolrFieldName()); + minified_keys.add(CollectionSchema.text_t.getSolrFieldName()); + } + public class Export extends Thread { - private final File f; + private final File path; + private final String filename, fileext; private final Pattern pattern; - private int count; private String failure; private final String query; private final ExportFormat format; private final boolean dom, text; - - private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + private int docCount, chunkSize, chunkCount; + private final long maxChunkSize; + private final boolean minified; + + private Export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + final long maxChunkSize, final boolean minified) { super("Fulltext.Export"); // format: 0=text, 1=html, 2=rss/xml - this.f = f; + this.path = path; + this.filename = filename; + this.fileext = fileext; this.pattern = filter == null ? null : Pattern.compile(filter); this.query = query == null? 
AbstractSolrConnector.CATCHALL_QUERY : query; - this.count = 0; this.failure = null; this.format = format; this.dom = dom; this.text = text; + this.docCount = 0; // number of all documents exported so far + this.chunkSize = 0; // number of documents in the current chunk + this.chunkCount = 0; // number of chunks opened so far + this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk + this.minified = minified; //if ((dom) && (format == 2)) dom = false; } + private void printHead(final PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(""); + pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); + pw.println(""); + pw.println("http://yacy.net"); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(" "); + pw.println(" "); + pw.println(" " + this.query + ""); + pw.println(" "); + pw.println(""); + pw.println(""); + } + } + + private void printTail(final PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + } + } + @Override public void run() { try { - final File parentf = this.f.getParentFile(); - if (parentf != null) { - parentf.mkdirs(); - } + if (this.path != null) this.path.mkdirs(); } catch(final Exception e) { ConcurrentLog.logException(e); this.failure = e.getMessage(); return; } - try (/* Resources automatically closed by this try-with-resources statement */ - final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f); - final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os; - final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream)); - ) { - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(""); - pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); - pw.println(""); - pw.println("http://yacy.net"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(" "); - pw.println(" "); - pw.println(" " + this.query + ""); - pw.println(" "); - pw.println(""); - pw.println(""); - } + try { + this.docCount = 0; + this.chunkSize = 0; + this.chunkCount = 0; + PrintWriter pw = getWriter(); + printHead(pw); if (this.dom) { final Map> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); final ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); @@ -865,7 +909,7 @@ public final class Fulltext { if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; if (this.format == ExportFormat.text) pw.println(host); if (this.format == ExportFormat.html) pw.println("" + host + "
"); - this.count++; + this.docCount++; this.chunkSize++; } } else { if (this.format == ExportFormat.solr || this.format == ExportFormat.elasticsearch || (this.text && this.format == ExportFormat.text)) { @@ -874,6 +918,12 @@ public final class Fulltext { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { final String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + if (this.minified) { + final Iterator> i = doc.iterator(); + while (i.hasNext()) { + if (!minified_keys.contains(i.next().getKey())) i.remove(); + } + } final CRIgnoreWriter sw = new CRIgnoreWriter(); if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName())); if (this.format == ExportFormat.solr) EnhancedXMLResponseWriter.writeDoc(sw, doc); @@ -882,7 +932,15 @@ public final class Fulltext { if (this.format == ExportFormat.elasticsearch) pw.println("{\"index\":{}}"); final String d = sw.toString(); pw.println(d); - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + this.chunkCount++; + pw = getWriter(); + printHead(pw); + this.chunkSize = 0; + } } } else { final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, @@ -918,21 +976,20 @@ public final class Fulltext { pw.println("" + hash + ""); pw.println(""); } - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + this.chunkCount++; + pw = getWriter(); + printHead(pw); + this.chunkSize = 0; + } } } } - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println("
"); - pw.println("
"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - } + printTail(pw); + pw.close(); } catch (final Exception e) { /* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */ ConcurrentLog.logException(e); @@ -942,15 +999,46 @@ public final class Fulltext { } public File file() { - return this.f; + final File f = new File(this.path, this.filename + "_" + chunkcount(this.chunkCount) + "." + this.fileext); + return f; + } + + private PrintWriter getWriter() throws IOException { + final File f = file(); + final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f); + final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os)); + return pw; + } + + private String chunkcount(final int count) { + if (count < 10) return "000" + count; + if (count < 100) return "00" + count; + if (count < 1000) return "0" + count; + return "" + count; + } + + public File path() { + return this.path; + } + + public String filename() { + return this.filename; + } + + public String fileext() { + return this.fileext; } public String failed() { return this.failure; } - public int count() { - return this.count; + public int docCount() { + return this.docCount; + } + + public int chunkCount() { + return this.chunkCount; } @SuppressWarnings("unchecked") diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java index 906bf30a9..cbde3a0a8 100644 --- a/source/org/openzim/ZIMFile.java +++ b/source/org/openzim/ZIMFile.java @@ -113,23 +113,28 @@ public class ZIMFile extends File { } this.mimeTypeList = mList.toArray(new String[mList.size()]); - // Initialize the Url Pointer List - this.urlPtrListBlob = new byte[this.header_entryCount * 8]; - mReader.seek(this.header_urlPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); - - // Initialize the Title Pointer List - this.titlePtrListBlob = new byte[this.header_entryCount * 4]; - mReader.seek(this.header_titlePtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); - - // Initialize the Cluster Pointer List - this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; - mReader.seek(this.header_clusterPtrPos); - RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + try { + // Initialize the Url Pointer List + this.urlPtrListBlob = new byte[this.header_entryCount * 8]; + mReader.seek(this.header_urlPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); + + // Initialize the Title Pointer List + this.titlePtrListBlob = new byte[this.header_entryCount * 4]; + mReader.seek(this.header_titlePtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); + + // Initialize the Cluster Pointer List + this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; + mReader.seek(this.header_clusterPtrPos); + RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); + } catch (IndexOutOfBoundsException e) { + throw new IOException(e.getMessage()); + } } public final String getMimeType(int idx) { + if (idx >= this.mimeTypeList.length) return ""; return this.mimeTypeList[idx]; } diff --git a/source/org/openzim/ZIMReader.java b/source/org/openzim/ZIMReader.java index bc39fd36b..14bc47dfd 100644 --- a/source/org/openzim/ZIMReader.java +++ 
b/source/org/openzim/ZIMReader.java @@ -237,11 +237,25 @@ public class ZIMReader { public DirectoryEntry getMainDirectoryEntry() throws IOException { DirectoryEntry de = getDirectoryInfo(this.mFile.header_mainPage); - if (de.namespace == 'W' && de.url.equals("mainPage") && de instanceof RedirectEntry) { + if (de instanceof RedirectEntry) { // resolve redirect to get the actual main page int redirect = ((RedirectEntry) de).redirect_index; de = getDirectoryInfo(redirect); } + // For the main entry we demand a "text/html" mime type. + // Many zim files do not provide this as the main file, which is strange (maybe lazy/irresponsibe) + // Because the main entry is important for a validation, we seek for one entry which may + // be proper for indexing. + int entryNumner = 0; + while (!de.getMimeType().equals("text/html") && entryNumner < this.mFile.header_entryCount) { + de = getDirectoryInfo(entryNumner); + entryNumner++; + if (de.namespace != 'C' && de.namespace != 'A') continue; + if (!(de instanceof ArticleEntry)) continue; + if (!de.getMimeType().equals("text/html")) continue; + if (de.url.contains("404") || de.title.contains("404") || de.title.contains("301")) continue; // is a pain + return de; + } return de; } @@ -337,10 +351,7 @@ public class ZIMReader { public Cluster getCluster(int clusterNumber) throws IOException { for (int i = 0; i < this.clusterCache.size(); i++) { Cluster c = clusterCache.get(i); - if (c.cluster_number == clusterNumber) { - c.incUsage(); // cache hit - return c; - } + if (c.cluster_number == clusterNumber) return c; } // cache miss @@ -348,17 +359,10 @@ public class ZIMReader { // check cache size if (clusterCache.size() >= MAX_CLUSTER_CACHE_SIZE) { - // remove one entry - double maxEntry = Double.MIN_VALUE; - int pos = -1; - for (int i = 0; i < clusterCache.size(); i++) { - double r = this.clusterCache.get(i).getUsageRatio(); - if (r > maxEntry) {maxEntry = r; pos = i;} - } - if (pos >= 0) this.clusterCache.remove(pos); + // remove one entry: the first entry is the oldest entry + this.clusterCache.remove(0); } - c.incUsage(); this.clusterCache.add(c); return c; } @@ -378,12 +382,10 @@ public class ZIMReader { private int cluster_number; // used to identify the correct cache entry private List blobs; - private int usageCounter; // used for efficient caching and cache stale detection private boolean extended; public Cluster(int cluster_number) throws IOException { this.cluster_number = cluster_number; - this.usageCounter = 0; // open the cluster and make a Input Stream with the proper decompression type final long clusterPos = mFile.geClusterPtr(cluster_number); @@ -444,21 +446,9 @@ public class ZIMReader { return this.blobs.get(i); } - public void incUsage() { - this.usageCounter++; - } - - public int getUsage() { - return this.usageCounter; - } - public int getSize() { return this.blobs.size(); } - - public double getUsageRatio() { - return ((double) this.usageCounter) / ((double) this.blobs.size()); - } } public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
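The ZIMReader changes above replace the usage-ratio based cluster cache eviction with plain FIFO: a cluster is appended on every miss and, once MAX_CLUSTER_CACHE_SIZE is reached, the entry at index 0 (the oldest one) is dropped. A minimal, generic sketch of that policy, independent of the ZIM classes; the names are illustrative only.

import java.util.ArrayList;
import java.util.List;

/** Illustrative FIFO cache sketch mirroring the simplified clusterCache logic. */
public class FifoCacheSketch<K, V> {

    private final int maxSize;
    private final List<K> keys = new ArrayList<>();
    private final List<V> values = new ArrayList<>();

    public FifoCacheSketch(final int maxSize) {
        this.maxSize = maxSize;
    }

    /** linear scan as in ZIMReader.getCluster; returns null on a cache miss */
    public V get(final K key) {
        for (int i = 0; i < this.keys.size(); i++) {
            if (this.keys.get(i).equals(key)) return this.values.get(i);
        }
        return null;
    }

    /** on a miss the caller loads the value and stores it here */
    public void put(final K key, final V value) {
        if (this.keys.size() >= this.maxSize) {
            // FIFO eviction: index 0 holds the oldest entry
            this.keys.remove(0);
            this.values.remove(0);
        }
        this.keys.add(key);
        this.values.add(value);
    }
}

Compared with the removed usage-ratio heuristic this drops the per-cluster bookkeeping (usageCounter, incUsage, getUsageRatio) at the cost of occasionally evicting a cluster that is still in frequent use.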