From 7db0534d8a0709a2903f1880e98aaa4657fbf462 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Sun, 5 Nov 2023 02:16:40 +0100
Subject: [PATCH] Added a zim parser to the surrogate import option.

You can now import zim files into YaCy by simply moving them to the
DATA/SURROGATE/IN folder. They will be fetched and, after parsing, moved to
DATA/SURROGATE/OUT. There are exceptions where the parser is not able to
identify the original URL of the documents in the zim file; in that case the
file is simply ignored. This commit also carries an important fix to the pdf
parser and an increase of the maximum parsing speed to 60000 PPM, which should
make it possible to index up to 1000 files per second.
---
 htroot/ConfigParser_p.html                    |  21 --
 htroot/Crawler_p.html                         |   4 +-
 ivy.xml                                       |   1 +
 .../cora/document/id/MultiProtocolURL.java    |  14 +-
 .../yacy/document/importer/ZimImporter.java   | 215 ++++++++++++++++--
 .../net/yacy/document/parser/pdfParser.java   | 126 +++-------
 source/net/yacy/htroot/ConfigParser_p.java    |  10 -
 source/net/yacy/htroot/Crawler_p.java         |  10 +-
 .../kelondro/data/meta/URIMetadataNode.java   |  12 +-
 source/net/yacy/search/Switchboard.java       |  18 +-
 .../net/yacy/search/SwitchboardConstants.java |   2 -
 source/org/openzim/ZIMFile.java               |  32 +--
 12 files changed, 279 insertions(+), 186 deletions(-)

diff --git a/htroot/ConfigParser_p.html b/htroot/ConfigParser_p.html
index a51ee1013..66a4665d3 100644
--- a/htroot/ConfigParser_p.html
+++ b/htroot/ConfigParser_p.html
@@ -51,27 +51,6 @@
-      PDF Parser Attributes
-
-      This is an experimental setting which makes it possible to split PDF documents into individual index entries.
-      Every page will become a single index hit and the URL is artificially extended with a post/get attribute containing
-      the page number as value. When such a URL is displayed within a search result, the post/get attribute is transformed
-      into an anchor hash link. This makes it possible to view the individual page directly in the pdf.js viewer built into
-      Firefox; for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options
-
-      Split PDF
-      Property Name
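The removed option minted one virtual index document per PDF page. A minimal sketch of the URL scheme it used, reconstructed from the deleted pdfParser code further below (helper names are illustrative, not part of the YaCy API):

    // A '#' fragment cannot be used for the virtual page URLs because it would
    // be stripped when the url hash is computed, so a post/get attribute is
    // appended instead; for display it becomes an anchor that pdf.js accepts.
    static String virtualPageUrl(String loc, String propertyName, int page) {
        return loc + (loc.indexOf('?') > 0 ? '&' : '?') + propertyName + '=' + page;
    }
    static String viewerAnchor(String loc, String propertyName, int page) {
        return loc + '#' + propertyName + '=' + page; // e.g. doc.pdf#page=3
    }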
      #%env/templates/footer.template%#

diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html
index 79a0319c0..3b328a996 100644
--- a/htroot/Crawler_p.html
+++ b/htroot/Crawler_p.html
@@ -134,7 +134,7 @@
             Speed / PPM
             (Pages Per Minute)
-
+
@@ -147,7 +147,7 @@
             Crawler PPM
-
+

diff --git a/ivy.xml b/ivy.xml
index 61f9ee127..8c072699d 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -28,6 +28,7 @@
+

diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java
index a3404bec0..1cac0dace 100644
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@@ -2593,14 +2593,18 @@ public class MultiProtocolURL implements Serializable, Comparable
         if (isHTTP() || isHTTPS()) {
-            try (final HTTPClient client = new HTTPClient(agent)) {
-                client.setHost(getHost());
-                org.apache.http.HttpResponse response = client.HEADResponse(this, true);
-                return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301);
-            }
+            final HTTPClient client = new HTTPClient(agent);
+            client.setHost(getHost());
+            org.apache.http.HttpResponse response = client.HEADResponse(this, true);
+            client.close();
+            if (response == null) return false;
+            int status = response.getStatusLine().getStatusCode();
+            return status == 200 || status == 301 || status == 302;
         }
         return false;
     } catch (IOException e) {
+        if (e.getMessage() != null && e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts
+        //e.printStackTrace();
         return false;
     }
 }

diff --git a/source/net/yacy/document/importer/ZimImporter.java b/source/net/yacy/document/importer/ZimImporter.java
index 26f36f787..118e27e40 100644
--- a/source/net/yacy/document/importer/ZimImporter.java
+++ b/source/net/yacy/document/importer/ZimImporter.java
@@ -25,12 +25,20 @@ package net.yacy.document.importer;
 import java.io.File;
 import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
 import java.util.Collection;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;
+import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.ConcurrentLog;
@@ -81,14 +89,18 @@ public class ZimImporter extends Thread implements Importer {
     public void run() {
         job = this;
         this.startTime = System.currentTimeMillis();
+        Switchboard sb = Switchboard.getSwitchboard();
         try {
             this.reader = new ZIMReader(this.file);
             this.guessedSource = getSource(this.reader);

             // verify the source
             DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
-            DigestURL url = new DigestURL(mainEntry.url);
-            if (!url.exists(ClientIdentification.browserAgent)) return;
+            DigestURL mainURL = guessURL(this.guessedSource, mainEntry);
+            if (!mainURL.exists(ClientIdentification.browserAgent)) {
+                sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL);
+                return;
+            }

             // read all documents
             for (int i = 0; i < this.file.header_entryCount; i++) {
@@ -98,8 +110,14 @@ public class ZimImporter extends Thread implements Importer {
                 ArticleEntry ae = (ArticleEntry) de;

                 // check url
-                String guessedUrl = guessURL(this.guessedSource, de);
-                assert guessedUrl.startsWith("http");
+                DigestURL guessedUrl = guessURL(this.guessedSource, de);
+                if (recordCnt < 10) {
+                    // critical test for the first 10 urls
+                    if
 (!guessedUrl.exists(ClientIdentification.browserAgent)) {
+                        sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
+                        return;
+                    }
+                }

                 // check availability of text parser
                 String mimeType = ae.getMimeType();
@@ -111,7 +129,17 @@ public class ZimImporter extends Thread implements Importer {
                 // create artificial request and response headers for the indexer
                 RequestHeader requestHeader = new RequestHeader();
                 ResponseHeader responseHeader = new ResponseHeader(200);
-                final Request request = new Request(new DigestURL(guessedUrl), null);
+                responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
+                final Request request = new Request(
+                        ASCII.getBytes(sb.peers.mySeed().hash),
+                        guessedUrl,
+                        null, // referrerhash: the hash of the referrer URL
+                        de.title, // name: the name of the document to crawl
+                        null, // appdate: the time when the url appeared for the first time
+                        sb.crawler.defaultSurrogateProfile.handle(), // profileHandle: the name of the prefetch profile. This must not be null!
+                        0, // depth: the crawling depth of the entry
+                        sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
+                        );
                 final Response response = new Response(
                         request,
                         requestHeader,
@@ -122,7 +150,7 @@ public class ZimImporter extends Thread implements Importer {
                 );

                 // throw this to the indexer
-                String error = Switchboard.getSwitchboard().toIndexer(response);
+                String error = sb.toIndexer(response);
                 if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
                 this.recordCnt++;
             }
@@ -203,7 +231,7 @@ public class ZimImporter extends Thread implements Importer {
             case "fonts":
                 return "fonts.google.com";
             case "gutenberg":
-                return "gutenberg.org";
+                return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03";
             case "ifixit":
                 return "ifixit.com";
             case "lesfondamentaux":
@@ -223,11 +251,23 @@ public class ZimImporter extends Thread implements Importer {
             case "rapsberry_pi_docs":
                 return "raspberrypi.org";
             case "ted":
-                return "ted.com";
+                return "www.ted.com/search?q=";
             case "vikidia":
-                return "vikidia.org";
+                return parts[1] + ".vikidia.org/wiki";
             case "westeros":
                 return "westeros.org";
+            case "wikihow":
+                return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
+            case "wikisource":
+                return parts[1] + ".wikisource.org/wiki";
+            case "wikiversity":
+                return parts[1] + ".wikiversity.org/wiki";
+            case "wikivoyage":
+                return parts[1] + ".wikivoyage.org/wiki";
+            case "wiktionary":
+                return parts[1] + ".wiktionary.org/wiki";
+            case "wikiquote":
+                return parts[1] + ".wikiquote.org/wiki";
             case "wikibooks":
                 return parts[1] + ".wikibooks.org/wiki";
             case "wikinews":
@@ -273,16 +313,148 @@ public class ZimImporter extends Thread implements Importer {
         return source;
     }

-    public static String guessURL(String guessedSource, DirectoryEntry de) {
+    public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
         String url = de.url;
         if (url.equals("Main_Page")) url = "";
-        if (guessedSource != null) return guessedSource + url;
-        if (url.startsWith("A/")) return "https://" + url.substring(2);
-        if (url.startsWith("H/")) return "https://" + url.substring(2);
-        return guessedSource + url;
+        if (guessedSource != null) return new DigestURL(guessedSource + url);
+        if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
+        if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
+        return new DigestURL(guessedSource + url);
     }

+    private final static String[] skip_files = {
+        "iota.stackexchange.com_en_all_2023-05.zim", "stellar.stackexchange.com_en_all_2023-10.zim",
+        "vegetarianism.stackexchange.com_en_all_2023-05.zim", "esperanto.stackexchange.com_eo_all_2023-10.zim",
+        "tezos.stackexchange.com_en_all_2023-10.zim", "eosio.stackexchange.com_en_all_2023-10.zim",
+        "ebooks.stackexchange.com_en_all_2023-10.zim", "poker.stackexchange.com_en_all_2023-05.zim",
+        "cseducators.stackexchange.com_en_all_2023-10.zim", "iot.stackexchange.com_en_all_2023-05.zim",
+        "portuguese.stackexchange.com_pt_all_2023-04.zim", "portuguese.stackexchange.com_pt_all_2023-10.zim",
+        "italian.stackexchange.com_it_all_2023-05.zim", "monero.stackexchange.com_en_all_2022-11.zim",
+        "sustainability.stackexchange.com_en_all_2023-05.zim", "westeros_en_all_nopic_2021-03.zim",
+        "opensource.stackexchange.com_en_all_2023-10.zim", "tor.stackexchange.com_en_all_2023-05.zim",
+        "devops.stackexchange.com_en_all_2023-10.zim", "patents.stackexchange.com_en_all_2023-10.zim",
+        "stackapps.com_en_all_2023-05.zim", "hardwarerecs.stackexchange.com_en_all_2023-05.zim",
+        "hsm.stackexchange.com_en_all_2023-05.zim", "expatriates.stackexchange.com_en_all_2023-11.zim",
+        "opendata.stackexchange.com_en_all_2023-10.zim", "sports.stackexchange.com_en_all_2023-05.zim",
+        "wikinews_de_all_nopic_2023-10.zim", "computergraphics.stackexchange.com_en_all_2023-10.zim",
+        "tridion.stackexchange.com_en_all_2023-10.zim", "bioinformatics.stackexchange.com_en_all_2023-10.zim",
+        "expressionengine.stackexchange.com_en_all_2023-11.zim", "elementaryos.stackexchange.com_en_all_2023-10.zim",
+        "cstheory.stackexchange.com_en_all_2023-10.zim", "chess.stackexchange.com_en_all_2023-05.zim",
+        "vi.stackexchange.com_en_all_2023-05.zim", "fitness.stackexchange.com_en_all_2023-10.zim",
+        "pets.stackexchange.com_en_all_2023-05.zim", "french.stackexchange.com_fr_all_2023-10.zim",
+        "sqa.stackexchange.com_en_all_2023-05.zim", "islam.stackexchange.com_en_all_2023-05.zim",
+        "scicomp.stackexchange.com_en_all_2023-05.zim", "wikinews_en_all_nopic_2023-09.zim",
+        "ai.stackexchange.com_en_all_2023-10.zim", "boardgames.stackexchange.com_en_all_2023-05.zim",
+        "economics.stackexchange.com_en_all_2023-05.zim", "3dprinting.stackexchange.com_en_all_2023-07.zim",
+        "earthscience.stackexchange.com_en_all_2023-05.zim", "emacs.stackexchange.com_en_all_2023-10.zim",
+        "bitcoin.stackexchange.com_en_all_2023-05.zim", "philosophy.stackexchange.com_en_all_2023-05.zim",
+        "law.stackexchange.com_en_all_2023-05.zim", "astronomy.stackexchange.com_en_all_2023-05.zim",
+        "artofproblemsolving_en_all_nopic_2021-03.zim", "engineering.stackexchange.com_en_all_2023-05.zim",
+        "ja.stackoverflow.com_ja_all_2023-06.zim", "webmasters.stackexchange.com_en_all_2023-05.zim",
+        "anime.stackexchange.com_en_all_2023-10.zim", "cooking.stackexchange.com_en_all_2023-05.zim",
+        "arduino.stackexchange.com_en_all_2023-05.zim", "money.stackexchange.com_en_all_2023-05.zim",
+        "judaism.stackexchange.com_en_all_2023-05.zim", "ethereum.stackexchange.com_en_all_2023-05.zim",
+        "datascience.stackexchange.com_en_all_2023-10.zim", "academia.stackexchange.com_en_all_2023-10.zim",
+        "music.stackexchange.com_en_all_2023-05.zim", "cs.stackexchange.com_en_all_2023-03.zim",
+        "dsp.stackexchange.com_en_all_2023-05.zim", "biology.stackexchange.com_en_all_2023-05.zim",
+        "android.stackexchange.com_en_all_2023-10.zim", "bicycles.stackexchange.com_en_all_2023-05.zim",
+        "puzzling.stackexchange.com_en_all_2023-05.zim", "photo.stackexchange.com_en_all_2023-05.zim",
+        "aviation.stackexchange.com_en_all_2023-05.zim", "drupal.stackexchange.com_en_all_2023-05.zim",
+        "ux.stackexchange.com_en_all_2023-05.zim", "ell.stackexchange.com_en_all_2023-10.zim",
+        "openstreetmap-wiki_en_all_nopic_2023-05.zim", "softwareengineering.stackexchange.com_en_all_2023-05.zim",
+        "gaming.stackexchange.com_en_all_2023-10.zim", "mathematica.stackexchange.com_en_all_2023-10.zim",
+        "pt.stackoverflow.com_pt_all_2023-06.zim", "apple.stackexchange.com_en_all_2023-05.zim",
+        "diy.stackexchange.com_en_all_2023-08.zim", "es.stackoverflow.com_es_all_2023-06.zim",
+        "gis.stackexchange.com_en_all_2023-05.zim", "stats.stackexchange.com_en_all_2023-05.zim",
+        "physics.stackexchange.com_en_all_2023-05.zim", "serverfault.com_en_all_2023-05.zim",
+        "electronics.stackexchange.com_en_all_2023-05.zim", "tex.stackexchange.com_en_all_2023-05.zim",
+        "wikibooks_de_all_nopic_2021-03.zim", "askubuntu.com_en_all_2023-05.zim",
+        "superuser.com_en_all_2023-05.zim", "lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim",
+        "wikibooks_en_all_nopic_2021-03.zim", "courses.lumenlearning.com_en_all_2021-03.zim",
+        "wikipedia_de_all_nopic_2023-10.zim", "wikipedia_en_all_nopic_2023-10.zim",
+        "stackoverflow.com_en_all_nopic_2022-07.zim", "stackoverflow.com_en_all_2023-05.zim",
+        "armypubs_en_all_2023-08.zim", "vikidia_en_all_nopic_2023-09.zim",
+        "wikiquote_de_all_nopic_2023-10.zim", "wikiquote_en_all_nopic_2023-09.zim",
+        "wiktionary_de_all_nopic_2023-10.zim", "wiktionary_en_all_nopic_2023-10.zim",
+        "wikihow_de_maxi_2023-10.zim", "wikivoyage_de_all_nopic_2023-09.zim",
+        "wikiversity_de_all_nopic_2021-03.zim", "wikiversity_en_all_nopic_2021-03.zim",
+        "wikisource_de_all_nopic_2023-09.zim", "wikisource_en_all_nopic_2023-08.zim",
+        "ted_countdown_global_2023-09.zim", "ted_en_design_2023-09.zim",
+        "ted_en_business_2023-09.zim", "ted_en_global_issues_2023-09.zim",

+        // 302
+        "moderators.stackexchange.com_en_all_2023-05.zim", "beer.stackexchange.com_en_all_2023-05.zim",
+        "health.stackexchange.com_en_all_2023-05.zim", "avp.stackexchange.com_en_all_2023-05.zim",
+        "lowtechmagazine.com_en_all_2023-08.zim", "ifixit_de_all_2023-07.zim",
+        "ifixit_en_all_2023-10.zim", "der-postillon.com_de_all_2020-12.zim",
+        "wikihow_en_maxi_2023-03.zim",
+    };
+
     public static void main(String[] args) {
+        Set<String> skip = new HashSet<>();
+        for (String s: skip_files) skip.add(s);
         // zim file import test
         // will test mostly if domain names are included in zim file urls
         String zimFilesPath = args[0];
@@ -298,7 +470,10 @@ public class ZimImporter extends Thread implements Importer {
         }
         Collection<File> orderedFiles = orderedFileMap.values();
+        Set<String> files_ok = new LinkedHashSet<>();
+        Set<String> files_nok = new LinkedHashSet<>();
         for (File f: orderedFiles) {
+            if (skip.contains(f.getName())) continue;
             try {
                 ZIMFile z = new ZIMFile(f.getAbsolutePath());
                 ZIMReader r = new ZIMReader(z);
@@ -308,16 +483,20 @@
                 System.out.println("Namespace: " + de.namespace);
                 System.out.println("Title: " + de.title);
                 System.out.println("URL: " + de.url);
-                System.out.println("guessed domain: " + guessDomainName(f.getName()));
+                System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduce a source from the file name
                 String source = getSource(r);
-                System.out.println("guessed Source: " + source);
-                String mainURL = guessURL(source, de);
+                System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file
+                DigestURL mainURL = guessURL(source, de);
                 System.out.println("guessed main article: " + mainURL);
-                System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent));
+                boolean ok = mainURL.exists(ClientIdentification.browserAgent);
+                System.out.println("main article exists: " + ok);
+                if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName());
                 System.out.println();
             } catch (IOException e) {
                 e.printStackTrace();
             }
         }
+        System.out.println("ok files: " + files_ok.toString());
+        System.out.println("not-ok files: " + files_nok.toString());
     }
 }

diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index 0ad6b2248..f02577244 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -53,7 +53,6 @@
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.text.PDFTextStripper;
-import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
@@ -69,9 +68,6 @@
 public class pdfParser extends AbstractParser implements Parser {

-    public static boolean individualPages = false;
-    public static String individualPagePropertyname = "page";
-
     public pdfParser() {
         super("Acrobat Portable Document Parser");
         this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser {
         // get the links
         final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);

-        // get the fulltext (either per document or for each page)
-        final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);
-
-        if (individualPages) {
-            // this is a hack which stores individual pages of the source pdf into individual index documents
-            // the new documents will get a virtual link with a post argument page=X appended to the
original url - - // collect text - final int pagecount = pdfDoc.getNumberOfPages(); - final String[] pages = new String[pagecount]; - for (int page = 1; page <= pagecount; page++) { - stripper.setStartPage(page); - stripper.setEndPage(page); - pages[page - 1] = stripper.getText(pdfDoc); - //System.out.println("PAGE " + page + ": " + pages[page - 1]); - } - - // create individual documents for each page - assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); - result = new Document[Math.min(pages.length, pdflinks.size())]; - final String loc = location.toNormalform(true); - for (int page = 0; page < result.length; page++) { - result[page] = new Document( - new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash - mimeType, - StandardCharsets.UTF_8.name(), - this, - null, - docKeywords, - singleList(docTitle), - docAuthor, - docPublisher, - null, - null, - 0.0d, 0.0d, - pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]), - pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page), - null, - null, - false, - docDate); - } - } else { - // collect the whole text at once - final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); - byte[] contentBytes = new byte[0]; - stripper.setEndPage(3); // get first 3 pages (always) - writer.append(stripper.getText(pdfDoc)); - contentBytes = writer.getBytes(); // remember text in case of interrupting thread - - if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read - stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text) - stripper.setEndPage(Integer.MAX_VALUE); // set to default - // we start the pdf parsing in a separate thread to ensure that it can be terminated - final PDDocument pdfDocC = pdfDoc; - final Thread t = new Thread("pdfParser.getText:" + location) { - @Override - public void run() { - try { - writer.append(stripper.getText(pdfDocC)); - } catch (final Throwable e) {} - } - }; - t.start(); - t.join(3000); // pdfbox likes to forget to terminate ... 
 (quite often)
-                if (t.isAlive()) t.interrupt();
-                contentBytes = writer.getBytes(); // get final text before closing writer
-                writer.close(); // free writer resources
-            }
-
-            final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
-            for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
-            result = new Document[]{new Document(
-                    location,
-                    mimeType,
-                    StandardCharsets.UTF_8.name(),
-                    this,
-                    null,
-                    docKeywords,
-                    singleList(docTitle),
-                    docAuthor,
-                    docPublisher,
-                    null,
-                    null,
-                    0.0d, 0.0d,
-                    contentBytes,
-                    pdflinksCombined,
-                    null,
-                    null,
-                    false,
-                    docDate)};
-        }
+        // collect the whole text at once
+        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
+        byte[] contentBytes = new byte[0];
+        final PDFTextStripper stripper = new PDFTextStripper();
+        stripper.setEndPage(Integer.MAX_VALUE);
+        writer.append(stripper.getText(pdfDoc));
+        contentBytes = writer.getBytes(); // remember text in case of interrupting thread
+        writer.close(); // free writer resources
+
+        final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
+        for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
+        result = new Document[]{new Document(
+                location,
+                mimeType,
+                StandardCharsets.UTF_8.name(),
+                this,
+                null,
+                docKeywords,
+                singleList(docTitle),
+                docAuthor,
+                docPublisher,
+                null,
+                null,
+                0.0d, 0.0d,
+                contentBytes,
+                pdflinksCombined,
+                null,
+                null,
+                false,
+                docDate)};
     } catch (final Throwable e) {
         //throw new Parser.Failure(e.getMessage(), location);
     } finally {

diff --git a/source/net/yacy/htroot/ConfigParser_p.java b/source/net/yacy/htroot/ConfigParser_p.java
index e466d783b..943279382 100644
--- a/source/net/yacy/htroot/ConfigParser_p.java
+++ b/source/net/yacy/htroot/ConfigParser_p.java
@@ -61,13 +61,6 @@ public class ConfigParser_p {
             env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime());
             env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension());
         }
-
-        if (post.containsKey("pdfSettings")) {
-            env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages"));
-            env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page"));
-            pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
-            pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
-        }
     }

     int i = 0;
@@ -94,9 +87,6 @@ public class ConfigParser_p {
     prop.put("parser", i);

-    prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false));
-    prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"));
-
     // return rewrite properties
     return prop;
 }

diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java
index e95562713..8c898f558 100644
--- a/source/net/yacy/htroot/Crawler_p.java
+++ b/source/net/yacy/htroot/Crawler_p.java
@@ -774,7 +774,7 @@ public class Crawler_p {
     }
     /*
-     * PPM
+     * PPM LF MH
@@ -784,19 +784,19 @@
     if (post != null && post.containsKey("crawlingPerformance")) {
         final String crawlingPerformance = post.get("crawlingPerformance", "custom");
         final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
-        int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1);
+        int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1);
         try {
             wantedPPM = post.getInt("customPPM", wantedPPM);
         } catch (final NumberFormatException e) {}
         if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
-        if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000;
+        if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000;

         int wPPM = wantedPPM;
         if ( wPPM <= 0 ) {
             wPPM = 1;
         }
-        if ( wPPM >= 30000 ) {
-            wPPM = 30000;
+        if ( wPPM >= 60000 ) {
+            wPPM = 60000;
         }
         final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60

diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 33b797524..2d93ec8b7 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable

 * public static final String PROXY_ONLINE_CAUTION_DELAY = "onlineCautionDelay"

 *
 * Name of the setting how long indexing should pause after the last time the proxy was used, in milliseconds
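The PPM change in Crawler_p.java above is a straight inversion between pages per minute and the per-fetch busy-sleep pause. A minimal sketch of that round trip (constants as in the diff; method names illustrative):

    // 60000 ms per minute / PPM = pause in ms between fetches; the new
    // 60000 PPM ceiling therefore means a 1 ms pause, i.e. up to 1000
    // documents per second, matching the commit message.
    static int busySleepForPPM(int wantedPPM) {
        int wPPM = Math.max(1, Math.min(60000, wantedPPM)); // clamp to 1..60000
        return 60000 / wPPM;
    }
    static int ppmForBusySleep(long busySleepMs) {
        return busySleepMs == 0 ? 60000 : (int) (60000L / busySleepMs);
    }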

diff --git a/source/org/openzim/ZIMFile.java b/source/org/openzim/ZIMFile.java
index 906bf30a9..a241507ab 100644
--- a/source/org/openzim/ZIMFile.java
+++ b/source/org/openzim/ZIMFile.java
@@ -113,20 +113,24 @@ public class ZIMFile extends File {
         }
         this.mimeTypeList = mList.toArray(new String[mList.size()]);

-        // Initialize the Url Pointer List
-        this.urlPtrListBlob = new byte[this.header_entryCount * 8];
-        mReader.seek(this.header_urlPtrPos);
-        RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
-
-        // Initialize the Title Pointer List
-        this.titlePtrListBlob = new byte[this.header_entryCount * 4];
-        mReader.seek(this.header_titlePtrPos);
-        RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
-
-        // Initialize the Cluster Pointer List
-        this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
-        mReader.seek(this.header_clusterPtrPos);
-        RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
+        try {
+            // Initialize the Url Pointer List
+            this.urlPtrListBlob = new byte[this.header_entryCount * 8];
+            mReader.seek(this.header_urlPtrPos);
+            RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
+
+            // Initialize the Title Pointer List
+            this.titlePtrListBlob = new byte[this.header_entryCount * 4];
+            mReader.seek(this.header_titlePtrPos);
+            RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
+
+            // Initialize the Cluster Pointer List
+            this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
+            mReader.seek(this.header_clusterPtrPos);
+            RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
+        } catch (IndexOutOfBoundsException e) {
+            throw new IOException(e.getMessage());
+        }
     }

     public final String getMimeType(int idx) {
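A note on the three blobs read above: the ZIM format stores one 8-byte little-endian offset per directory entry in the URL pointer list, one 4-byte index per entry in the title pointer list, and one 8-byte offset per cluster, which is why the blobs are sized entryCount * 8, entryCount * 4 and clusterCount * 8; the new catch clause turns a corrupt or oversized header count into a clean IOException instead of an uncaught IndexOutOfBoundsException. A minimal sketch of decoding one pointer from such a blob (method name illustrative, not part of ZIMFile; assumes the little-endian layout of the ZIM spec):

    // Decode the idx-th 8-byte little-endian pointer from the url pointer blob.
    static long urlPointerAt(byte[] urlPtrListBlob, int idx) {
        long v = 0;
        for (int i = 7; i >= 0; i--) {
            v = (v << 8) | (urlPtrListBlob[idx * 8 + i] & 0xFFL); // byte 0 is least significant
        }
        return v;
    }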