Added a zim parser to the surrogate import option.

You can now import zim files into YaCy by simply moving them
to the DATA/SURROGATE/IN folder. They will be fetched and, after
parsing, moved to DATA/SURROGATE/OUT.
There are exceptions where the parser is not able to identify the
original URL of the documents in the zim file. In that case the file
is simply ignored.
This commit also carries an important fix to the pdf parser and an
increase of the maximum parsing speed to 60000 PPM, which should make it
possible to index up to 1000 pages in one second.
pull/610/head
Michael Peter Christen 1 year ago
parent 70e29937ef
commit 7db0534d8a

@ -51,27 +51,6 @@
</tr> </tr>
</table> </table>
</fieldset> </fieldset>
<fieldset><legend id="parser">PDF Parser Attributes</legend>
<p>
This is an experimental setting which makes it possible to split PDF documents into individual index entries.
Every page will become a single index hit and the URL is artificially extended with a post/get attribute value containing
the page number as value. When such a URL is displayed within a search result, the post/get attribute is transformed into an anchor hash link.
This makes it possible to view the individual page directly in the pdf.js viewer built into Firefox;
for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options
</p>
<table border="0">
<tr class="TableCellLight">
<td class="small" align="right" width="90">Split PDF</td>
<td class="small" align="left" width="300"><input type="checkbox" name="individualPages" #(individualPages)#::checked="checked" #(/individualPages)#/></td>
</tr>
<tr class="TableCellLight">
<td class="small" align="right">Property Name</td>
<td class="small" align="left"><input type="text" name="individualPagePropertyname" value="#[individualPagePropertyname]#"/></td>
</tr>
<tr class="TableCellDark">
<td colspan="3" class="small" ><input type="submit" name="pdfSettings" value="Submit" class="btn btn-primary"/></td>
</tr>
</table>
</form> </form>
#%env/templates/footer.template%# #%env/templates/footer.template%#
</body> </body>

@ -134,7 +134,7 @@
<tr class="TableCellLight"> <tr class="TableCellLight">
<td align="left">Speed / PPM<br/>(Pages Per Minute)</td> <td align="left">Speed / PPM<br/>(Pages Per Minute)</td>
<td align="left" colspan="4"> <td align="left" colspan="4">
<input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label> <input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label>
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:3.5em" value="#[latencyFactorDefault]#" /> <input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:3.5em" value="#[latencyFactorDefault]#" />
<label for="latencyFactor"><abbr title="Latency Factor">LF</abbr></label> <label for="latencyFactor"><abbr title="Latency Factor">LF</abbr></label>
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:3em" value="#[MaxSameHostInQueueDefault]#" /> <input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:3em" value="#[MaxSameHostInQueueDefault]#" />
@ -147,7 +147,7 @@
<td align="left">Crawler PPM</td> <td align="left">Crawler PPM</td>
<td align="left" width="60"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td> <td align="left" width="60"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left" width="260px" colspan="3"> <td align="left" width="260px" colspan="3">
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/> <progress id="ppmbar" max="60000" value="0" style="width:94%;"/>
</td> </td>
</tr> </tr>
<tr class="TableCellLight"> <tr class="TableCellLight">

@ -28,6 +28,7 @@
<dependency org="io.opentracing" name="opentracing-noop" rev="0.33.0"/> <dependency org="io.opentracing" name="opentracing-noop" rev="0.33.0"/>
<dependency org="io.opentracing" name="opentracing-util" rev="0.33.0"/> <dependency org="io.opentracing" name="opentracing-util" rev="0.33.0"/>
<dependency org="javax.servlet" name="javax.servlet-api" rev="3.1.0"/> <dependency org="javax.servlet" name="javax.servlet-api" rev="3.1.0"/>
<dependency org="javainetlocator" name="inetaddresslocator" rev="2.18" />
<dependency org="jcifs" name="jcifs" rev="1.3.17" conf="compile->master" /> <dependency org="jcifs" name="jcifs" rev="1.3.17" conf="compile->master" />
<dependency org="net.arnx" name="jsonic" rev="1.3.10"/> <dependency org="net.arnx" name="jsonic" rev="1.3.10"/>
<dependency org="net.jthink" name="jaudiotagger" rev="2.2.5"/> <dependency org="net.jthink" name="jaudiotagger" rev="2.2.5"/>

@ -2593,14 +2593,18 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return client.fileSize(path) > 0; return client.fileSize(path) > 0;
} }
if (isHTTP() || isHTTPS()) { if (isHTTP() || isHTTPS()) {
try (final HTTPClient client = new HTTPClient(agent)) { final HTTPClient client = new HTTPClient(agent);
client.setHost(getHost()); client.setHost(getHost());
org.apache.http.HttpResponse response = client.HEADResponse(this, true); org.apache.http.HttpResponse response = client.HEADResponse(this, true);
return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301); client.close();
} if (response == null) return false;
int status = response.getStatusLine().getStatusCode();
return status == 200 || status == 301 || status == 302;
} }
return false; return false;
} catch (IOException e) { } catch (IOException e) {
if (e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts
//e.printStackTrace();
return false; return false;
} }
} }

@ -25,12 +25,20 @@ package net.yacy.document.importer;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection; import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
@ -81,14 +89,18 @@ public class ZimImporter extends Thread implements Importer {
public void run() { public void run() {
job = this; job = this;
this.startTime = System.currentTimeMillis(); this.startTime = System.currentTimeMillis();
Switchboard sb = Switchboard.getSwitchboard();
try { try {
this.reader = new ZIMReader(this.file); this.reader = new ZIMReader(this.file);
this.guessedSource = getSource(this.reader); this.guessedSource = getSource(this.reader);
// verify the source // verify the source
DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry(); DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
DigestURL url = new DigestURL(mainEntry.url); DigestURL mainURL = guessURL(this.guessedSource, mainEntry);
if (!url.exists(ClientIdentification.browserAgent)) return; if (!mainURL.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL);
return;
}
// read all documents // read all documents
for (int i = 0; i < this.file.header_entryCount; i++) { for (int i = 0; i < this.file.header_entryCount; i++) {
@ -98,8 +110,14 @@ public class ZimImporter extends Thread implements Importer {
ArticleEntry ae = (ArticleEntry) de; ArticleEntry ae = (ArticleEntry) de;
// check url // check url
String guessedUrl = guessURL(this.guessedSource, de); DigestURL guessedUrl = guessURL(this.guessedSource, de);
assert guessedUrl.startsWith("http"); if (recordCnt < 10) {
// critical test for the first 10 urls
if (!guessedUrl.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
return;
}
}
// check availability of text parser // check availability of text parser
String mimeType = ae.getMimeType(); String mimeType = ae.getMimeType();
@ -111,7 +129,17 @@ public class ZimImporter extends Thread implements Importer {
// create artificial request and response headers for the indexer // create artificial request and response headers for the indexer
RequestHeader requestHeader = new RequestHeader(); RequestHeader requestHeader = new RequestHeader();
ResponseHeader responseHeader = new ResponseHeader(200); ResponseHeader responseHeader = new ResponseHeader(200);
final Request request = new Request(new DigestURL(guessedUrl), null); responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
final Request request = new Request(
ASCII.getBytes(sb.peers.mySeed().hash),
guessedUrl,
null, // referrerhash the hash of the referrer URL
de.title, // name the name of the document to crawl
null, // appdate the time when the url was first time appeared
sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null!
0, // depth the crawling depth of the entry
sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
);
final Response response = new Response( final Response response = new Response(
request, request,
requestHeader, requestHeader,
@ -122,7 +150,7 @@ public class ZimImporter extends Thread implements Importer {
); );
// throw this to the indexer // throw this to the indexer
String error = Switchboard.getSwitchboard().toIndexer(response); String error = sb.toIndexer(response);
if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error); if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
this.recordCnt++; this.recordCnt++;
} }
@ -203,7 +231,7 @@ public class ZimImporter extends Thread implements Importer {
case "fonts": case "fonts":
return "fonts.google.com"; return "fonts.google.com";
case "gutenberg": case "gutenberg":
return "gutenberg.org"; return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03";
case "ifixit": case "ifixit":
return "ifixit.com"; return "ifixit.com";
case "lesfondamentaux": case "lesfondamentaux":
@ -223,11 +251,23 @@ public class ZimImporter extends Thread implements Importer {
case "rapsberry_pi_docs": case "rapsberry_pi_docs":
return "raspberrypi.org"; return "raspberrypi.org";
case "ted": case "ted":
return "ted.com"; return "www.ted.com/search?q=";
case "vikidia": case "vikidia":
return "vikidia.org"; return parts[1] + ".vikidia.org/wiki";
case "westeros": case "westeros":
return "westeros.org"; return "westeros.org";
case "wikihow":
return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
case "wikisource":
return parts[1] + ".wikisource.org/wiki";
case "wikiversity":
return parts[1] + ".wikiversity.org/wiki";
case "wikivoyage":
return parts[1] + ".wikivoyage.org/wiki";
case "wiktionary":
return parts[1] + ".wiktionary.org/wiki";
case "wikiquote":
return parts[1] + ".wikiquote.org/wiki";
case "wikibooks": case "wikibooks":
return parts[1] + ".wikibooks.org/wiki"; return parts[1] + ".wikibooks.org/wiki";
case "wikinews": case "wikinews":
@ -273,16 +313,148 @@ public class ZimImporter extends Thread implements Importer {
return source; return source;
} }
public static String guessURL(String guessedSource, DirectoryEntry de) { public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
String url = de.url; String url = de.url;
if (url.equals("Main_Page")) url = ""; if (url.equals("Main_Page")) url = "";
if (guessedSource != null) return guessedSource + url; if (guessedSource != null) return new DigestURL(guessedSource + url);
if (url.startsWith("A/")) return "https://" + url.substring(2); if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
if (url.startsWith("H/")) return "https://" + url.substring(2); if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
return guessedSource + url; return new DigestURL(guessedSource + url);
} }
private final static String[] skip_files = {
"iota.stackexchange.com_en_all_2023-05.zim",
"stellar.stackexchange.com_en_all_2023-10.zim",
"vegetarianism.stackexchange.com_en_all_2023-05.zim",
"esperanto.stackexchange.com_eo_all_2023-10.zim",
"tezos.stackexchange.com_en_all_2023-10.zim",
"eosio.stackexchange.com_en_all_2023-10.zim",
"ebooks.stackexchange.com_en_all_2023-10.zim",
"poker.stackexchange.com_en_all_2023-05.zim",
"cseducators.stackexchange.com_en_all_2023-10.zim",
"iot.stackexchange.com_en_all_2023-05.zim",
"portuguese.stackexchange.com_pt_all_2023-04.zim",
"portuguese.stackexchange.com_pt_all_2023-10.zim",
"italian.stackexchange.com_it_all_2023-05.zim",
"monero.stackexchange.com_en_all_2022-11.zim",
"sustainability.stackexchange.com_en_all_2023-05.zim",
"westeros_en_all_nopic_2021-03.zim",
"opensource.stackexchange.com_en_all_2023-10.zim",
"tor.stackexchange.com_en_all_2023-05.zim",
"devops.stackexchange.com_en_all_2023-10.zim",
"patents.stackexchange.com_en_all_2023-10.zim",
"stackapps.com_en_all_2023-05.zim",
"hardwarerecs.stackexchange.com_en_all_2023-05.zim",
"hsm.stackexchange.com_en_all_2023-05.zim",
"expatriates.stackexchange.com_en_all_2023-11.zim",
"opendata.stackexchange.com_en_all_2023-10.zim",
"sports.stackexchange.com_en_all_2023-05.zim",
"wikinews_de_all_nopic_2023-10.zim",
"computergraphics.stackexchange.com_en_all_2023-10.zim",
"tridion.stackexchange.com_en_all_2023-10.zim",
"bioinformatics.stackexchange.com_en_all_2023-10.zim",
"expressionengine.stackexchange.com_en_all_2023-11.zim",
"elementaryos.stackexchange.com_en_all_2023-10.zim",
"cstheory.stackexchange.com_en_all_2023-10.zim",
"chess.stackexchange.com_en_all_2023-05.zim",
"vi.stackexchange.com_en_all_2023-05.zim",
"fitness.stackexchange.com_en_all_2023-10.zim",
"pets.stackexchange.com_en_all_2023-05.zim",
"french.stackexchange.com_fr_all_2023-10.zim",
"sqa.stackexchange.com_en_all_2023-05.zim",
"islam.stackexchange.com_en_all_2023-05.zim",
"scicomp.stackexchange.com_en_all_2023-05.zim",
"wikinews_en_all_nopic_2023-09.zim",
"ai.stackexchange.com_en_all_2023-10.zim",
"boardgames.stackexchange.com_en_all_2023-05.zim",
"economics.stackexchange.com_en_all_2023-05.zim",
"3dprinting.stackexchange.com_en_all_2023-07.zim",
"earthscience.stackexchange.com_en_all_2023-05.zim",
"emacs.stackexchange.com_en_all_2023-10.zim",
"bitcoin.stackexchange.com_en_all_2023-05.zim",
"philosophy.stackexchange.com_en_all_2023-05.zim",
"law.stackexchange.com_en_all_2023-05.zim",
"astronomy.stackexchange.com_en_all_2023-05.zim",
"artofproblemsolving_en_all_nopic_2021-03.zim",
"engineering.stackexchange.com_en_all_2023-05.zim",
"ja.stackoverflow.com_ja_all_2023-06.zim",
"webmasters.stackexchange.com_en_all_2023-05.zim",
"anime.stackexchange.com_en_all_2023-10.zim",
"cooking.stackexchange.com_en_all_2023-05.zim",
"arduino.stackexchange.com_en_all_2023-05.zim",
"money.stackexchange.com_en_all_2023-05.zim",
"judaism.stackexchange.com_en_all_2023-05.zim",
"ethereum.stackexchange.com_en_all_2023-05.zim",
"datascience.stackexchange.com_en_all_2023-10.zim",
"academia.stackexchange.com_en_all_2023-10.zim",
"music.stackexchange.com_en_all_2023-05.zim",
"cs.stackexchange.com_en_all_2023-03.zim",
"dsp.stackexchange.com_en_all_2023-05.zim",
"biology.stackexchange.com_en_all_2023-05.zim",
"android.stackexchange.com_en_all_2023-10.zim",
"bicycles.stackexchange.com_en_all_2023-05.zim",
"puzzling.stackexchange.com_en_all_2023-05.zim",
"photo.stackexchange.com_en_all_2023-05.zim",
"aviation.stackexchange.com_en_all_2023-05.zim",
"drupal.stackexchange.com_en_all_2023-05.zim",
"ux.stackexchange.com_en_all_2023-05.zim",
"ell.stackexchange.com_en_all_2023-10.zim",
"openstreetmap-wiki_en_all_nopic_2023-05.zim",
"softwareengineering.stackexchange.com_en_all_2023-05.zim",
"gaming.stackexchange.com_en_all_2023-10.zim",
"mathematica.stackexchange.com_en_all_2023-10.zim",
"pt.stackoverflow.com_pt_all_2023-06.zim",
"apple.stackexchange.com_en_all_2023-05.zim",
"diy.stackexchange.com_en_all_2023-08.zim",
"es.stackoverflow.com_es_all_2023-06.zim",
"gis.stackexchange.com_en_all_2023-05.zim",
"stats.stackexchange.com_en_all_2023-05.zim",
"physics.stackexchange.com_en_all_2023-05.zim",
"serverfault.com_en_all_2023-05.zim",
"electronics.stackexchange.com_en_all_2023-05.zim",
"tex.stackexchange.com_en_all_2023-05.zim",
"wikibooks_de_all_nopic_2021-03.zim",
"askubuntu.com_en_all_2023-05.zim",
"superuser.com_en_all_2023-05.zim",
"lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim",
"wikibooks_en_all_nopic_2021-03.zim",
"courses.lumenlearning.com_en_all_2021-03.zim",
"wikipedia_de_all_nopic_2023-10.zim",
"wikipedia_en_all_nopic_2023-10.zim",
"stackoverflow.com_en_all_nopic_2022-07.zim",
"stackoverflow.com_en_all_2023-05.zim",
"armypubs_en_all_2023-08.zim",
"vikidia_en_all_nopic_2023-09.zim",
"wikiquote_de_all_nopic_2023-10.zim",
"wikiquote_en_all_nopic_2023-09.zim",
"wiktionary_de_all_nopic_2023-10.zim",
"wiktionary_en_all_nopic_2023-10.zim",
"wikihow_de_maxi_2023-10.zim",
"wikivoyage_de_all_nopic_2023-09.zim",
"wikiversity_de_all_nopic_2021-03.zim",
"wikiversity_en_all_nopic_2021-03.zim",
"wikisource_de_all_nopic_2023-09.zim",
"wikisource_en_all_nopic_2023-08.zim",
"ted_countdown_global_2023-09.zim",
"ted_en_design_2023-09.zim",
"ted_en_business_2023-09.zim",
"ted_en_global_issues_2023-09.zim",
// 302
"moderators.stackexchange.com_en_all_2023-05.zim",
"beer.stackexchange.com_en_all_2023-05.zim",
"health.stackexchange.com_en_all_2023-05.zim",
"avp.stackexchange.com_en_all_2023-05.zim",
"lowtechmagazine.com_en_all_2023-08.zim",
"ifixit_de_all_2023-07.zim",
"ifixit_en_all_2023-10.zim",
"der-postillon.com_de_all_2020-12.zim",
"wikihow_en_maxi_2023-03.zim",
};
public static void main(String[] args) { public static void main(String[] args) {
Set<String> skip = new HashSet<>();
for (String s: skip_files) skip.add(s);
// zim file import test // zim file import test
// will test mostly if domain names are included in zim file urls // will test mostly if domain names are included in zim file urls
String zimFilesPath = args[0]; String zimFilesPath = args[0];
@ -298,7 +470,10 @@ public class ZimImporter extends Thread implements Importer {
} }
Collection<File> orderedFiles = orderedFileMap.values(); Collection<File> orderedFiles = orderedFileMap.values();
Set<String> files_ok = new LinkedHashSet<>();
Set<String> files_nok = new LinkedHashSet<>();
for (File f: orderedFiles) { for (File f: orderedFiles) {
if (skip.contains(f.getName())) continue;
try { try {
ZIMFile z = new ZIMFile(f.getAbsolutePath()); ZIMFile z = new ZIMFile(f.getAbsolutePath());
ZIMReader r = new ZIMReader(z); ZIMReader r = new ZIMReader(z);
@ -308,16 +483,20 @@ public class ZimImporter extends Thread implements Importer {
System.out.println("Namespace: " + de.namespace); System.out.println("Namespace: " + de.namespace);
System.out.println("Title: " + de.title); System.out.println("Title: " + de.title);
System.out.println("URL: " + de.url); System.out.println("URL: " + de.url);
System.out.println("guessed domain: " + guessDomainName(f.getName())); System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name
String source = getSource(r); String source = getSource(r);
System.out.println("guessed Source: " + source); System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file
String mainURL = guessURL(source, de); DigestURL mainURL = guessURL(source, de);
System.out.println("guessed main article: " + mainURL); System.out.println("guessed main article: " + mainURL);
System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent)); boolean ok = mainURL.exists(ClientIdentification.browserAgent);
System.out.println("main article exists: " + ok);
if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName());
System.out.println(); System.out.println();
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
System.out.println("ok files: " + files_ok.toString());
System.out.println("not-ok files: " + files_nok.toString());
} }
} }

@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
@ -69,9 +68,6 @@ import net.yacy.kelondro.util.MemoryControl;
public class pdfParser extends AbstractParser implements Parser { public class pdfParser extends AbstractParser implements Parser {
public static boolean individualPages = false;
public static String individualPagePropertyname = "page";
public pdfParser() { public pdfParser() {
super("Acrobat Portable Document Parser"); super("Acrobat Portable Document Parser");
this.SUPPORTED_EXTENSIONS.add("pdf"); this.SUPPORTED_EXTENSIONS.add("pdf");
@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser {
// get the links // get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc); final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page) // collect the whole text at once
final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/); final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
if (individualPages) { final PDFTextStripper stripper = new PDFTextStripper();
// this is a hack which stores individual pages of the source pdf into individual index documents stripper.setEndPage(Integer.MAX_VALUE);
// the new documents will get a virtual link with a post argument page=X appended to the original url writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
// collect text writer.close(); // free writer resources
final int pagecount = pdfDoc.getNumberOfPages();
final String[] pages = new String[pagecount]; final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (int page = 1; page <= pagecount; page++) { for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
stripper.setStartPage(page); result = new Document[]{new Document(
stripper.setEndPage(page); location,
pages[page - 1] = stripper.getText(pdfDoc); mimeType,
//System.out.println("PAGE " + page + ": " + pages[page - 1]); StandardCharsets.UTF_8.name(),
} this,
null,
// create individual documents for each page docKeywords,
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); singleList(docTitle),
result = new Document[Math.min(pages.length, pdflinks.size())]; docAuthor,
final String loc = location.toNormalform(true); docPublisher,
for (int page = 0; page < result.length; page++) { null,
result[page] = new Document( null,
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash 0.0d, 0.0d,
mimeType, contentBytes,
StandardCharsets.UTF_8.name(), pdflinksCombined,
this, null,
null, null,
docKeywords, false,
singleList(docTitle), docDate)};
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page),
null,
null,
false,
docDate);
}
} else {
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread("pdfParser.getText:" + location) {
@Override
public void run() {
try {
writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {}
}
};
t.start();
t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
if (t.isAlive()) t.interrupt();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); // free writer resources
}
final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,
null,
false,
docDate)};
}
} catch (final Throwable e) { } catch (final Throwable e) {
//throw new Parser.Failure(e.getMessage(), location); //throw new Parser.Failure(e.getMessage(), location);
} finally { } finally {

@ -61,13 +61,6 @@ public class ConfigParser_p {
env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime()); env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime());
env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension()); env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension());
} }
if (post.containsKey("pdfSettings")) {
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages"));
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page"));
pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
}
} }
int i = 0; int i = 0;
@ -94,9 +87,6 @@ public class ConfigParser_p {
prop.put("parser", i); prop.put("parser", i);
prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false));
prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"));
// return rewrite properties // return rewrite properties
return prop; return prop;
} }

@ -774,7 +774,7 @@ public class Crawler_p {
} }
/* /*
* <input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:46px" value="#[customPPMdefault]#" />PPM * <input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:46px" value="#[customPPMdefault]#" />PPM
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF <input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH <input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH
<input type="submit" name="crawlingPerformance" value="set" /> <input type="submit" name="crawlingPerformance" value="set" />
@ -784,19 +784,19 @@ public class Crawler_p {
if (post != null && post.containsKey("crawlingPerformance")) { if (post != null && post.containsKey("crawlingPerformance")) {
final String crawlingPerformance = post.get("crawlingPerformance", "custom"); final String crawlingPerformance = post.get("crawlingPerformance", "custom");
final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1); int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1);
try { try {
wantedPPM = post.getInt("customPPM", wantedPPM); wantedPPM = post.getInt("customPPM", wantedPPM);
} catch (final NumberFormatException e) {} } catch (final NumberFormatException e) {}
if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10; if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000; if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000;
int wPPM = wantedPPM; int wPPM = wantedPPM;
if ( wPPM <= 0 ) { if ( wPPM <= 0 ) {
wPPM = 1; wPPM = 1;
} }
if ( wPPM >= 30000 ) { if ( wPPM >= 60000 ) {
wPPM = 30000; wPPM = 60000;
} }
final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60

@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable<URIMe
public String urlstring() { public String urlstring() {
if (this.alternative_urlstring != null) return this.alternative_urlstring; if (this.alternative_urlstring != null) return this.alternative_urlstring;
if (!pdfParser.individualPages) return this.url().toNormalform(true); return this.url().toNormalform(true);
if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase(Locale.ROOT))) return this.url().toNormalform(true);
// for pdf links we rewrite the url
// this is a special treatment of pdf files which can be splitted into subpages
String pageprop = pdfParser.individualPagePropertyname;
String resultUrlstring = this.url().toNormalform(true);
int p = resultUrlstring.lastIndexOf(pageprop + "=");
if (p > 0) {
return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1);
}
return resultUrlstring;
} }
/** /**
* used for search result entry * used for search result entry

@ -176,6 +176,7 @@ import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.JsonListImporter; import net.yacy.document.importer.JsonListImporter;
import net.yacy.document.importer.OAIListFriendsLoader; import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.WarcImporter; import net.yacy.document.importer.WarcImporter;
import net.yacy.document.importer.ZimImporter;
import net.yacy.document.parser.audioTagParser; import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation; import net.yacy.document.parser.html.Evaluation;
@ -906,8 +907,6 @@ public final class Switchboard extends serverSwitch {
TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, "")); TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, "")); TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
pdfParser.individualPages = this.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = this.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
// start a loader // start a loader
this.log.config("Starting Crawl Loader"); this.log.config("Starting Crawl Loader");
@ -2153,6 +2152,20 @@ public final class Switchboard extends serverSwitch {
this.log.warn("IO Error processing warc file " + infile); this.log.warn("IO Error processing warc file " + infile);
} }
return moved; return moved;
} else if (s.endsWith(".zim")) {
try {
final ZimImporter wri = new ZimImporter(infile.getAbsolutePath());
wri.start();
try {
wri.join();
} catch (final InterruptedException ex) {
return moved;
}
moved = infile.renameTo(outfile);
} catch (final IOException ex) {
this.log.warn("IO Error processing zim file " + infile);
}
return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) { } else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) {
return this.processSurrogateJson(infile, outfile); return this.processSurrogateJson(infile, outfile);
} }
@ -2349,6 +2362,7 @@ public final class Switchboard extends serverSwitch {
if ( surrogate.endsWith(".xml") if ( surrogate.endsWith(".xml")
|| surrogate.endsWith(".xml.gz") || surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip") || surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".zim")
|| surrogate.endsWith(".warc") || surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz") || surrogate.endsWith(".warc.gz")
|| surrogate.endsWith(".jsonlist") || surrogate.endsWith(".jsonlist")

@ -220,8 +220,6 @@ public final class SwitchboardConstants {
public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody"; public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody";
public static final String PARSER_MIME_DENY = "parser.mime.deny"; public static final String PARSER_MIME_DENY = "parser.mime.deny";
public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny"; public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny";
public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages";
public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key";
/** /**
* <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p> * <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p>
* <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p> * <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p>

@ -113,20 +113,24 @@ public class ZIMFile extends File {
} }
this.mimeTypeList = mList.toArray(new String[mList.size()]); this.mimeTypeList = mList.toArray(new String[mList.size()]);
// Initialize the Url Pointer List try {
this.urlPtrListBlob = new byte[this.header_entryCount * 8]; // Initialize the Url Pointer List
mReader.seek(this.header_urlPtrPos); this.urlPtrListBlob = new byte[this.header_entryCount * 8];
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob); mReader.seek(this.header_urlPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
// Initialize the Title Pointer List
this.titlePtrListBlob = new byte[this.header_entryCount * 4]; // Initialize the Title Pointer List
mReader.seek(this.header_titlePtrPos); this.titlePtrListBlob = new byte[this.header_entryCount * 4];
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob); mReader.seek(this.header_titlePtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
// Initialize the Cluster Pointer List
this.clusterPtrListBlob = new byte[this.header_clusterCount * 8]; // Initialize the Cluster Pointer List
mReader.seek(this.header_clusterPtrPos); this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob); mReader.seek(this.header_clusterPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
} catch (IndexOutOfBoundsException e) {
throw new IOException(e.getMessage());
}
} }
public final String getMimeType(int idx) { public final String getMimeType(int idx) {

Loading…
Cancel
Save