Added a zim parser to the surrogate import option.

You can now import zim files into YaCy by simply moving them
to the DATA/SURROGATE/IN folder. They will be picked up automatically
and, after parsing, moved to DATA/SURROGATE/OUT.
There are exceptions where the parser cannot identify the
original URL of the documents in the zim file; in that case the file
is simply ignored.
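
For example, an import is nothing more than a file move into the IN folder; a
minimal sketch in Java (assuming the YaCy application directory as working
directory; the zim file name is only an example):

    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.StandardCopyOption;

    public class DropZimIntoSurrogateIn {
        public static void main(String[] args) throws Exception {
            // YaCy polls this folder and imports any supported surrogate file
            Path in = Path.of("DATA/SURROGATE/IN");
            Files.createDirectories(in);
            // after a successful import YaCy moves the file to DATA/SURROGATE/OUT
            Files.move(Path.of("wikipedia_en_all_nopic_2023-10.zim"),
                    in.resolve("wikipedia_en_all_nopic_2023-10.zim"),
                    StandardCopyOption.REPLACE_EXISTING);
        }
    }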
This commit also carries an important fix to the pdf parser and an
increase of the maximum parsing speed to 60000 PPM (pages per minute),
which should make it possible to index up to 60000 / 60 = 1000 pages per second.
pull/610/head
Michael Peter Christen 1 year ago
parent 70e29937ef
commit 7db0534d8a

@@ -51,27 +51,6 @@
</tr>
</table>
</fieldset>
<fieldset><legend id="parser">PDF Parser Attributes</legend>
<p>
This is an experimental setting which makes it possible to split PDF documents into individual index entries.
Every page will become a single index hit and the url is artificially extended with a post/get attribute value containing
the page number as value. When such an url is displayed within a search result, the post/get attribute is transformed into an anchor hash link.
This makes it possible to view the individual page directly in the pdf.js viewer built into Firefox;
for reference see https://github.com/mozilla/pdf.js/wiki/Viewer-options
</p>
<table border="0">
<tr class="TableCellLight">
<td class="small" align="right" width="90">Split PDF</td>
<td class="small" align="left" width="300"><input type="checkbox" name="individualPages" #(individualPages)#::checked="checked" #(/individualPages)#/></td>
</tr>
<tr class="TableCellLight">
<td class="small" align="right">Property Name</td>
<td class="small" align="left"><input type="text" name="individualPagePropertyname" value="#[individualPagePropertyname]#"/></td>
</tr>
<tr class="TableCellDark">
<td colspan="3" class="small" ><input type="submit" name="pdfSettings" value="Submit" class="btn btn-primary"/></td>
</tr>
</table>
</form>
#%env/templates/footer.template%#
</body>

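For reference, this is the url rewrite the removed Split PDF feature performed
(mirroring the urlstring() code deleted further below; all values are examples):

    public class PdfPageAnchorRewrite {
        public static void main(String[] args) {
            String u = "https://example.com/paper.pdf?page=5"; // example value
            String pageprop = "page";                          // individualPagePropertyname
            int p = u.lastIndexOf(pageprop + "=");
            if (p > 0) {
                // "?page=5" becomes "#page=5" so pdf.js opens the pdf at page 5
                u = u.substring(0, p - 1) + "#page=" + u.substring(p + pageprop.length() + 1);
            }
            System.out.println(u); // https://example.com/paper.pdf#page=5
        }
    }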
@@ -134,7 +134,7 @@
<tr class="TableCellLight">
<td align="left">Speed / PPM<br/>(Pages Per Minute)</td>
<td align="left" colspan="4">
<input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label>
<input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label>
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:3.5em" value="#[latencyFactorDefault]#" />
<label for="latencyFactor"><abbr title="Latency Factor">LF</abbr></label>
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:3em" value="#[MaxSameHostInQueueDefault]#" />
@@ -147,7 +147,7 @@
<td align="left">Crawler PPM</td>
<td align="left" width="60"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left" width="260px" colspan="3">
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/>
<progress id="ppmbar" max="60000" value="0" style="width:94%;"/>
</td>
</tr>
<tr class="TableCellLight">

@@ -28,6 +28,7 @@
<dependency org="io.opentracing" name="opentracing-noop" rev="0.33.0"/>
<dependency org="io.opentracing" name="opentracing-util" rev="0.33.0"/>
<dependency org="javax.servlet" name="javax.servlet-api" rev="3.1.0"/>
<dependency org="javainetlocator" name="inetaddresslocator" rev="2.18" />
<dependency org="jcifs" name="jcifs" rev="1.3.17" conf="compile->master" />
<dependency org="net.arnx" name="jsonic" rev="1.3.10"/>
<dependency org="net.jthink" name="jaudiotagger" rev="2.2.5"/>

@@ -2593,14 +2593,18 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return client.fileSize(path) > 0;
}
if (isHTTP() || isHTTPS()) {
try (final HTTPClient client = new HTTPClient(agent)) {
client.setHost(getHost());
org.apache.http.HttpResponse response = client.HEADResponse(this, true);
return response != null && (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 301);
}
final HTTPClient client = new HTTPClient(agent);
client.setHost(getHost());
org.apache.http.HttpResponse response = client.HEADResponse(this, true);
client.close();
if (response == null) return false;
int status = response.getStatusLine().getStatusCode();
return status == 200 || status == 301 || status == 302;
}
return false;
} catch (IOException e) {
if (e.getMessage().contains("Circular redirect to")) return true; // exception; this is a 302 which the client actually accepts
//e.printStackTrace();
return false;
}
}
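
The rewritten check now counts 200, 301 and 302 as proof of existence, and a
"Circular redirect" error is accepted as an implicit 302. A JDK-only sketch
with the same status semantics (a hypothetical helper, not the YaCy API):

    public class HeadProbe {
        // hedged sketch: HEAD-based existence probe using only the JDK
        static boolean headExists(String url) {
            try {
                java.net.HttpURLConnection c = (java.net.HttpURLConnection)
                        java.net.URI.create(url).toURL().openConnection();
                c.setRequestMethod("HEAD");
                c.setInstanceFollowRedirects(false); // see 301/302 directly
                int status = c.getResponseCode();
                c.disconnect();
                return status == 200 || status == 301 || status == 302;
            } catch (java.io.IOException e) {
                return false;
            }
        }
    }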

@@ -25,12 +25,20 @@ package net.yacy.document.importer;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
@@ -81,14 +89,18 @@ public class ZimImporter extends Thread implements Importer {
public void run() {
job = this;
this.startTime = System.currentTimeMillis();
Switchboard sb = Switchboard.getSwitchboard();
try {
this.reader = new ZIMReader(this.file);
this.guessedSource = getSource(this.reader);
// verify the source
DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
DigestURL url = new DigestURL(mainEntry.url);
if (!url.exists(ClientIdentification.browserAgent)) return;
DigestURL mainURL = guessURL(this.guessedSource, mainEntry);
if (!mainURL.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL);
return;
}
// read all documents
for (int i = 0; i < this.file.header_entryCount; i++) {
@@ -98,8 +110,14 @@ public class ZimImporter extends Thread implements Importer {
ArticleEntry ae = (ArticleEntry) de;
// check url
String guessedUrl = guessURL(this.guessedSource, de);
assert guessedUrl.startsWith("http");
DigestURL guessedUrl = guessURL(this.guessedSource, de);
if (recordCnt < 10) {
// critical test for the first 10 urls
if (!guessedUrl.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
return;
}
}
// check availability of text parser
String mimeType = ae.getMimeType();
@@ -111,7 +129,17 @@ public class ZimImporter extends Thread implements Importer {
// create artificial request and response headers for the indexer
RequestHeader requestHeader = new RequestHeader();
ResponseHeader responseHeader = new ResponseHeader(200);
final Request request = new Request(new DigestURL(guessedUrl), null);
responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
final Request request = new Request(
ASCII.getBytes(sb.peers.mySeed().hash),
guessedUrl,
null, // referrerhash the hash of the referrer URL
de.title, // name the name of the document to crawl
null, // appdate the time when the url was first time appeared
sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null!
0, // depth the crawling depth of the entry
sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
);
final Response response = new Response(
request,
requestHeader,
@@ -122,7 +150,7 @@ public class ZimImporter extends Thread implements Importer {
);
// throw this to the indexer
String error = Switchboard.getSwitchboard().toIndexer(response);
String error = sb.toIndexer(response);
if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
this.recordCnt++;
}
@@ -203,7 +231,7 @@ public class ZimImporter extends Thread implements Importer {
case "fonts":
return "fonts.google.com";
case "gutenberg":
return "gutenberg.org";
return "https://dev.library.kiwix.org/viewer#gutenberg_de_all_2023-03";
case "ifixit":
return "ifixit.com";
case "lesfondamentaux":
@@ -223,11 +251,23 @@ public class ZimImporter extends Thread implements Importer {
case "rapsberry_pi_docs":
return "raspberrypi.org";
case "ted":
return "ted.com";
return "www.ted.com/search?q=";
case "vikidia":
return "vikidia.org";
return parts[1] + ".vikidia.org/wiki";
case "westeros":
return "westeros.org";
case "wikihow":
return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
case "wikisource":
return parts[1] + ".wikisource.org/wiki";
case "wikiversity":
return parts[1] + ".wikiversity.org/wiki";
case "wikivoyage":
return parts[1] + ".wikivoyage.org/wiki";
case "wiktionary":
return parts[1] + ".wiktionary.org/wiki";
case "wikiquote":
return parts[1] + ".wikiquote.org/wiki";
case "wikibooks":
return parts[1] + ".wikibooks.org/wiki";
case "wikinews":
@@ -273,16 +313,148 @@ public class ZimImporter extends Thread implements Importer {
return source;
}
public static String guessURL(String guessedSource, DirectoryEntry de) {
public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
String url = de.url;
if (url.equals("Main_Page")) url = "";
if (guessedSource != null) return guessedSource + url;
if (url.startsWith("A/")) return "https://" + url.substring(2);
if (url.startsWith("H/")) return "https://" + url.substring(2);
return guessedSource + url;
if (guessedSource != null) return new DigestURL(guessedSource + url);
if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
return new DigestURL(guessedSource + url);
}
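
For illustration, two traces through guessURL with assumed example values
(not taken from the commit):

    // 1) a source was guessed: source and entry url are simply concatenated
    //    guessedSource = "en.wikipedia.org/wiki/", de.url = "YaCy"
    //    -> new DigestURL("en.wikipedia.org/wiki/YaCy")
    // 2) no source, but the entry url carries a zim namespace prefix:
    //    guessedSource = null, de.url = "A/en.wikipedia.org/wiki/YaCy"
    //    -> new DigestURL("https://en.wikipedia.org/wiki/YaCy")
    // note: if guessedSource is null and there is no "A/" or "H/" prefix,
    // the final fallback concatenates the literal string "null" with the url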
private final static String[] skip_files = {
"iota.stackexchange.com_en_all_2023-05.zim",
"stellar.stackexchange.com_en_all_2023-10.zim",
"vegetarianism.stackexchange.com_en_all_2023-05.zim",
"esperanto.stackexchange.com_eo_all_2023-10.zim",
"tezos.stackexchange.com_en_all_2023-10.zim",
"eosio.stackexchange.com_en_all_2023-10.zim",
"ebooks.stackexchange.com_en_all_2023-10.zim",
"poker.stackexchange.com_en_all_2023-05.zim",
"cseducators.stackexchange.com_en_all_2023-10.zim",
"iot.stackexchange.com_en_all_2023-05.zim",
"portuguese.stackexchange.com_pt_all_2023-04.zim",
"portuguese.stackexchange.com_pt_all_2023-10.zim",
"italian.stackexchange.com_it_all_2023-05.zim",
"monero.stackexchange.com_en_all_2022-11.zim",
"sustainability.stackexchange.com_en_all_2023-05.zim",
"westeros_en_all_nopic_2021-03.zim",
"opensource.stackexchange.com_en_all_2023-10.zim",
"tor.stackexchange.com_en_all_2023-05.zim",
"devops.stackexchange.com_en_all_2023-10.zim",
"patents.stackexchange.com_en_all_2023-10.zim",
"stackapps.com_en_all_2023-05.zim",
"hardwarerecs.stackexchange.com_en_all_2023-05.zim",
"hsm.stackexchange.com_en_all_2023-05.zim",
"expatriates.stackexchange.com_en_all_2023-11.zim",
"opendata.stackexchange.com_en_all_2023-10.zim",
"sports.stackexchange.com_en_all_2023-05.zim",
"wikinews_de_all_nopic_2023-10.zim",
"computergraphics.stackexchange.com_en_all_2023-10.zim",
"tridion.stackexchange.com_en_all_2023-10.zim",
"bioinformatics.stackexchange.com_en_all_2023-10.zim",
"expressionengine.stackexchange.com_en_all_2023-11.zim",
"elementaryos.stackexchange.com_en_all_2023-10.zim",
"cstheory.stackexchange.com_en_all_2023-10.zim",
"chess.stackexchange.com_en_all_2023-05.zim",
"vi.stackexchange.com_en_all_2023-05.zim",
"fitness.stackexchange.com_en_all_2023-10.zim",
"pets.stackexchange.com_en_all_2023-05.zim",
"french.stackexchange.com_fr_all_2023-10.zim",
"sqa.stackexchange.com_en_all_2023-05.zim",
"islam.stackexchange.com_en_all_2023-05.zim",
"scicomp.stackexchange.com_en_all_2023-05.zim",
"wikinews_en_all_nopic_2023-09.zim",
"ai.stackexchange.com_en_all_2023-10.zim",
"boardgames.stackexchange.com_en_all_2023-05.zim",
"economics.stackexchange.com_en_all_2023-05.zim",
"3dprinting.stackexchange.com_en_all_2023-07.zim",
"earthscience.stackexchange.com_en_all_2023-05.zim",
"emacs.stackexchange.com_en_all_2023-10.zim",
"bitcoin.stackexchange.com_en_all_2023-05.zim",
"philosophy.stackexchange.com_en_all_2023-05.zim",
"law.stackexchange.com_en_all_2023-05.zim",
"astronomy.stackexchange.com_en_all_2023-05.zim",
"artofproblemsolving_en_all_nopic_2021-03.zim",
"engineering.stackexchange.com_en_all_2023-05.zim",
"ja.stackoverflow.com_ja_all_2023-06.zim",
"webmasters.stackexchange.com_en_all_2023-05.zim",
"anime.stackexchange.com_en_all_2023-10.zim",
"cooking.stackexchange.com_en_all_2023-05.zim",
"arduino.stackexchange.com_en_all_2023-05.zim",
"money.stackexchange.com_en_all_2023-05.zim",
"judaism.stackexchange.com_en_all_2023-05.zim",
"ethereum.stackexchange.com_en_all_2023-05.zim",
"datascience.stackexchange.com_en_all_2023-10.zim",
"academia.stackexchange.com_en_all_2023-10.zim",
"music.stackexchange.com_en_all_2023-05.zim",
"cs.stackexchange.com_en_all_2023-03.zim",
"dsp.stackexchange.com_en_all_2023-05.zim",
"biology.stackexchange.com_en_all_2023-05.zim",
"android.stackexchange.com_en_all_2023-10.zim",
"bicycles.stackexchange.com_en_all_2023-05.zim",
"puzzling.stackexchange.com_en_all_2023-05.zim",
"photo.stackexchange.com_en_all_2023-05.zim",
"aviation.stackexchange.com_en_all_2023-05.zim",
"drupal.stackexchange.com_en_all_2023-05.zim",
"ux.stackexchange.com_en_all_2023-05.zim",
"ell.stackexchange.com_en_all_2023-10.zim",
"openstreetmap-wiki_en_all_nopic_2023-05.zim",
"softwareengineering.stackexchange.com_en_all_2023-05.zim",
"gaming.stackexchange.com_en_all_2023-10.zim",
"mathematica.stackexchange.com_en_all_2023-10.zim",
"pt.stackoverflow.com_pt_all_2023-06.zim",
"apple.stackexchange.com_en_all_2023-05.zim",
"diy.stackexchange.com_en_all_2023-08.zim",
"es.stackoverflow.com_es_all_2023-06.zim",
"gis.stackexchange.com_en_all_2023-05.zim",
"stats.stackexchange.com_en_all_2023-05.zim",
"physics.stackexchange.com_en_all_2023-05.zim",
"serverfault.com_en_all_2023-05.zim",
"electronics.stackexchange.com_en_all_2023-05.zim",
"tex.stackexchange.com_en_all_2023-05.zim",
"wikibooks_de_all_nopic_2021-03.zim",
"askubuntu.com_en_all_2023-05.zim",
"superuser.com_en_all_2023-05.zim",
"lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim",
"wikibooks_en_all_nopic_2021-03.zim",
"courses.lumenlearning.com_en_all_2021-03.zim",
"wikipedia_de_all_nopic_2023-10.zim",
"wikipedia_en_all_nopic_2023-10.zim",
"stackoverflow.com_en_all_nopic_2022-07.zim",
"stackoverflow.com_en_all_2023-05.zim",
"armypubs_en_all_2023-08.zim",
"vikidia_en_all_nopic_2023-09.zim",
"wikiquote_de_all_nopic_2023-10.zim",
"wikiquote_en_all_nopic_2023-09.zim",
"wiktionary_de_all_nopic_2023-10.zim",
"wiktionary_en_all_nopic_2023-10.zim",
"wikihow_de_maxi_2023-10.zim",
"wikivoyage_de_all_nopic_2023-09.zim",
"wikiversity_de_all_nopic_2021-03.zim",
"wikiversity_en_all_nopic_2021-03.zim",
"wikisource_de_all_nopic_2023-09.zim",
"wikisource_en_all_nopic_2023-08.zim",
"ted_countdown_global_2023-09.zim",
"ted_en_design_2023-09.zim",
"ted_en_business_2023-09.zim",
"ted_en_global_issues_2023-09.zim",
// 302
"moderators.stackexchange.com_en_all_2023-05.zim",
"beer.stackexchange.com_en_all_2023-05.zim",
"health.stackexchange.com_en_all_2023-05.zim",
"avp.stackexchange.com_en_all_2023-05.zim",
"lowtechmagazine.com_en_all_2023-08.zim",
"ifixit_de_all_2023-07.zim",
"ifixit_en_all_2023-10.zim",
"der-postillon.com_de_all_2020-12.zim",
"wikihow_en_maxi_2023-03.zim",
};
public static void main(String[] args) {
Set<String> skip = new HashSet<>();
for (String s: skip_files) skip.add(s);
// zim file import test
// will test mostly if domain names are included in zim file urls
String zimFilesPath = args[0];
@@ -298,7 +470,10 @@ public class ZimImporter extends Thread implements Importer {
}
Collection<File> orderedFiles = orderedFileMap.values();
Set<String> files_ok = new LinkedHashSet<>();
Set<String> files_nok = new LinkedHashSet<>();
for (File f: orderedFiles) {
if (skip.contains(f.getName())) continue;
try {
ZIMFile z = new ZIMFile(f.getAbsolutePath());
ZIMReader r = new ZIMReader(z);
@@ -308,16 +483,20 @@ public class ZimImporter extends Thread implements Importer {
System.out.println("Namespace: " + de.namespace);
System.out.println("Title: " + de.title);
System.out.println("URL: " + de.url);
System.out.println("guessed domain: " + guessDomainName(f.getName()));
System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name
String source = getSource(r);
System.out.println("guessed Source: " + source);
String mainURL = guessURL(source, de);
System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file
DigestURL mainURL = guessURL(source, de);
System.out.println("guessed main article: " + mainURL);
System.out.println("main article exists: " + new DigestURL(mainURL).exists(ClientIdentification.browserAgent));
boolean ok = mainURL.exists(ClientIdentification.browserAgent);
System.out.println("main article exists: " + ok);
if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName());
System.out.println();
} catch (IOException e) {
e.printStackTrace();
}
}
System.out.println("ok files: " + files_ok.toString());
System.out.println("not-ok files: " + files_nok.toString());
}
}
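
An assumed invocation of this test harness (class path details omitted);
args[0] is the directory holding the zim files:

    // java -cp ... net.yacy.document.importer.ZimImporter /path/to/zim/files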

@@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.text.PDFTextStripper;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
@@ -69,9 +68,6 @@ import net.yacy.kelondro.util.MemoryControl;
public class pdfParser extends AbstractParser implements Parser {
public static boolean individualPages = false;
public static String individualPagePropertyname = "page";
public pdfParser() {
super("Acrobat Portable Document Parser");
this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser {
// get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);
if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text
final int pagecount = pdfDoc.getNumberOfPages();
final String[] pages = new String[pagecount];
for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]);
}
// create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
result = new Document[Math.min(pages.length, pdflinks.size())];
final String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
result[page] = new Document(
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page),
null,
null,
false,
docDate);
}
} else {
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread("pdfParser.getText:" + location) {
@Override
public void run() {
try {
writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {}
}
};
t.start();
t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
if (t.isAlive()) t.interrupt();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); // free writer resources
}
final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,
null,
false,
docDate)};
}
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
final PDFTextStripper stripper = new PDFTextStripper();
stripper.setEndPage(Integer.MAX_VALUE);
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
writer.close(); // free writer resources
final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,
null,
false,
docDate)};
} catch (final Throwable e) {
//throw new Parser.Failure(e.getMessage(), location);
} finally {

@@ -61,13 +61,6 @@ public class ConfigParser_p {
env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime());
env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension());
}
if (post.containsKey("pdfSettings")) {
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages"));
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page"));
pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
}
}
int i = 0;
@@ -94,9 +87,6 @@ public class ConfigParser_p {
prop.put("parser", i);
prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false));
prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"));
// return rewrite properties
return prop;
}

@@ -774,7 +774,7 @@ public class Crawler_p {
}
/*
* <input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:46px" value="#[customPPMdefault]#" />PPM
* <input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:46px" value="#[customPPMdefault]#" />PPM
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH
<input type="submit" name="crawlingPerformance" value="set" />
@@ -784,19 +784,19 @@ public class Crawler_p {
if (post != null && post.containsKey("crawlingPerformance")) {
final String crawlingPerformance = post.get("crawlingPerformance", "custom");
final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1);
int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1);
try {
wantedPPM = post.getInt("customPPM", wantedPPM);
} catch (final NumberFormatException e) {}
if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000;
int wPPM = wantedPPM;
if ( wPPM <= 0 ) {
wPPM = 1;
}
if ( wPPM >= 30000 ) {
wPPM = 30000;
if ( wPPM >= 60000 ) {
wPPM = 60000;
}
final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
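
Worked through the formula above, the wanted PPM maps to the busy-sleep
between two fetches as follows; doubling the cap halves the minimum sleep:

    // newBusySleep = 60000 / wantedPPM   (milliseconds between two fetches)
    //   wantedPPM =    10  ->  busySleep = 6000 ms  (one page every 6 s)
    //   wantedPPM =  1000  ->  busySleep =   60 ms
    //   wantedPPM = 30000  ->  busySleep =    2 ms  (old maximum)
    //   wantedPPM = 60000  ->  busySleep =    1 ms  (new maximum, 1000 pages/s)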

@@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable<URIMe
public String urlstring() {
if (this.alternative_urlstring != null) return this.alternative_urlstring;
if (!pdfParser.individualPages) return this.url().toNormalform(true);
if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase(Locale.ROOT))) return this.url().toNormalform(true);
// for pdf links we rewrite the url
// this is a special treatment of pdf files which can be splitted into subpages
String pageprop = pdfParser.individualPagePropertyname;
String resultUrlstring = this.url().toNormalform(true);
int p = resultUrlstring.lastIndexOf(pageprop + "=");
if (p > 0) {
return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1);
}
return resultUrlstring;
return this.url().toNormalform(true);
}
/**
* used for search result entry

@@ -176,6 +176,7 @@ import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.JsonListImporter;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.WarcImporter;
import net.yacy.document.importer.ZimImporter;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation;
@@ -906,8 +907,6 @@ public final class Switchboard extends serverSwitch {
TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
pdfParser.individualPages = this.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = this.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
// start a loader
this.log.config("Starting Crawl Loader");
@@ -2153,6 +2152,20 @@ public final class Switchboard extends serverSwitch {
this.log.warn("IO Error processing warc file " + infile);
}
return moved;
} else if (s.endsWith(".zim")) {
try {
final ZimImporter wri = new ZimImporter(infile.getAbsolutePath());
wri.start();
try {
wri.join();
} catch (final InterruptedException ex) {
return moved;
}
moved = infile.renameTo(outfile);
} catch (final IOException ex) {
this.log.warn("IO Error processing zim file " + infile);
}
return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) {
return this.processSurrogateJson(infile, outfile);
}
@@ -2349,6 +2362,7 @@ public final class Switchboard extends serverSwitch {
if ( surrogate.endsWith(".xml")
|| surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".zim")
|| surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz")
|| surrogate.endsWith(".jsonlist")

@@ -220,8 +220,6 @@ public final class SwitchboardConstants {
public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody";
public static final String PARSER_MIME_DENY = "parser.mime.deny";
public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny";
public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages";
public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key";
/**
* <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p>
* <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p>

@@ -113,20 +113,24 @@ public class ZIMFile extends File {
}
this.mimeTypeList = mList.toArray(new String[mList.size()]);
// Initialize the Url Pointer List
this.urlPtrListBlob = new byte[this.header_entryCount * 8];
mReader.seek(this.header_urlPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
// Initialize the Title Pointer List
this.titlePtrListBlob = new byte[this.header_entryCount * 4];
mReader.seek(this.header_titlePtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
// Initialize the Cluster Pointer List
this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
mReader.seek(this.header_clusterPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
try {
// Initialize the Url Pointer List
this.urlPtrListBlob = new byte[this.header_entryCount * 8];
mReader.seek(this.header_urlPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
// Initialize the Title Pointer List
this.titlePtrListBlob = new byte[this.header_entryCount * 4];
mReader.seek(this.header_titlePtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
// Initialize the Cluster Pointer List
this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
mReader.seek(this.header_clusterPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
} catch (IndexOutOfBoundsException e) {
throw new IOException(e.getMessage());
}
}
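
The blob sizes follow directly from the ZIM header: 8 bytes per url pointer,
4 bytes per title pointer and 8 bytes per cluster pointer. With a corrupt or
truncated header the allocations or reads above can fail with a runtime
exception, which the new catch turns into a regular IOException. A defensive
variant could validate the size before allocating (a hypothetical helper, not
part of the commit):

    // hedged sketch: compute the blob size in long arithmetic and fail early,
    // instead of catching the runtime exception after the fact
    static byte[] allocBlob(long entryCount, int bytesPerEntry) throws java.io.IOException {
        long size = entryCount * bytesPerEntry;
        if (size < 0 || size > Integer.MAX_VALUE) {
            throw new java.io.IOException("pointer list too large: " + size);
        }
        return new byte[(int) size];
    }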
public final String getMimeType(int idx) {
