pull/621/head
sgaebel 1 year ago
commit d72cd7916c

@ -51,27 +51,6 @@
</tr>
</table>
</fieldset>
<fieldset><legend id="parser">PDF Parser Attributes</legend>
<p>
This is an experimental setting which makes it possible to split PDF documents into individual index entries.
Every page becomes a single index hit and the URL is artificially extended with a post/get attribute that carries
the page number. When such a URL is displayed within a search result, the post/get attribute is transformed into an anchor hash link.
This makes it possible to view the individual page directly in the pdf.js viewer built into Firefox;
for reference, see https://github.com/mozilla/pdf.js/wiki/Viewer-options
</p>
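For illustration, here is a minimal sketch of the link rewrite this setting triggers, mirroring the code that this patch removes from URIMetadataNode.urlstring() further below (URL and property name are illustrative):

    // a virtual per-page URL as produced by the pdfParser split mode
    String resultUrlstring = "https://example.org/doc.pdf?page=3";
    String pageprop = "page"; // pdfParser.individualPagePropertyname
    int p = resultUrlstring.lastIndexOf(pageprop + "=");
    if (p > 0) {
        // rewrite "?page=3" into "#page=3" so pdf.js opens the document at that page
        resultUrlstring = resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1);
    }
    // resultUrlstring is now "https://example.org/doc.pdf#page=3"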
<table border="0">
<tr class="TableCellLight">
<td class="small" align="right" width="90">Split PDF</td>
<td class="small" align="left" width="300"><input type="checkbox" name="individualPages" #(individualPages)#::checked="checked" #(/individualPages)#/></td>
</tr>
<tr class="TableCellLight">
<td class="small" align="right">Property Name</td>
<td class="small" align="left"><input type="text" name="individualPagePropertyname" value="#[individualPagePropertyname]#"/></td>
</tr>
<tr class="TableCellDark">
<td colspan="3" class="small" ><input type="submit" name="pdfSettings" value="Submit" class="btn btn-primary"/></td>
</tr>
</table>
</form>
#%env/templates/footer.template%#
</body>

@ -134,7 +134,7 @@
<tr class="TableCellLight">
<td align="left">Speed / PPM<br/>(Pages Per Minute)</td>
<td align="left" colspan="4">
<input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label>
<input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:5em" value="#[customPPMdefault]#" /><label for="customPPM"><abbr title="Pages Per Minute">PPM</abbr></label>
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:3.5em" value="#[latencyFactorDefault]#" />
<label for="latencyFactor"><abbr title="Latency Factor">LF</abbr></label>
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:3em" value="#[MaxSameHostInQueueDefault]#" />
@ -147,7 +147,7 @@
<td align="left">Crawler PPM</td>
<td align="left" width="60"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
<td align="left" width="260px" colspan="3">
<progress id="ppmbar" max="30000" value="0" style="width:94%;"/>
<progress id="ppmbar" max="60000" value="0" style="width:94%;"/>
</td>
</tr>
<tr class="TableCellLight">

@ -9,11 +9,10 @@
<body id="IndexControl">
#%env/templates/header.template%#
#%env/templates/submenuIndexImport.template%#
<h2>Index Export</h2>
<p>The local index currently contains #[ucount]# documents, of which only #[ucount200]# are exportable with status code 200; the remaining are error documents.</p>
#(lurlexport)#::
<form action="IndexExport_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Loaded URL Export</legend>
@ -22,31 +21,65 @@
<dd><input type="text" name="exportfilepath" value="#[exportfilepath]#" size="120" maxlength="250" />
</dd>
<dt class="TableCellDark">URL Filter</dt>
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />&nbsp;.*.* (default) is a catch-all; format: java regex
</dd>
<dt class="TableCellDark">query</dt>
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />&nbsp;*:* (default) is a catch-all; format: <field-name>:<solr-pattern>
</dd>
<dt class="TableCellDark">maximum age (seconds, -1 = unlimited)</dt>
<dd><input type="text" name="exportmaxseconds" value="-1" size="20" maxlength="250" />
<dt class="TableCellDark">maximum age (seconds)</dt>
<dd><input type="text" name="exportmaxseconds" value="-1" size="20" maxlength="250" />&nbsp;-1 = unlimited -> no document is too old
</dd>
<dt class="TableCellDark">maximum number of records per chunk</dt>
<dd><input type="text" name="maxchunksize" value="-1" size="20" maxlength="250" />&nbsp;if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)
</dd>
<dt class="TableCellDark">Export Size</dt>
<dd>
full size, all fields:<input type="radio" name="minified" value="no" checked="checked">&nbsp;
minified; only fields sku, date, title, description, text_t<input type="radio" name="minified" value="yes" >
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd>
<dl>
<dt>Full Data Records:</dt>
<dd><input type="radio" name="format" value="full-solr" /> XML (Rich and full-text Solr data, one document per line in one large xml file, can be processed with shell tools, can be imported with DATA/SURROGATE/in/)<br />
<input type="radio" name="format" value="full-elasticsearch" checked="checked" /> JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file, can be bulk-imported to elasticsearch with the command "curl -XPOST localhost:9200/collection1/yacy/_bulk --data-binary @yacy_dump_XXX.flatjson")<br />
<input type="radio" name="format" value="full-rss" /> XML (RSS)</dd>
<dd><input type="radio" name="format" value="full-elasticsearch" checked="checked" />
JSON (Rich and full-text Elasticsearch data, one document per line in one flat JSON file,
can be bulk-imported into Elasticsearch or OpenSearch.) Here is an example for OpenSearch, using Docker:<br />
Start a Docker container of OpenSearch:<br />
<code>docker run --name opensearch -p 9200:9200 -d -e OPENSEARCH_JAVA_OPTS="-Xms2G -Xmx2G" -e discovery.type=single-node -e DISABLE_SECURITY_PLUGIN=true -v $(pwd)/opensearch_data:/usr/share/opensearch/data opensearchproject/opensearch:latest</code><br />
Unblock index creation:<br />
<code>curl -X PUT "http://localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d'
{
"persistent": {
"cluster.blocks.create_index": null
}
}'</code><br />
Create the search index:<br />
<code>curl -X PUT "http://localhost:9200/collection1/yacy"</code><br />
Bulk-upload the index file:<br />
<code>curl -XPOST "http://localhost:9200/collection1/yacy/_bulk?filter_path=took,errors" -H "Content-Type: application/x-ndjson" --data-binary @yacy_dump_XXX.flatjson</code><br />
Run a search that returns 10 results, searching the fields text_t, title, description with boosts:<br />
<code>curl -X POST "http://localhost:9200/collection1/yacy/_search" -H 'Content-Type: application/json' -d'
{"size": 10, "query": {"multi_match": {
"query": "one two three",
"fields": ["text_t", "title^10", "description^3"], "fuzziness": "AUTO"
}}}'</code><br />
<input type="radio" name="format" value="full-solr" />
XML (Rich and full-text Solr data, one document per line in one large XML file,
can be processed with shell tools, can be imported with DATA/SURROGATES/in/)
<br />
<input type="radio" name="format" value="full-rss" />
XML (RSS)
</dd>
<dt>Full URL List:</dt>
<dd><input type="radio" name="format" value="url-text" /> Plain Text List (URLs only)<br />
<input type="radio" name="format" value="url-html" /> HTML (URLs with title)</dd>
<dt>Only Domain:</dt>
<dd><input type="radio" name="format" value="dom-text" /> Plain Text List (domains only)<br />
<input type="radio" name="format" value="dom-html" /> HTML (domains as URLs, no title)</dd>
<dt>Only Text:</dt>
<dt>Only Text:</dt>
<dd><input type="radio" name="format" value="text-text" /> Fulltext of Search Index Text</dd>
</dl>
</dd>
</dl>
</dd>
<dt>&nbsp;</dt>
<dd><input type="submit" name="lurlexport" value="Export" class="btn btn-primary" style="width:240px;"/>
</dd>
@ -55,16 +88,16 @@
</form>::
<div class="alert alert-info" style="text-decoration:blink">Export to file #[exportfile]# is running .. #[urlcount]# Documents so far</div>::
#(/lurlexport)#
#(lurlexportfinished)#::
#(lurlexportfinished)#::
<div class="alert alert-success">Finished export of #[urlcount]# Documents to file <a href="file://#[exportfile]#" target="_">#[exportfile]#</a><br/>
<em>Import this file by moving it to DATA/SURROGATES/in</em></div>::
#(/lurlexportfinished)#
#(lurlexporterror)#::
<div class="alert alert-warning">Export to file #[exportfile]# failed: #[exportfailmsg]#</div>::
#(/lurlexporterror)#
#(dumprestore)#::
<form action="IndexExport_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Dump and Restore of Solr Index</legend>

@ -14,7 +14,8 @@
<dependency org="com.drewnoakes" name="metadata-extractor" rev="2.18.0" />
<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.13.5"/>
<dependency org="com.github.ben-manes.caffeine" name="caffeine" rev="3.1.8"/>
<dependency org="com.google.guava" name="guava" rev="28.0-jre" conf="compile->master"/>
<dependency org="com.google.guava" name="guava" rev="32.1.3-jre" conf="compile->master"/>
<dependency org="com.google.guava" name="failureaccess" rev="1.0.2" />
<dependency org="com.ibm.icu" name="icu4j" rev="73.2"/>
<dependency org="com.jcraft" name="jsch" rev="0.1.55" />
<dependency org="com.twelvemonkeys.imageio" name="imageio-core" rev="3.9.4"/>
@ -28,6 +29,7 @@
<dependency org="io.opentracing" name="opentracing-noop" rev="0.33.0"/>
<dependency org="io.opentracing" name="opentracing-util" rev="0.33.0"/>
<dependency org="javax.servlet" name="javax.servlet-api" rev="3.1.0"/>
<dependency org="javainetlocator" name="inetaddresslocator" rev="2.18" />
<dependency org="jcifs" name="jcifs" rev="1.3.17" conf="compile->master" />
<dependency org="net.arnx" name="jsonic" rev="1.3.10"/>
<dependency org="net.jthink" name="jaudiotagger" rev="2.2.5"/>

@ -2578,6 +2578,36 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return null;
}
public boolean exists(final ClientIdentification.Agent agent) {
try {
if (isFile()) {
return getFSFile().exists();
}
if (isSMB()) {
return getSmbFile().exists();
}
if (isFTP()) {
final FTPClient client = new FTPClient();
client.open(this.host, this.port < 0 ? 21 : this.port);
return client.fileSize(path) > 0;
}
if (isHTTP() || isHTTPS()) {
final HTTPClient client = new HTTPClient(agent);
client.setHost(getHost());
org.apache.http.HttpResponse response = client.HEADResponse(this, true);
client.close();
if (response == null) return false;
int status = response.getStatusLine().getStatusCode();
return status == 200 || status == 301 || status == 302;
}
return false;
} catch (IOException e) {
if (e.getMessage() != null && e.getMessage().contains("Circular redirect to")) return true; // exception to the rule: a circular redirect means the target answered with a 302, i.e. the resource exists
//e.printStackTrace();
return false;
}
}
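A minimal usage sketch of the new exists() check (the URL is illustrative; ClientIdentification.browserAgent is the agent also used by the ZIM importer below; the MultiProtocolURL(String) constructor may throw MalformedURLException):

    // probe a URL before importing/indexing it; true for existing file/SMB/FTP resources or HTTP status 200/301/302
    MultiProtocolURL u = new MultiProtocolURL("https://example.org/index.html");
    boolean reachable = u.exists(ClientIdentification.browserAgent);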
/**
* Read fully the source, close it and return its content as a bytes array.
* @param source the source to read

@ -25,11 +25,22 @@ package net.yacy.document.importer;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
@ -80,44 +91,80 @@ public class ZimImporter extends Thread implements Importer {
public void run() {
job = this;
this.startTime = System.currentTimeMillis();
Switchboard sb = Switchboard.getSwitchboard();
try {
this.reader = new ZIMReader(this.file);
this.guessedSource = getSource(this.reader);
Date guessedDate = getDate(this.reader);
String dates = HeaderFramework.newRfc1123Format().format(guessedDate);
// verify the source
DirectoryEntry mainEntry = this.reader.getMainDirectoryEntry();
DigestURL mainURL = guessURL(this.guessedSource, mainEntry);
if (!mainURL.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed main url existence test: " + mainURL);
return;
}
// read all documents
for (int i = 0; i < this.file.header_entryCount; i++) {
if (this.abort) break;
DirectoryEntry de = this.reader.getDirectoryInfo(i);
if (!(de instanceof ZIMReader.ArticleEntry)) continue;
ArticleEntry ae = (ArticleEntry) de;
// check url
String guessedUrl = guessURL(this.guessedSource, de);
assert guessedUrl.startsWith("http");
// check availability of text parser
String mimeType = ae.getMimeType();
if (TextParser.supportsMime(mimeType) != null) continue;
// read the content
byte[] b = this.reader.getArticleData(ae);
// create artificial request and response headers for the indexer
RequestHeader requestHeader = new RequestHeader();
ResponseHeader responseHeader = new ResponseHeader(200);
final Request request = new Request(new DigestURL(guessedUrl), null);
final Response response = new Response(
request,
requestHeader,
responseHeader,
Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
false,
b
);
// throw this to the indexer
String error = Switchboard.getSwitchboard().toIndexer(response);
if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
this.recordCnt++;
try {
if (this.abort) break;
DirectoryEntry de = this.reader.getDirectoryInfo(i);
if (!(de instanceof ZIMReader.ArticleEntry)) continue;
ArticleEntry ae = (ArticleEntry) de;
if (ae.namespace != 'C' && ae.namespace != 'A') continue;
// check url
DigestURL guessedUrl = guessURL(this.guessedSource, de);
if (recordCnt < 10) {
// critical test for the first 10 urls
if (!guessedUrl.exists(ClientIdentification.browserAgent)) {
sb.log.info("zim importer: file " + this.file.getName() + " failed url " + recordCnt + " existence test: " + guessedUrl);
return;
}
}
// check availability of text parser
String mimeType = ae.getMimeType();
if (!mimeType.startsWith("text/") && !mimeType.equals("application/epub+zip")) continue; // in this import we want only text, not everything that is possible
if (TextParser.supportsMime(mimeType) != null) continue; // supportsMime returns an error string when the MIME type is not supported, null when a parser exists
// read the content
byte[] b = this.reader.getArticleData(ae);
// create artificial request and response headers for the indexer
RequestHeader requestHeader = new RequestHeader();
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.CONTENT_TYPE, de.getMimeType()); // very important to tell parser which kind of content
responseHeader.put(HeaderFramework.LAST_MODIFIED, dates); // put in the guessed date to have something that is not the current date
final Request request = new Request(
ASCII.getBytes(sb.peers.mySeed().hash),
guessedUrl,
null, // referrerhash the hash of the referrer URL
de.title, // name the name of the document to crawl
null, // appdate the time when the url was first time appeared
sb.crawler.defaultSurrogateProfile.handle(), // profileHandle the name of the prefetch profile. This must not be null!
0, // depth the crawling depth of the entry
sb.crawler.defaultSurrogateProfile.timezoneOffset() // timezone offset
);
final Response response = new Response(
request,
requestHeader,
responseHeader,
Switchboard.getSwitchboard().crawler.defaultSurrogateProfile,
false,
b
);
// throw this to the indexer
String error = sb.toIndexer(response);
if (error != null) ConcurrentLog.info("ZimImporter", "error parsing: " + error);
this.recordCnt++;
} catch (Exception e) {
// catch any error that could stop the importer
ConcurrentLog.info("ZimImporter", "error loading: " + e.getMessage());
}
}
} catch (IOException e) {
ConcurrentLog.info("ZimImporter", "error reading: " + e.getMessage());
@ -195,8 +242,6 @@ public class ZimImporter extends Thread implements Importer {
return "fas.org";
case "fonts":
return "fonts.google.com";
case "gutenberg":
return "gutenberg.org";
case "ifixit":
return "ifixit.com";
case "lesfondamentaux":
@ -216,11 +261,23 @@ public class ZimImporter extends Thread implements Importer {
case "rapsberry_pi_docs":
return "raspberrypi.org";
case "ted":
return "ted.com";
return "www.ted.com/search?q=";
case "vikidia":
return "vikidia.org";
return parts[1] + ".vikidia.org/wiki";
case "westeros":
return "westeros.org";
case "wikihow":
return parts[1].equals("en") ? "wikihow.com" : parts[1] + ".wikihow.com";
case "wikisource":
return parts[1] + ".wikisource.org/wiki";
case "wikiversity":
return parts[1] + ".wikiversity.org/wiki";
case "wikivoyage":
return parts[1] + ".wikivoyage.org/wiki";
case "wiktionary":
return parts[1] + ".wiktionary.org/wiki";
case "wikiquote":
return parts[1] + ".wikiquote.org/wiki";
case "wikibooks":
return parts[1] + ".wikibooks.org/wiki";
case "wikinews":
@ -266,16 +323,174 @@ public class ZimImporter extends Thread implements Importer {
return source;
}
public static String guessURL(String guessedSource, DirectoryEntry de) {
public static Date getDate(ZIMReader r) throws IOException {
String date = r.getMetadata("Date");
if (date != null) try {
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
return format.parse(date);
} catch (ParseException e) {}
// failover situation: use file date
return new Date(r.getZIMFile().lastModified());
}
public static DigestURL guessURL(String guessedSource, DirectoryEntry de) throws MalformedURLException {
String url = de.url;
if (url.equals("Main_Page")) url = "";
if (guessedSource != null) return guessedSource + url;
if (url.startsWith("A/")) return "https://" + url.substring(2);
if (url.startsWith("H/")) return "https://" + url.substring(2);
return guessedSource + url;
if (url.startsWith("A/")) return new DigestURL("https://" + url.substring(2));
if (url.startsWith("H/")) return new DigestURL("https://" + url.substring(2));
if (guessedSource != null) return new DigestURL(guessedSource + url);
return new DigestURL(guessedSource + url);
}
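A hedged worked example of the guessing chain above (file name and article URL are illustrative; the metadata inside a given ZIM file may differ):

    // file "wiktionary_de_all_nopic_2023-10.zim": guessDomainName() -> "de.wiktionary.org/wiki"
    // getSource() then typically yields "https://de.wiktionary.org/wiki/"
    // an article entry with de.url = "Haus" gives guessURL(source, de) -> new DigestURL("https://de.wiktionary.org/wiki/Haus")
    // getDate() prefers the "Date" metadata entry (yyyy-MM-dd) and falls back to the ZIM file's last-modified date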
private final static String[] skip_files = {
"iota.stackexchange.com_en_all_2023-05.zim",
"stellar.stackexchange.com_en_all_2023-10.zim",
"vegetarianism.stackexchange.com_en_all_2023-05.zim",
"esperanto.stackexchange.com_eo_all_2023-10.zim",
"tezos.stackexchange.com_en_all_2023-10.zim",
"eosio.stackexchange.com_en_all_2023-10.zim",
"ebooks.stackexchange.com_en_all_2023-10.zim",
"poker.stackexchange.com_en_all_2023-05.zim",
"cseducators.stackexchange.com_en_all_2023-10.zim",
"iot.stackexchange.com_en_all_2023-05.zim",
"portuguese.stackexchange.com_pt_all_2023-04.zim",
"portuguese.stackexchange.com_pt_all_2023-10.zim",
"italian.stackexchange.com_it_all_2023-05.zim",
"monero.stackexchange.com_en_all_2022-11.zim",
"sustainability.stackexchange.com_en_all_2023-05.zim",
"westeros_en_all_nopic_2021-03.zim",
"opensource.stackexchange.com_en_all_2023-10.zim",
"tor.stackexchange.com_en_all_2023-05.zim",
"devops.stackexchange.com_en_all_2023-10.zim",
"patents.stackexchange.com_en_all_2023-10.zim",
"stackapps.com_en_all_2023-05.zim",
"hardwarerecs.stackexchange.com_en_all_2023-05.zim",
"hsm.stackexchange.com_en_all_2023-05.zim",
"expatriates.stackexchange.com_en_all_2023-11.zim",
"opendata.stackexchange.com_en_all_2023-10.zim",
"sports.stackexchange.com_en_all_2023-05.zim",
"wikinews_de_all_nopic_2023-10.zim",
"computergraphics.stackexchange.com_en_all_2023-10.zim",
"tridion.stackexchange.com_en_all_2023-10.zim",
"bioinformatics.stackexchange.com_en_all_2023-10.zim",
"expressionengine.stackexchange.com_en_all_2023-11.zim",
"elementaryos.stackexchange.com_en_all_2023-10.zim",
"cstheory.stackexchange.com_en_all_2023-10.zim",
"chess.stackexchange.com_en_all_2023-05.zim",
"vi.stackexchange.com_en_all_2023-05.zim",
"fitness.stackexchange.com_en_all_2023-10.zim",
"pets.stackexchange.com_en_all_2023-05.zim",
"french.stackexchange.com_fr_all_2023-10.zim",
"sqa.stackexchange.com_en_all_2023-05.zim",
"islam.stackexchange.com_en_all_2023-05.zim",
"scicomp.stackexchange.com_en_all_2023-05.zim",
"wikinews_en_all_nopic_2023-09.zim",
"ai.stackexchange.com_en_all_2023-10.zim",
"boardgames.stackexchange.com_en_all_2023-05.zim",
"economics.stackexchange.com_en_all_2023-05.zim",
"3dprinting.stackexchange.com_en_all_2023-07.zim",
"earthscience.stackexchange.com_en_all_2023-05.zim",
"emacs.stackexchange.com_en_all_2023-10.zim",
"bitcoin.stackexchange.com_en_all_2023-05.zim",
"philosophy.stackexchange.com_en_all_2023-05.zim",
"law.stackexchange.com_en_all_2023-05.zim",
"astronomy.stackexchange.com_en_all_2023-05.zim",
"artofproblemsolving_en_all_nopic_2021-03.zim",
"engineering.stackexchange.com_en_all_2023-05.zim",
"ja.stackoverflow.com_ja_all_2023-06.zim",
"webmasters.stackexchange.com_en_all_2023-05.zim",
"anime.stackexchange.com_en_all_2023-10.zim",
"cooking.stackexchange.com_en_all_2023-05.zim",
"arduino.stackexchange.com_en_all_2023-05.zim",
"money.stackexchange.com_en_all_2023-05.zim",
"judaism.stackexchange.com_en_all_2023-05.zim",
"ethereum.stackexchange.com_en_all_2023-05.zim",
"datascience.stackexchange.com_en_all_2023-10.zim",
"academia.stackexchange.com_en_all_2023-10.zim",
"music.stackexchange.com_en_all_2023-05.zim",
"cs.stackexchange.com_en_all_2023-03.zim",
"dsp.stackexchange.com_en_all_2023-05.zim",
"biology.stackexchange.com_en_all_2023-05.zim",
"android.stackexchange.com_en_all_2023-10.zim",
"bicycles.stackexchange.com_en_all_2023-05.zim",
"puzzling.stackexchange.com_en_all_2023-05.zim",
"photo.stackexchange.com_en_all_2023-05.zim",
"aviation.stackexchange.com_en_all_2023-05.zim",
"drupal.stackexchange.com_en_all_2023-05.zim",
"ux.stackexchange.com_en_all_2023-05.zim",
"ell.stackexchange.com_en_all_2023-10.zim",
"openstreetmap-wiki_en_all_nopic_2023-05.zim",
"softwareengineering.stackexchange.com_en_all_2023-05.zim",
"gaming.stackexchange.com_en_all_2023-10.zim",
"mathematica.stackexchange.com_en_all_2023-10.zim",
"pt.stackoverflow.com_pt_all_2023-06.zim",
"apple.stackexchange.com_en_all_2023-05.zim",
"diy.stackexchange.com_en_all_2023-08.zim",
"es.stackoverflow.com_es_all_2023-06.zim",
"gis.stackexchange.com_en_all_2023-05.zim",
"stats.stackexchange.com_en_all_2023-05.zim",
"physics.stackexchange.com_en_all_2023-05.zim",
"serverfault.com_en_all_2023-05.zim",
"electronics.stackexchange.com_en_all_2023-05.zim",
"tex.stackexchange.com_en_all_2023-05.zim",
"wikibooks_de_all_nopic_2021-03.zim",
"askubuntu.com_en_all_2023-05.zim",
"superuser.com_en_all_2023-05.zim",
"lesfondamentaux.reseau-canope.fr_fr_all_2022-11.zim",
"wikibooks_en_all_nopic_2021-03.zim",
"courses.lumenlearning.com_en_all_2021-03.zim",
"wikipedia_de_all_nopic_2023-10.zim",
"wikipedia_en_all_nopic_2023-10.zim",
"stackoverflow.com_en_all_nopic_2022-07.zim",
"stackoverflow.com_en_all_2023-05.zim",
"armypubs_en_all_2023-08.zim",
"vikidia_en_all_nopic_2023-09.zim",
"wikiquote_de_all_nopic_2023-10.zim",
"wikiquote_en_all_nopic_2023-09.zim",
"wiktionary_de_all_nopic_2023-10.zim",
"wiktionary_en_all_nopic_2023-10.zim",
"wikihow_de_maxi_2023-10.zim",
"wikivoyage_de_all_nopic_2023-09.zim",
"wikiversity_de_all_nopic_2021-03.zim",
"wikiversity_en_all_nopic_2021-03.zim",
"wikisource_de_all_nopic_2023-09.zim",
"wikisource_en_all_nopic_2023-08.zim",
"ted_countdown_global_2023-09.zim",
"ted_en_design_2023-09.zim",
"ted_en_business_2023-09.zim",
"ted_en_global_issues_2023-09.zim",
"opentextbooks_en_all_2023-08.zim",
"bestedlessons.org_en_all_2023-08.zim",
"wikivoyage_en_all_nopic_2023-10.zim",
"based.cooking_en_all_2023-10.zim",
"wordnet_en_all_2023-04.zim",
"internet-encyclopedia-philosophy_en_all_2023-08.zim",
"100r-off-the-grid_en_2023-09.zim",
"coopmaths_2023-04.zim",
"birds-of-ladakh_en_all_2023-02.zim",
"storyweaver.org_en_2023-09.zim",
"developer.mozilla.org_en_all_2023-02.zim",
"www.ready.gov_es_2023-06.zim",
"teoria.com_en_2023-08.zim",
"theworldfactbook_en_all_2023-06.zim",
"mutopiaproject.org_en_2023-08.zim",
"dp.la_en_all_2023-08.zim",
// 302
"moderators.stackexchange.com_en_all_2023-05.zim",
"beer.stackexchange.com_en_all_2023-05.zim",
"health.stackexchange.com_en_all_2023-05.zim",
"avp.stackexchange.com_en_all_2023-05.zim",
"lowtechmagazine.com_en_all_2023-08.zim",
"ifixit_de_all_2023-07.zim",
"ifixit_en_all_2023-10.zim",
"der-postillon.com_de_all_2020-12.zim",
"wikihow_en_maxi_2023-03.zim",
};
public static void main(String[] args) {
Set<String> skip = new HashSet<>();
for (String s: skip_files) skip.add(s);
// zim file import test
// will test mostly if domain names are included in zim file urls
String zimFilesPath = args[0];
@ -291,7 +506,10 @@ public class ZimImporter extends Thread implements Importer {
}
Collection<File> orderedFiles = orderedFileMap.values();
Set<String> files_ok = new LinkedHashSet<>();
Set<String> files_nok = new LinkedHashSet<>();
for (File f: orderedFiles) {
if (skip.contains(f.getName())) continue;
try {
ZIMFile z = new ZIMFile(f.getAbsolutePath());
ZIMReader r = new ZIMReader(z);
@ -301,14 +519,21 @@ public class ZimImporter extends Thread implements Importer {
System.out.println("Namespace: " + de.namespace);
System.out.println("Title: " + de.title);
System.out.println("URL: " + de.url);
System.out.println("guessed domain: " + guessDomainName(f.getName()));
System.out.println("Mime Type " + de.getMimeType());
System.out.println("guessed domain: " + guessDomainName(f.getName())); // uses a table and rules that deduces a source from the file name
String source = getSource(r);
System.out.println("guessed Source: " + source);
System.out.println("guessed main article: " + guessURL(source, de));
System.out.println("guessed Source: " + source); // this uses metadata stored in the zim file
DigestURL mainURL = guessURL(source, de);
System.out.println("guessed main article: " + mainURL);
boolean ok = mainURL.exists(ClientIdentification.browserAgent);
System.out.println("main article exists: " + ok);
if (ok) files_ok.add(f.getName()); else files_nok.add(f.getName());
System.out.println();
} catch (IOException e) {
e.printStackTrace();
}
}
System.out.println("ok files: " + files_ok.toString());
System.out.println("not-ok files: " + files_nok.toString());
}
}

@ -53,7 +53,6 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.text.PDFTextStripper;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
@ -69,9 +68,6 @@ import net.yacy.kelondro.util.MemoryControl;
public class pdfParser extends AbstractParser implements Parser {
public static boolean individualPages = false;
public static String individualPagePropertyname = "page";
public pdfParser() {
super("Acrobat Portable Document Parser");
this.SUPPORTED_EXTENSIONS.add("pdf");
@ -149,98 +145,36 @@ public class pdfParser extends AbstractParser implements Parser {
// get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);
if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text
final int pagecount = pdfDoc.getNumberOfPages();
final String[] pages = new String[pagecount];
for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]);
}
// create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
result = new Document[Math.min(pages.length, pdflinks.size())];
final String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
result[page] = new Document(
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
pages == null || page > pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
pdflinks == null || page >= pdflinks.size() ? null : pdflinks.get(page),
null,
null,
false,
docDate);
}
} else {
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final PDDocument pdfDocC = pdfDoc;
final Thread t = new Thread("pdfParser.getText:" + location) {
@Override
public void run() {
try {
writer.append(stripper.getText(pdfDocC));
} catch (final Throwable e) {}
}
};
t.start();
t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
if (t.isAlive()) t.interrupt();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); // free writer resources
}
final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,
null,
false,
docDate)};
}
// collect the whole text at once
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
final PDFTextStripper stripper = new PDFTextStripper();
stripper.setEndPage(Integer.MAX_VALUE);
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
writer.close(); // free writer resources
final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
docKeywords,
singleList(docTitle),
docAuthor,
docPublisher,
null,
null,
0.0d, 0.0d,
contentBytes,
pdflinksCombined,
null,
null,
false,
docDate)};
} catch (final Throwable e) {
//throw new Parser.Failure(e.getMessage(), location);
} finally {

@ -49,7 +49,6 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.ByteCountInputStream;
import org.w3c.dom.CharacterData;
import org.w3c.dom.Element;
@ -116,7 +115,8 @@ public class sitemapParser extends AbstractParser implements Parser {
ConcurrentLog.info("SitemapReader", "loading sitemap from " + sitemapURL.toNormalform(true));
// client.setHeader(requestHeader.entrySet());
try (final HTTPClient client = new HTTPClient(agent)) {
client.GET(sitemapURL.toNormalform(false), false);
String url = sitemapURL.toNormalform(false);
client.GET(url, false);
if (client.getStatusCode() != 200) {
throw new IOException("Unable to download the sitemap file " + sitemapURL +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());
@ -128,11 +128,10 @@ public class sitemapParser extends AbstractParser implements Parser {
final String contentMimeType = header.mime();
InputStream contentStream = client.getContentstream();
if (contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) {
if ((contentMimeType != null && (contentMimeType.equals("application/x-gzip") || contentMimeType.equals("application/gzip"))) || url.endsWith(".gz")) {
contentStream = new GZIPInputStream(contentStream);
}
final ByteCountInputStream counterStream = new ByteCountInputStream(contentStream, null);
return new SitemapReader(counterStream, agent);
return new SitemapReader(contentStream, agent);
} catch (final IOException e) {
throw e;
}

@ -61,13 +61,6 @@ public class ConfigParser_p {
env.setConfig(SwitchboardConstants.PARSER_MIME_DENY, TextParser.getDenyMime());
env.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, TextParser.getDenyExtension());
}
if (post.containsKey("pdfSettings")) {
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, post.getBoolean("individualPages"));
env.setConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, post.get("individualPagePropertyname", "page"));
pdfParser.individualPages = sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
}
}
int i = 0;
@ -94,9 +87,6 @@ public class ConfigParser_p {
prop.put("parser", i);
prop.put("individualPages", sb.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false));
prop.put("individualPagePropertyname", sb.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page"));
// return rewrite properties
return prop;
}

@ -774,7 +774,7 @@ public class Crawler_p {
}
/*
* <input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:46px" value="#[customPPMdefault]#" />PPM
* <input id="customPPM" name="customPPM" type="number" min="10" max="60000" style="width:46px" value="#[customPPMdefault]#" />PPM
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH
<input type="submit" name="crawlingPerformance" value="set" />
@ -784,19 +784,19 @@ public class Crawler_p {
if (post != null && post.containsKey("crawlingPerformance")) {
final String crawlingPerformance = post.get("crawlingPerformance", "custom");
final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1);
int wantedPPM = (LCbusySleep1 == 0) ? 60000 : (int) (60000L / LCbusySleep1);
try {
wantedPPM = post.getInt("customPPM", wantedPPM);
} catch (final NumberFormatException e) {}
if ("minimum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 30000;
if ("maximum".equals(crawlingPerformance.toLowerCase(Locale.ROOT))) wantedPPM = 60000;
int wPPM = wantedPPM;
if ( wPPM <= 0 ) {
wPPM = 1;
}
if ( wPPM >= 30000 ) {
wPPM = 30000;
if ( wPPM >= 60000 ) {
wPPM = 60000;
}
final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
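A quick sanity check of the new 60000 PPM bound; these values follow directly from the formula above (wPPM is clamped to the range 1..60000):

    // newBusySleep = 60000 / wPPM
    //    10 PPM -> 6000 ms busy-sleep
    //  1000 PPM ->   60 ms
    // 60000 PPM ->    1 ms (new maximum; the former 30000 PPM cap corresponded to 2 ms)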

@ -64,8 +64,8 @@ public class IndexExport_p {
prop.put("lurlexport", 0);
prop.put("reload", 0);
prop.put("dumprestore", 1);
prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT));
prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT));
List<File> dumpFiles = segment.fulltext().dumpFiles();
prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath());
prop.put("dumprestore_optimizemax", 10);
@ -80,7 +80,7 @@ public class IndexExport_p {
prop.put("lurlexportfinished", 0);
prop.put("lurlexporterror", 0);
prop.put("lurlexport_exportfile", export.file().toString());
prop.put("lurlexport_urlcount", export.count());
prop.put("lurlexport_urlcount", export.docCount());
prop.put("reload", 1);
} else {
prop.put("lurlexport", 1);
@ -93,7 +93,7 @@ public class IndexExport_p {
// an export was running but has finished
prop.put("lurlexportfinished", 1);
prop.put("lurlexportfinished_exportfile", export.file().toString());
prop.put("lurlexportfinished_urlcount", export.count());
prop.put("lurlexportfinished_urlcount", export.docCount());
if (export.failed() == null) {
prop.put("lurlexporterror", 0);
} else {
@ -123,14 +123,17 @@ public class IndexExport_p {
final String filter = post.get("exportfilter", ".*");
final String query = post.get("exportquery", "*:*");
final int maxseconds = post.getInt("exportmaxseconds", -1);
long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE);
if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE;
final String path = post.get("exportfilepath", "");
final boolean minified = post.get("minified", "no").equals("yes");
// store this call as api call: we do this even if there is a chance that it fails because recurring calls may not fail
if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds);
// start the export
try {
export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text);
export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize, minified);
} catch (final IOException e) {
prop.put("lurlexporterror", 1);
prop.put("lurlexporterror_exportfile", "-no export-");
@ -140,7 +143,7 @@ public class IndexExport_p {
// show result
prop.put("lurlexport_exportfile", export.file().toString());
prop.put("lurlexport_urlcount", export.count());
prop.put("lurlexport_urlcount", export.docCount());
if ((export != null) && (export.failed() == null)) {
prop.put("lurlexport", 2);
}
@ -148,34 +151,34 @@ public class IndexExport_p {
}
if (post.containsKey("indexdump")) {
try {
final File dump = segment.fulltext().dumpEmbeddedSolr();
prop.put("indexdump", 1);
prop.put("indexdump_dumpfile", dump.getAbsolutePath());
dumpFiles = segment.fulltext().dumpFiles();
prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath());
// sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation");
} catch(final SolrException e) {
if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) {
prop.put("indexdump", 2);
} else {
prop.put("indexdump", 3);
}
}
try {
final File dump = segment.fulltext().dumpEmbeddedSolr();
prop.put("indexdump", 1);
prop.put("indexdump_dumpfile", dump.getAbsolutePath());
dumpFiles = segment.fulltext().dumpFiles();
prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath());
// sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation");
} catch(final SolrException e) {
if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) {
prop.put("indexdump", 2);
} else {
prop.put("indexdump", 3);
}
}
}
if (post.containsKey("indexrestore")) {
try {
final File dump = new File(post.get("dumpfile", ""));
segment.fulltext().restoreEmbeddedSolr(dump);
prop.put("indexRestore", 1);
} catch(final SolrException e) {
if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) {
prop.put("indexRestore", 2);
} else {
prop.put("indexRestore", 3);
}
}
try {
final File dump = new File(post.get("dumpfile", ""));
segment.fulltext().restoreEmbeddedSolr(dump);
prop.put("indexRestore", 1);
} catch(final SolrException e) {
if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) {
prop.put("indexRestore", 2);
} else {
prop.put("indexRestore", 3);
}
}
}
// insert constants

@ -981,17 +981,7 @@ public class URIMetadataNode extends SolrDocument /* implements Comparable<URIMe
public String urlstring() {
if (this.alternative_urlstring != null) return this.alternative_urlstring;
if (!pdfParser.individualPages) return this.url().toNormalform(true);
if (!"pdf".equals(MultiProtocolURL.getFileExtension(this.url().getFileName()).toLowerCase(Locale.ROOT))) return this.url().toNormalform(true);
// for pdf links we rewrite the url
// this is a special treatment of pdf files which can be splitted into subpages
String pageprop = pdfParser.individualPagePropertyname;
String resultUrlstring = this.url().toNormalform(true);
int p = resultUrlstring.lastIndexOf(pageprop + "=");
if (p > 0) {
return resultUrlstring.substring(0, p - 1) + "#page=" + resultUrlstring.substring(p + pageprop.length() + 1);
}
return resultUrlstring;
return this.url().toNormalform(true);
}
/**
* used for search result entry

@ -176,6 +176,7 @@ import net.yacy.document.content.SurrogateReader;
import net.yacy.document.importer.JsonListImporter;
import net.yacy.document.importer.OAIListFriendsLoader;
import net.yacy.document.importer.WarcImporter;
import net.yacy.document.importer.ZimImporter;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.html.Evaluation;
@ -906,8 +907,6 @@ public final class Switchboard extends serverSwitch {
TextParser.setDenyMime(this.getConfig(SwitchboardConstants.PARSER_MIME_DENY, ""));
TextParser.setDenyExtension(this.getConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, ""));
pdfParser.individualPages = this.getConfigBool(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES, false);
pdfParser.individualPagePropertyname = this.getConfig(SwitchboardConstants.PARSER_PDF_INDIVIDUALPAGES_KEY, "page");
// start a loader
this.log.config("Starting Crawl Loader");
@ -2153,7 +2152,24 @@ public final class Switchboard extends serverSwitch {
this.log.warn("IO Error processing warc file " + infile);
}
return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) {
} else if (s.endsWith(".zim")) {
try {
final ZimImporter wri = new ZimImporter(infile.getAbsolutePath());
wri.start();
try {
wri.join();
} catch (final InterruptedException ex) {
return moved;
}
moved = infile.renameTo(outfile);
} catch (final IOException ex) {
this.log.warn("IO Error processing zim file " + infile);
}
return moved;
} else if (
s.endsWith(".jsonl") || s.endsWith(".jsonl.gz") ||
s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") ||
s.endsWith(".flatjson") || s.endsWith(".flatjson.gz")) {
return this.processSurrogateJson(infile, outfile);
}
InputStream is = null;
@ -2349,6 +2365,7 @@ public final class Switchboard extends serverSwitch {
if ( surrogate.endsWith(".xml")
|| surrogate.endsWith(".xml.gz")
|| surrogate.endsWith(".xml.zip")
|| surrogate.endsWith(".zim")
|| surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz")
|| surrogate.endsWith(".jsonlist")

@ -220,8 +220,6 @@ public final class SwitchboardConstants {
public static final String INDEX_TRANSFER_GZIP_BODY = "indexTransfer.gzipBody";
public static final String PARSER_MIME_DENY = "parser.mime.deny";
public static final String PARSER_EXTENSIONS_DENY = "parser.extensions.deny";
public static final String PARSER_PDF_INDIVIDUALPAGES = "parser.pdf.individualpages";
public static final String PARSER_PDF_INDIVIDUALPAGES_KEY = "parser.pdf.individualpages.key";
/**
* <p><code>public static final String <strong>PROXY_ONLINE_CAUTION_DELAY</strong> = "onlineCautionDelay"</code></p>
* <p>Name of the setting how long indexing should pause after the last time the proxy was used in milliseconds</p>

@ -34,8 +34,10 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
@ -118,7 +120,7 @@ public final class Fulltext {
this.writeWebgraph = false;
}
public void setUseWebgraph(boolean check) {
public void setUseWebgraph(final boolean check) {
this.writeWebgraph = check;
}
@ -142,8 +144,8 @@ public final class Fulltext {
final File solrLocation = new File(this.segmentPath, SOLR_PATH);
// migrate old solr to new
for (String oldVersion: SOLR_OLD_PATH) {
File oldLocation = new File(this.segmentPath, oldVersion);
for (final String oldVersion: SOLR_OLD_PATH) {
final File oldLocation = new File(this.segmentPath, oldVersion);
if (oldLocation.exists()) {
if (!oldLocation.renameTo(solrLocation)) {
ConcurrentLog.severe("Fulltext", "Failed renaming old Solr location ("
@ -183,11 +185,11 @@ public final class Fulltext {
return this.solrInstances.getDefaultEmbeddedConnector();
}
public EmbeddedSolrConnector getEmbeddedConnector(String corename) {
public EmbeddedSolrConnector getEmbeddedConnector(final String corename) {
return this.solrInstances.getEmbeddedConnector(corename);
}
public SolrConnector getConnectorForRead(String corename) {
public SolrConnector getConnectorForRead(final String corename) {
if (this.solrInstances.isConnectedRemote()) return this.solrInstances.getRemoteConnector(corename);
if (this.solrInstances.isConnectedEmbedded()) return this.solrInstances.getEmbeddedConnector(corename);
return null;
@ -315,7 +317,7 @@ public final class Fulltext {
}
private long lastCommit = 0;
public void commit(boolean softCommit) {
public void commit(final boolean softCommit) {
final long t = System.currentTimeMillis();
if (this.lastCommit + 10000 > t) return;
this.lastCommit = t;
@ -423,7 +425,7 @@ public final class Fulltext {
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @throws IOException
*/
public void deleteStaleDomainHashes(final Set<String> hosthashes, Date freshdate) {
public void deleteStaleDomainHashes(final Set<String> hosthashes, final Date freshdate) {
// delete in solr
final Date now = new Date();
deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes,
@ -434,7 +436,7 @@ public final class Fulltext {
(WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
}
public void deleteStaleDomainNames(final Set<String> hostnames, Date freshdate) {
public void deleteStaleDomainNames(final Set<String> hostnames, final Date freshdate) {
final Date now = new Date();
deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames,
@ -453,7 +455,7 @@ public final class Fulltext {
deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
}
private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set<String> hosthashes, String constraintQuery) {
private static void deleteDomainWithConstraint(final SolrConnector connector, final String fieldname, final Set<String> hosthashes, final String constraintQuery) {
if (hosthashes == null || hosthashes.size() == 0) return;
final int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception
int c = 0;
@ -492,7 +494,7 @@ public final class Fulltext {
* @param basepath the left path of the url; at least until the end of the host
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
*/
public int remove(final String basepath, Date freshdate) {
public int remove(final String basepath, final Date freshdate) {
DigestURL uri;
try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;}
final String host = uri.getHost();
@ -690,12 +692,15 @@ public final class Fulltext {
public static enum ExportFormat {
text("txt"), html("html"), rss("rss"), solr("xml"), elasticsearch("flatjson");
private final String ext;
private ExportFormat(String ext) {this.ext = ext;}
private ExportFormat(final String ext) {this.ext = ext;}
public String getExt() {return this.ext;}
}
public final static String yacy_dump_prefix = "yacy_dump_";
public Export export(Fulltext.ExportFormat format, String filter, String query, final int maxseconds, File path, boolean dom, boolean text) throws IOException {
public Export export(
final Fulltext.ExportFormat format, final String filter, String query,
final int maxseconds, final File path, final boolean dom, final boolean text,
final long maxChunkSize, final boolean minified) throws IOException {
// modify query according to maxseconds
final long now = System.currentTimeMillis();
@ -760,32 +765,31 @@ public final class Fulltext {
}
}
String s = new File(path, yacy_dump_prefix +
final String filename = yacy_dump_prefix +
"f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" +
"l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" +
"n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" +
"c" + String.format("%1$012d", doccount)).getAbsolutePath() + "_tc"; // the name ends with the transaction token ('c' = 'created')
"c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created')
// create export file name
if (s.indexOf('.',0) < 0) s += "." + format.getExt();
final File f = new File(s);
f.getParentFile().mkdirs();
return export(f, filter, query, format, dom, text);
return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize, minified);
}
// export methods
public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) {
public Export export(
final File path, final String filename,
final String fileext, final String filter, final String query,
final ExportFormat format, final boolean dom, final boolean text,
final long maxChunkSize, final boolean minified) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) {
ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
return this.exportthread;
}
this.exportthread = new Export(f, filter, query, format, dom, text);
this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize, minified);
this.exportthread.start();
return this.exportthread;
}
public static void main(String args[]) {
public static void main(final String args[]) {
final Date firstdate = null;
System.out.println(GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate));
}
@ -794,70 +798,110 @@ public final class Fulltext {
return this.exportthread;
}
private final static Set<String> minified_keys = new HashSet<>();
static {
//minified_keys.add(CollectionSchema.id.getSolrFieldName());
minified_keys.add(CollectionSchema.sku.getSolrFieldName());
minified_keys.add(CollectionSchema.title.getSolrFieldName());
//minified_keys.add(CollectionSchema.author.getSolrFieldName());
minified_keys.add(CollectionSchema.description_txt.getSolrFieldName());
//minified_keys.add(CollectionSchema.size_i.getSolrFieldName());
minified_keys.add(CollectionSchema.last_modified.getSolrFieldName());
minified_keys.add(CollectionSchema.text_t.getSolrFieldName());
}
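For orientation, a hedged sketch of one record in a minified elasticsearch/flatjson export (field values and their exact JSON serialization are illustrative; only the whitelisted fields above survive):

    // action line written before every document in the flatjson format:
    //   {"index":{}}
    // minified document line:
    //   {"sku":"https://example.org/page.html","title":["Example page"],"description_txt":["Example description"],
    //    "last_modified":"2023-10-01T00:00:00Z","text_t":"Example body text"}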
public class Export extends Thread {
private final File f;
private final File path;
private final String filename, fileext;
private final Pattern pattern;
private int count;
private String failure;
private final String query;
private final ExportFormat format;
private final boolean dom, text;
private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) {
private int docCount, chunkSize, chunkCount;
private final long maxChunkSize;
private final boolean minified;
private Export(
final File path, final String filename,
final String fileext, final String filter, final String query,
final ExportFormat format, final boolean dom, final boolean text,
final long maxChunkSize, final boolean minified) {
super("Fulltext.Export");
// format: 0=text, 1=html, 2=rss/xml
this.f = f;
this.path = path;
this.filename = filename;
this.fileext = fileext;
this.pattern = filter == null ? null : Pattern.compile(filter);
this.query = query == null? AbstractSolrConnector.CATCHALL_QUERY : query;
this.count = 0;
this.failure = null;
this.format = format;
this.dom = dom;
this.text = text;
this.docCount = 0; // number of all documents exported so far
this.chunkSize = 0; // number of documents in the current chunk
this.chunkCount = 0; // number of chunks opened so far
this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk
this.minified = minified;
//if ((dom) && (format == 2)) dom = false;
}
private void printHead(final PrintWriter pw) {
if (this.format == ExportFormat.html) {
pw.println("<html><head></head><body>");
}
if (this.format == ExportFormat.rss) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
pw.println("<channel>");
pw.println("<title>YaCy Peer-to-Peer - Web-Search URL Export</title>");
pw.println("<description></description>");
pw.println("<link>http://yacy.net</link>");
}
if (this.format == ExportFormat.solr) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<response>");
pw.println("<lst name=\"responseHeader\">");
pw.println(" <str format=\"yacy.index.export.solr.xml\"/>");
pw.println(" <lst name=\"params\">");
pw.println(" <str name=\"q\">" + this.query + "</str>");
pw.println(" </lst>");
pw.println("</lst>");
pw.println("<result>");
}
}
private void printTail(final PrintWriter pw) {
if (this.format == ExportFormat.html) {
pw.println("</body></html>");
}
if (this.format == ExportFormat.rss) {
pw.println("</channel>");
pw.println("</rss>");
}
if (this.format == ExportFormat.solr) {
pw.println("</result>");
pw.println("</response>");
}
}
@Override
public void run() {
try {
final File parentf = this.f.getParentFile();
if (parentf != null) {
parentf.mkdirs();
}
if (this.path != null) this.path.mkdirs();
} catch(final Exception e) {
ConcurrentLog.logException(e);
this.failure = e.getMessage();
return;
}
try (/* Resources automatically closed by this try-with-resources statement */
final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f);
final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os;
final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream));
) {
if (this.format == ExportFormat.html) {
pw.println("<html><head></head><body>");
}
if (this.format == ExportFormat.rss) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
pw.println("<channel>");
pw.println("<title>YaCy Peer-to-Peer - Web-Search URL Export</title>");
pw.println("<description></description>");
pw.println("<link>http://yacy.net</link>");
}
if (this.format == ExportFormat.solr) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<response>");
pw.println("<lst name=\"responseHeader\">");
pw.println(" <str format=\"yacy.index.export.solr.xml\"/>");
pw.println(" <lst name=\"params\">");
pw.println(" <str name=\"q\">" + this.query + "</str>");
pw.println(" </lst>");
pw.println("</lst>");
pw.println("<result>");
}
try {
this.docCount = 0;
this.chunkSize = 0;
this.chunkCount = 0;
PrintWriter pw = getWriter();
printHead(pw);
if (this.dom) {
final Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
final ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
@ -865,7 +909,7 @@ public final class Fulltext {
if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
if (this.format == ExportFormat.text) pw.println(host);
if (this.format == ExportFormat.html) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
this.count++;
this.docCount++; this.chunkSize++;
}
} else {
if (this.format == ExportFormat.solr || this.format == ExportFormat.elasticsearch || (this.text && this.format == ExportFormat.text)) {
@ -874,6 +918,12 @@ public final class Fulltext {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
final String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.minified) {
final Iterator<Entry<String, Object>> i = doc.iterator();
while (i.hasNext()) {
if (!minified_keys.contains(i.next().getKey())) i.remove();
}
}
final CRIgnoreWriter sw = new CRIgnoreWriter();
if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()));
if (this.format == ExportFormat.solr) EnhancedXMLResponseWriter.writeDoc(sw, doc);
@ -882,7 +932,15 @@ public final class Fulltext {
if (this.format == ExportFormat.elasticsearch) pw.println("{\"index\":{}}");
final String d = sw.toString();
pw.println(d);
this.count++;
this.docCount++; this.chunkSize++;
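// chunk rollover: once maxChunkSize documents have been written, close this file and start the next chunk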
if (this.chunkSize >= this.maxChunkSize) {
printTail(pw);
pw.close();
this.chunkCount++;
pw = getWriter();
printHead(pw);
this.chunkSize = 0;
}
}
} else {
final BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true,
@ -918,21 +976,20 @@ public final class Fulltext {
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
pw.println("</item>");
}
this.count++;
this.docCount++; this.chunkSize++;
if (this.chunkSize >= this.maxChunkSize) {
printTail(pw);
pw.close();
this.chunkCount++;
pw = getWriter();
printHead(pw);
this.chunkSize = 0;
}
}
}
}
if (this.format == ExportFormat.html) {
pw.println("</body></html>");
}
if (this.format == ExportFormat.rss) {
pw.println("</channel>");
pw.println("</rss>");
}
if (this.format == ExportFormat.solr) {
pw.println("</result>");
pw.println("</response>");
}
printTail(pw);
pw.close();
} catch (final Exception e) {
/* Catch and log any IO exception that can occur during copy, automatic closing, or stream creation */
ConcurrentLog.logException(e);
@ -942,15 +999,46 @@ public final class Fulltext {
}
public File file() {
return this.f;
final File f = new File(this.path, this.filename + "_" + chunkcount(this.chunkCount) + "." + this.fileext);
return f;
}
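// Opens a writer on the current chunk file; solr exports are additionally written
// gzip-compressed with best compression.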
private PrintWriter getWriter() throws IOException {
final File f = file();
final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f);
final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os));
return pw;
}
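// Zero-pads the chunk number to four digits, e.g. 3 -> "0003", so that chunk files sort in order.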
private String chunkcount(final int count) {
if (count < 10) return "000" + count;
if (count < 100) return "00" + count;
if (count < 1000) return "0" + count;
return "" + count;
}
public File path() {
return this.path;
}
public String filename() {
return this.filename;
}
public String fileext() {
return this.fileext;
}
public String failed() {
return this.failure;
}
public int count() {
return this.count;
public int docCount() {
return this.docCount;
}
public int chunkCount() {
return this.chunkCount;
}
@SuppressWarnings("unchecked")

@ -113,23 +113,28 @@ public class ZIMFile extends File {
}
this.mimeTypeList = mList.toArray(new String[mList.size()]);
// Initialize the Url Pointer List
this.urlPtrListBlob = new byte[this.header_entryCount * 8];
mReader.seek(this.header_urlPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
// Initialize the Title Pointer List
this.titlePtrListBlob = new byte[this.header_entryCount * 4];
mReader.seek(this.header_titlePtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
// Initialize the Cluster Pointer List
this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
mReader.seek(this.header_clusterPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
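// The pointer lists are read inside a try block so that corrupt header values surface
// as an IOException instead of an unchecked IndexOutOfBoundsException.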
try {
// Initialize the Url Pointer List
this.urlPtrListBlob = new byte[this.header_entryCount * 8];
mReader.seek(this.header_urlPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.urlPtrListBlob);
// Initialize the Title Pointer List
this.titlePtrListBlob = new byte[this.header_entryCount * 4];
mReader.seek(this.header_titlePtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.titlePtrListBlob);
// Initialize the Cluster Pointer List
this.clusterPtrListBlob = new byte[this.header_clusterCount * 8];
mReader.seek(this.header_clusterPtrPos);
RandomAccessFileZIMInputStream.readFully(mReader, this.clusterPtrListBlob);
} catch (IndexOutOfBoundsException e) {
throw new IOException(e.getMessage());
}
}
public final String getMimeType(int idx) {
if (idx >= this.mimeTypeList.length) return "";
return this.mimeTypeList[idx];
}

@ -237,11 +237,25 @@ public class ZIMReader {
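// Resolves the main page entry; redirects are followed, and if the declared main page is not
// a text/html article, the directory entries are scanned for the first suitable one.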
public DirectoryEntry getMainDirectoryEntry() throws IOException {
DirectoryEntry de = getDirectoryInfo(this.mFile.header_mainPage);
if (de.namespace == 'W' && de.url.equals("mainPage") && de instanceof RedirectEntry) {
if (de instanceof RedirectEntry) {
// resolve redirect to get the actual main page
int redirect = ((RedirectEntry) de).redirect_index;
de = getDirectoryInfo(redirect);
}
// For the main entry we require a "text/html" mime type.
// Many zim files do not provide this as the main file, which is strange (maybe lazy/irresponsible).
// Because the main entry is important for validation, we search for another entry that is
// suitable for indexing.
int entryNumber = 0;
while (!de.getMimeType().equals("text/html") && entryNumber < this.mFile.header_entryCount) {
de = getDirectoryInfo(entryNumber);
entryNumber++;
if (de.namespace != 'C' && de.namespace != 'A') continue;
if (!(de instanceof ArticleEntry)) continue;
if (!de.getMimeType().equals("text/html")) continue;
if (de.url.contains("404") || de.title.contains("404") || de.title.contains("301")) continue; // skip entries that look like error pages
return de;
}
return de;
}
@ -337,10 +351,7 @@ public class ZIMReader {
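// Returns a cached cluster when available; on a cache miss the cluster is loaded and stored,
// evicting the oldest cache entry when the cache is full.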
public Cluster getCluster(int clusterNumber) throws IOException {
for (int i = 0; i < this.clusterCache.size(); i++) {
Cluster c = clusterCache.get(i);
if (c.cluster_number == clusterNumber) {
c.incUsage(); // cache hit
return c;
}
if (c.cluster_number == clusterNumber) return c;
}
// cache miss
@ -348,17 +359,10 @@ public class ZIMReader {
// check cache size
if (clusterCache.size() >= MAX_CLUSTER_CACHE_SIZE) {
// remove one entry
double maxEntry = Double.MIN_VALUE;
int pos = -1;
for (int i = 0; i < clusterCache.size(); i++) {
double r = this.clusterCache.get(i).getUsageRatio();
if (r > maxEntry) {maxEntry = r; pos = i;}
}
if (pos >= 0) this.clusterCache.remove(pos);
// remove one entry: the first entry is the oldest entry
this.clusterCache.remove(0);
}
c.incUsage();
this.clusterCache.add(c);
return c;
}
@ -378,12 +382,10 @@ public class ZIMReader {
private int cluster_number; // used to identify the correct cache entry
private List<byte[]> blobs;
private int usageCounter; // used for efficient caching and cache stale detection
private boolean extended;
public Cluster(int cluster_number) throws IOException {
this.cluster_number = cluster_number;
this.usageCounter = 0;
// open the cluster and create an InputStream with the proper decompression type
final long clusterPos = mFile.geClusterPtr(cluster_number);
@ -444,21 +446,9 @@ public class ZIMReader {
return this.blobs.get(i);
}
public void incUsage() {
this.usageCounter++;
}
public int getUsage() {
return this.usageCounter;
}
public int getSize() {
return this.blobs.size();
}
public double getUsageRatio() {
return ((double) this.usageCounter) / ((double) this.blobs.size());
}
}
public byte[] getArticleData(final DirectoryEntry directoryInfo) throws IOException {
