calculating the correct size of an export.

This can be seen as a fix for
https://github.com/yacy/yacy_search_server/issues/343
However, the export itself was not flawed; it only gave the impression that
something was wrong. The export size must be smaller than the index
size because the index also contains error documents.
Now an information line is presented that shows, e.g.:
"The local index currently contains 181,319 documents, only 106,887
exportable with status code 200 - the remaining are error documents."
pull/436/head
Michael Peter Christen 3 years ago
parent 4cadd557dc
commit 1bab4ffe20

@ -12,7 +12,7 @@
<h2>Index Export</h2> <h2>Index Export</h2>
<p>The local index currently contains #[ucount]# documents.</p> <p>The local index currently contains #[ucount]# documents, only #[ucount200]# exportable with status code 200 - the remaining are error documents.</p>
#(lurlexport)#:: #(lurlexport)#::
<form action="IndexExport_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8"> <form action="IndexExport_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">

@ -34,6 +34,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Fulltext; import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment; import net.yacy.search.index.Segment;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch; import net.yacy.server.serverSwitch;
@ -46,8 +47,13 @@ public class IndexExport_p {
final serverObjects prop = new serverObjects(); final serverObjects prop = new serverObjects();
Segment segment = sb.index; Segment segment = sb.index;
// we have two document counts: the total number and the number exportable with status code 200
long ucount = segment.fulltext().collectionSize(); long ucount = segment.fulltext().collectionSize();
long ucount200 = ucount;
try {
ucount200 = segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200");
} catch (IOException e1) {}
// set default values // set default values
prop.put("otherHosts", ""); prop.put("otherHosts", "");
prop.put("reload", 0); prop.put("reload", 0);
@ -62,7 +68,8 @@ public class IndexExport_p {
prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath());
prop.put("dumprestore_optimizemax", 10); prop.put("dumprestore_optimizemax", 10);
prop.putNum("ucount", ucount); prop.putNum("ucount", ucount);
prop.putNum("ucount200", ucount200);
// show export messages // show export messages
Fulltext.Export export = segment.fulltext().export(); Fulltext.Export export = segment.fulltext().export();
if ((export != null) && (export.isAlive())) { if ((export != null) && (export.isAlive())) {
@ -104,7 +111,7 @@ public class IndexExport_p {
Fulltext.ExportFormat format = Fulltext.ExportFormat.text; Fulltext.ExportFormat format = Fulltext.ExportFormat.text;
final String fname = post.get("format", "url-text"); final String fname = post.get("format", "url-text");
final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain final boolean dom = fname.startsWith("dom"); // if dom== false complete urls are exported, otherwise only the domain
final boolean text = fname.startsWith("text"); final boolean text = fname.startsWith("text");
if (fname.endsWith("text")) format = Fulltext.ExportFormat.text; if (fname.endsWith("text")) format = Fulltext.ExportFormat.text;
if (fname.endsWith("html")) format = Fulltext.ExportFormat.html; if (fname.endsWith("html")) format = Fulltext.ExportFormat.html;
if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss; if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss;
@ -118,7 +125,7 @@ public class IndexExport_p {
// store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail // store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail
if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds); if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds);
// start the export // start the export
try { try {
export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text); export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text);
@ -128,7 +135,7 @@ public class IndexExport_p {
prop.put("lurlexporterror_exportfailmsg", e.getMessage()); prop.put("lurlexporterror_exportfailmsg", e.getMessage());
return prop; return prop;
} }
// show result // show result
prop.put("lurlexport_exportfile", export.file().toString()); prop.put("lurlexport_exportfile", export.file().toString());
prop.put("lurlexport_urlcount", export.count()); prop.put("lurlexport_urlcount", export.count());

Loading…
Cancel
Save