From a6bf0b1649f89de02fa9f2535891911ede38088e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 23 Feb 2016 18:56:20 +0100 Subject: [PATCH] 0N - added option to generate index export files for a specific number of seconds in the past and reverted latest change. The export file name will now contain four data elements: f - first (oldest) index entry write date, l - last (newest) index entry write date, n - now-date of index dump time, c - count of documents inside the dump. '0N' denotes a series of changes which will lead to the opportunity to exchange index data dumps in a way that is needed to integrate ZeroNet index data. This will be based on index dump sharing, which is the motivation for this commit. --- htroot/IndexExport_p.html | 7 ++- htroot/IndexExport_p.java | 42 ++++++++-------- source/net/yacy/search/index/Fulltext.java | 58 +++++++++++++++++++++- 3 files changed, 83 insertions(+), 24 deletions(-) diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html index 56bfb3457..b6801be57 100644 --- a/htroot/IndexExport_p.html +++ b/htroot/IndexExport_p.html @@ -18,8 +18,8 @@
Loaded URL Export
-
Export File
-
+
Export Path
+
URL Filter
@@ -27,6 +27,9 @@
query
+
maximum age (seconds, -1 = unlimited)
+
+
Export Format
diff --git a/htroot/IndexExport_p.java b/htroot/IndexExport_p.java index a465c034c..12f437c26 100644 --- a/htroot/IndexExport_p.java +++ b/htroot/IndexExport_p.java @@ -22,9 +22,9 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.io.File; +import java.io.IOException; import java.util.List; -import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; import net.yacy.search.index.Fulltext; @@ -53,9 +53,10 @@ public class IndexExport_p { List dumpFiles = segment.fulltext().dumpFiles(); prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); prop.put("dumprestore_optimizemax", 10); + prop.putNum("ucount", ucount); // show export messages - final Fulltext.Export export = segment.fulltext().export(); + Fulltext.Export export = segment.fulltext().export(); if ((export != null) && (export.isAlive())) { // there is currently a running export prop.put("lurlexport", 2); @@ -66,7 +67,7 @@ public class IndexExport_p { prop.put("reload", 1); } else { prop.put("lurlexport", 1); - prop.put("lurlexport_exportfile", sb.getDataPath() + "/DATA/EXPORT/yacy_export_" + sb.peers.myID() + "_" + GenericFormatter.SHORT_SECOND_FORMATTER.format()); + prop.put("lurlexport_exportfilepath", sb.getDataPath() + "/DATA/EXPORT/"); if (export == null) { // there has never been an export prop.put("lurlexportfinished", 0); @@ -87,7 +88,6 @@ public class IndexExport_p { } if (post == null || env == null) { - prop.putNum("ucount", ucount); return prop; // nothing to do } @@ -102,23 +102,25 @@ public class IndexExport_p { if (fname.endsWith("rss")) format = Fulltext.ExportFormat.rss; if (fname.endsWith("solr")) format = Fulltext.ExportFormat.solr; - // extend export file name - String s = post.get("exportfile", ""); - if (s.indexOf('.',0) < 0) { - if (format == Fulltext.ExportFormat.text) s = s + ".txt"; - if (format == 
Fulltext.ExportFormat.html) s = s + ".html"; - if (format == Fulltext.ExportFormat.rss ) s = s + "_rss.xml"; - if (format == Fulltext.ExportFormat.solr) s = s + "_full.xml"; - } - final File f = new File(s); - f.getParentFile().mkdirs(); final String filter = post.get("exportfilter", ".*"); final String query = post.get("exportquery", "*:*"); - final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom, text); - - prop.put("lurlexport_exportfile", s); - prop.put("lurlexport_urlcount", running.count()); - if ((running != null) && (running.failed() == null)) { + final int maxseconds = post.getInt("exportmaxseconds", -1); + final String path = post.get("exportfilepath", ""); + + // start the export + try { + export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text); + } catch (IOException e) { + prop.put("lurlexporterror", 1); + prop.put("lurlexporterror_exportfile", "-no export-"); + prop.put("lurlexporterror_exportfailmsg", e.getMessage()); + return prop; + } + + // show result + prop.put("lurlexport_exportfile", export.file().toString()); + prop.put("lurlexport_urlcount", export.count()); + if ((export != null) && (export.failed() == null)) { prop.put("lurlexport", 2); } prop.put("reload", 1); @@ -144,4 +146,4 @@ public class IndexExport_p { return prop; } -} +} \ No newline at end of file diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index d6d6b411b..a8fbed860 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -77,9 +77,11 @@ import net.yacy.search.schema.WebgraphConfiguration; import net.yacy.search.schema.WebgraphSchema; import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.core.SolrInfoMBean; +import 
org.apache.solr.util.DateFormatUtil; import org.apache.lucene.util.Version; public final class Fulltext { @@ -617,9 +619,61 @@ public final class Fulltext { } } } - + public static enum ExportFormat { - text, html, rss, solr; + text("txt"), html("html"), rss("rss"), solr("xml"); + private final String ext; + private ExportFormat(String ext) {this.ext = ext;} + public String getExt() {return this.ext;} + } + + public Export export(Fulltext.ExportFormat format, String filter, String query, final int maxseconds, File path, boolean dom, boolean text) throws IOException { + + // modify query according to maxseconds + long now = System.currentTimeMillis(); + if (maxseconds > 0) { + long from = now - maxseconds * 1000L; + String nowstr = DateFormatUtil.formatExternal(new Date(now)); + String fromstr = DateFormatUtil.formatExternal(new Date(from)); + String dateq = CollectionSchema.load_date_dt.getSolrFieldName() + ":[" + fromstr + " TO " + nowstr + "]"; + query = query == null || AbstractSolrConnector.CATCHALL_QUERY.equals(query) ? dateq : query + " AND " + dateq; + } else { + query = query == null? 
AbstractSolrConnector.CATCHALL_QUERY : query; + } + + // check the oldest and latest entry in the index for this query + SolrDocumentList firstdoclist, lastdoclist; + firstdoclist = this.getDefaultConnector().getDocumentListByQuery( + query, CollectionSchema.load_date_dt.getSolrFieldName() + " asc", 0, 1,CollectionSchema.load_date_dt.getSolrFieldName()); + lastdoclist = this.getDefaultConnector().getDocumentListByQuery( + query, CollectionSchema.load_date_dt.getSolrFieldName() + " desc", 0, 1,CollectionSchema.load_date_dt.getSolrFieldName()); + + if (firstdoclist.size() == 0 || lastdoclist.size() == 0) { + assert firstdoclist.size() == 0 && lastdoclist.size() == 0; + throw new IOException("number of exported documents == 0"); + } + assert firstdoclist.size() == 1 && lastdoclist.size() == 1; + long doccount = firstdoclist.getNumFound(); + + // create the export name + SolrDocument firstdoc = firstdoclist.get(0); + SolrDocument lastdoc = lastdoclist.get(0); + Object firstdateobject = firstdoc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); + Object lastdateobject = lastdoc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName()); + Date firstdate = (Date) firstdateobject; + Date lastdate = (Date) lastdateobject; + String s = new File(path, "yacy_dump_" + + "f" + GenericFormatter.FORMAT_SHORT_MINUTE.format(firstdate) + "_" + + "l" + GenericFormatter.FORMAT_SHORT_MINUTE.format(lastdate) + "_" + + "n" + GenericFormatter.FORMAT_SHORT_MINUTE.format(new Date(now)) + "_" + + "c" + String.format("%1$012d", doccount)).getAbsolutePath(); + + // create export file name + if (s.indexOf('.',0) < 0) s += "." + format.getExt(); + final File f = new File(s); + f.getParentFile().mkdirs(); + + return export(f, filter, query, format, dom, text); } // export methods