diff --git a/htroot/IndexExport_p.html b/htroot/IndexExport_p.html index eb3e1f188..87ee4b62d 100644 --- a/htroot/IndexExport_p.html +++ b/htroot/IndexExport_p.html @@ -21,13 +21,16 @@
URL Filter
-
+
 .*.* (default) is a catch-all; format: java regex
query
-
+
 *:* (default) is a catch-all; format: :
-
maximum age (seconds, -1 = unlimited)
-
+
maximum age (seconds)
+
 -1 = unlimited -> no document is too old +
+
maximum number of records per chunk
+
 if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)
Export Format
diff --git a/source/net/yacy/htroot/IndexExport_p.java b/source/net/yacy/htroot/IndexExport_p.java index 78cc94132..aa5fc6f09 100644 --- a/source/net/yacy/htroot/IndexExport_p.java +++ b/source/net/yacy/htroot/IndexExport_p.java @@ -64,8 +64,8 @@ public class IndexExport_p { prop.put("lurlexport", 0); prop.put("reload", 0); prop.put("dumprestore", 1); - prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, - SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); + prop.put("dumprestore_dumpRestoreEnabled", sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, + SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)); List dumpFiles = segment.fulltext().dumpFiles(); prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); prop.put("dumprestore_optimizemax", 10); @@ -80,7 +80,7 @@ public class IndexExport_p { prop.put("lurlexportfinished", 0); prop.put("lurlexporterror", 0); prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + prop.put("lurlexport_urlcount", export.docCount()); prop.put("reload", 1); } else { prop.put("lurlexport", 1); @@ -93,7 +93,7 @@ public class IndexExport_p { // an export was running but has finished prop.put("lurlexportfinished", 1); prop.put("lurlexportfinished_exportfile", export.file().toString()); - prop.put("lurlexportfinished_urlcount", export.count()); + prop.put("lurlexportfinished_urlcount", export.docCount()); if (export.failed() == null) { prop.put("lurlexporterror", 0); } else { @@ -123,6 +123,8 @@ public class IndexExport_p { final String filter = post.get("exportfilter", ".*"); final String query = post.get("exportquery", "*:*"); final int maxseconds = post.getInt("exportmaxseconds", -1); + long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE); + if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE; final String path = post.get("exportfilepath", ""); // store this call as api call: we do this even if there is a chance that it fails because recurring calls may do not fail @@ -130,7 +132,7 @@ public class IndexExport_p { // start the export try { - export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text); + export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize); } catch (final IOException e) { prop.put("lurlexporterror", 1); prop.put("lurlexporterror_exportfile", "-no export-"); @@ -140,7 +142,7 @@ public class IndexExport_p { // show result prop.put("lurlexport_exportfile", export.file().toString()); - prop.put("lurlexport_urlcount", export.count()); + prop.put("lurlexport_urlcount", export.docCount()); if ((export != null) && (export.failed() == null)) { prop.put("lurlexport", 2); } @@ -148,34 +150,34 @@ public class IndexExport_p { } if (post.containsKey("indexdump")) { - try { - final File dump = segment.fulltext().dumpEmbeddedSolr(); - prop.put("indexdump", 1); - prop.put("indexdump_dumpfile", dump.getAbsolutePath()); - dumpFiles = segment.fulltext().dumpFiles(); - prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); - // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexdump", 2); - } else { - prop.put("indexdump", 3); - } - } + try { + final File dump = segment.fulltext().dumpEmbeddedSolr(); + prop.put("indexdump", 1); + prop.put("indexdump_dumpfile", dump.getAbsolutePath()); + dumpFiles = segment.fulltext().dumpFiles(); + prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); + // sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr dump generation"); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexdump", 2); + } else { + prop.put("indexdump", 3); + } + } } if (post.containsKey("indexrestore")) { - try { - final File dump = new File(post.get("dumpfile", "")); - segment.fulltext().restoreEmbeddedSolr(dump); - prop.put("indexRestore", 1); - } catch(final SolrException e) { - if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { - prop.put("indexRestore", 2); - } else { - prop.put("indexRestore", 3); - } - } + try { + final File dump = new File(post.get("dumpfile", "")); + segment.fulltext().restoreEmbeddedSolr(dump); + prop.put("indexRestore", 1); + } catch(final SolrException e) { + if(ErrorCode.SERVICE_UNAVAILABLE.code == e.code()) { + prop.put("indexRestore", 2); + } else { + prop.put("indexRestore", 3); + } + } } // insert constants diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 718be0099..d8a1754a7 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -695,7 +695,10 @@ public final class Fulltext { } public final static String yacy_dump_prefix = "yacy_dump_"; - public Export export(Fulltext.ExportFormat format, String filter, String query, final int maxseconds, File path, boolean dom, boolean text) throws IOException { + public Export export( + Fulltext.ExportFormat format, String filter, String query, + final int maxseconds, File path, boolean dom, boolean text, + long maxChunkSize) throws IOException { // modify query according to maxseconds final long now = System.currentTimeMillis(); @@ -760,27 +763,26 @@ public final class Fulltext { } } - String s = new File(path, yacy_dump_prefix + + String filename = yacy_dump_prefix + "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" + "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" + "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" + - "c" + String.format("%1$012d", doccount)).getAbsolutePath() + "_tc"; // the name ends with the transaction token ('c' = 'created') + "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created') - // create export file name - if (s.indexOf('.',0) < 0) s += "." + format.getExt(); - final File f = new File(s); - f.getParentFile().mkdirs(); - - return export(f, filter, query, format, dom, text); + return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize); } // export methods - public Export export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + public Export export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + long maxChunkSize) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(f, filter, query, format, dom, text); + this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize); this.exportthread.start(); return this.exportthread; } @@ -795,69 +797,95 @@ public final class Fulltext { } public class Export extends Thread { - private final File f; + private final File path; + private final String filename, fileext; private final Pattern pattern; - private int count; private String failure; private final String query; private final ExportFormat format; private final boolean dom, text; - - private Export(final File f, final String filter, final String query, final ExportFormat format, final boolean dom, final boolean text) { + private int docCount, chunkSize, chunkCount; + private final long maxChunkSize; + + private Export( + final File path, final String filename, + final String fileext, final String filter, final String query, + final ExportFormat format, final boolean dom, final boolean text, + long maxChunkSize) { super("Fulltext.Export"); // format: 0=text, 1=html, 2=rss/xml - this.f = f; + this.path = path; + this.filename = filename; + this.fileext = fileext; this.pattern = filter == null ? null : Pattern.compile(filter); this.query = query == null? AbstractSolrConnector.CATCHALL_QUERY : query; - this.count = 0; this.failure = null; this.format = format; this.dom = dom; this.text = text; + this.docCount = 0; // number of all documents exported so far + this.chunkSize = 0; // number of documents in the current chunk + this.chunkCount = 0; // number of chunks opened so far + this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk //if ((dom) && (format == 2)) dom = false; } + private void printHead(PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(""); + pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); + pw.println(""); + pw.println("http://yacy.net"); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + pw.println(""); + pw.println(" "); + pw.println(" "); + pw.println(" " + this.query + ""); + pw.println(" "); + pw.println(""); + pw.println(""); + } + } + + private void printTail(PrintWriter pw) { + if (this.format == ExportFormat.html) { + pw.println(""); + } + if (this.format == ExportFormat.rss) { + pw.println(""); + pw.println(""); + } + if (this.format == ExportFormat.solr) { + pw.println(""); + pw.println(""); + } + } + @Override public void run() { try { - final File parentf = this.f.getParentFile(); - if (parentf != null) { - parentf.mkdirs(); - } + if (this.path != null) this.path.mkdirs(); } catch(final Exception e) { ConcurrentLog.logException(e); this.failure = e.getMessage(); return; } - try (/* Resources automatically closed by this try-with-resources statement */ - final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f); - final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os; - final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream)); - ) { - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(""); - pw.println("YaCy Peer-to-Peer - Web-Search URL Export"); - pw.println(""); - pw.println("http://yacy.net"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - pw.println(""); - pw.println(" "); - pw.println(" "); - pw.println(" " + this.query + ""); - pw.println(" "); - pw.println(""); - pw.println(""); - } + try { + docCount = 0; + chunkSize = 0; + chunkCount = 0; + PrintWriter pw = getWriter(); + printHead(pw); if (this.dom) { final Map> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); final ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); @@ -865,7 +893,7 @@ public final class Fulltext { if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; if (this.format == ExportFormat.text) pw.println(host); if (this.format == ExportFormat.html) pw.println("" + host + "
"); - this.count++; + this.docCount++; this.chunkSize++; } } else { if (this.format == ExportFormat.solr || this.format == ExportFormat.elasticsearch || (this.text && this.format == ExportFormat.text)) { @@ -882,7 +910,14 @@ public final class Fulltext { if (this.format == ExportFormat.elasticsearch) pw.println("{\"index\":{}}"); final String d = sw.toString(); pw.println(d); - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + pw = getWriter(); // increases chunkCount as side-effect + printHead(pw); + this.chunkSize = 0; + } } } else { final BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, @@ -918,21 +953,19 @@ public final class Fulltext { pw.println("" + hash + ""); pw.println(""); } - this.count++; + this.docCount++; this.chunkSize++; + if (this.chunkSize >= this.maxChunkSize) { + printTail(pw); + pw.close(); + pw = getWriter(); // increases chunkCount as side-effect + printHead(pw); + this.chunkSize = 0; + } } } } - if (this.format == ExportFormat.html) { - pw.println(""); - } - if (this.format == ExportFormat.rss) { - pw.println("
"); - pw.println("
"); - } - if (this.format == ExportFormat.solr) { - pw.println(""); - pw.println(""); - } + printTail(pw); + pw.close(); } catch (final Exception e) { /* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */ ConcurrentLog.logException(e); @@ -942,15 +975,47 @@ public final class Fulltext { } public File file() { - return this.f; + final File f = new File(this.path, this.filename + "_" + chunkcount(this.chunkCount) + "." + this.fileext); + return f; + } + + private PrintWriter getWriter() throws IOException { + File f = file(); + final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f); + final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os)); + this.chunkCount++; + return pw; + } + + private String chunkcount(int count) { + if (count < 10) return "000" + count; + if (count < 100) return "00" + count; + if (count < 1000) return "0" + count; + return "" + count; + } + + public File path() { + return this.path; + } + + public String filename() { + return this.filename; + } + + public String fileext() { + return this.fileext; } public String failed() { return this.failure; } - public int count() { - return this.count; + public int docCount() { + return this.docCount; + } + + public int chunkCount() { + return this.chunkCount; } @SuppressWarnings("unchecked")