added a 'minified' option to YaCy dumps

pull/612/head
Michael Peter Christen 1 year ago
parent c20c4b8a21
commit 3268a93019

@ -32,6 +32,11 @@
<dt class="TableCellDark">maximum number of records per chunk</dt> <dt class="TableCellDark">maximum number of records per chunk</dt>
<dd><input type="text" name="maxchunksize" value="-1" size="20" maxlength="250" />&nbsp;if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk) <dd><input type="text" name="maxchunksize" value="-1" size="20" maxlength="250" />&nbsp;if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)
</dd> </dd>
<dt class="TableCellDark">Export Size</dt>
<dd>
full size, all fields:<input type="radio" name="minified" value="no" checked="checked">&nbsp;
minified; only fields sku, date, title, description, text_t<input type="radio" name="minified" value="yes" >
</dd>
<dt class="TableCellDark">Export Format</dt> <dt class="TableCellDark">Export Format</dt>
<dd> <dd>
<dl> <dl>

@ -126,13 +126,14 @@ public class IndexExport_p {
long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE); long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE);
if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE; if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE;
final String path = post.get("exportfilepath", ""); final String path = post.get("exportfilepath", "");
final boolean minified = post.get("minified", "no").equals("yes");
// store this call as api call: we do this even if there is a chance that it fails because recurring calls may not fail // store this call as api call: we do this even if there is a chance that it fails because recurring calls may not fail
if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds); if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds);
// start the export // start the export
try { try {
export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize); export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize, minified);
} catch (final IOException e) { } catch (final IOException e) {
prop.put("lurlexporterror", 1); prop.put("lurlexporterror", 1);
prop.put("lurlexporterror_exportfile", "-no export-"); prop.put("lurlexporterror_exportfile", "-no export-");

@ -34,8 +34,10 @@ import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
@ -118,7 +120,7 @@ public final class Fulltext {
this.writeWebgraph = false; this.writeWebgraph = false;
} }
public void setUseWebgraph(boolean check) { public void setUseWebgraph(final boolean check) {
this.writeWebgraph = check; this.writeWebgraph = check;
} }
@ -142,8 +144,8 @@ public final class Fulltext {
final File solrLocation = new File(this.segmentPath, SOLR_PATH); final File solrLocation = new File(this.segmentPath, SOLR_PATH);
// migrate old solr to new // migrate old solr to new
for (String oldVersion: SOLR_OLD_PATH) { for (final String oldVersion: SOLR_OLD_PATH) {
File oldLocation = new File(this.segmentPath, oldVersion); final File oldLocation = new File(this.segmentPath, oldVersion);
if (oldLocation.exists()) { if (oldLocation.exists()) {
if (!oldLocation.renameTo(solrLocation)) { if (!oldLocation.renameTo(solrLocation)) {
ConcurrentLog.severe("Fulltext", "Failed renaming old Solr location (" ConcurrentLog.severe("Fulltext", "Failed renaming old Solr location ("
@ -183,11 +185,11 @@ public final class Fulltext {
return this.solrInstances.getDefaultEmbeddedConnector(); return this.solrInstances.getDefaultEmbeddedConnector();
} }
public EmbeddedSolrConnector getEmbeddedConnector(String corename) { public EmbeddedSolrConnector getEmbeddedConnector(final String corename) {
return this.solrInstances.getEmbeddedConnector(corename); return this.solrInstances.getEmbeddedConnector(corename);
} }
public SolrConnector getConnectorForRead(String corename) { public SolrConnector getConnectorForRead(final String corename) {
if (this.solrInstances.isConnectedRemote()) return this.solrInstances.getRemoteConnector(corename); if (this.solrInstances.isConnectedRemote()) return this.solrInstances.getRemoteConnector(corename);
if (this.solrInstances.isConnectedEmbedded()) return this.solrInstances.getEmbeddedConnector(corename); if (this.solrInstances.isConnectedEmbedded()) return this.solrInstances.getEmbeddedConnector(corename);
return null; return null;
@ -315,7 +317,7 @@ public final class Fulltext {
} }
private long lastCommit = 0; private long lastCommit = 0;
public void commit(boolean softCommit) { public void commit(final boolean softCommit) {
final long t = System.currentTimeMillis(); final long t = System.currentTimeMillis();
if (this.lastCommit + 10000 > t) return; if (this.lastCommit + 10000 > t) return;
this.lastCommit = t; this.lastCommit = t;
@ -423,7 +425,7 @@ public final class Fulltext {
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
* @throws IOException * @throws IOException
*/ */
public void deleteStaleDomainHashes(final Set<String> hosthashes, Date freshdate) { public void deleteStaleDomainHashes(final Set<String> hosthashes, final Date freshdate) {
// delete in solr // delete in solr
final Date now = new Date(); final Date now = new Date();
deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes,
@ -434,7 +436,7 @@ public final class Fulltext {
(WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
} }
public void deleteStaleDomainNames(final Set<String> hostnames, Date freshdate) { public void deleteStaleDomainNames(final Set<String> hostnames, final Date freshdate) {
final Date now = new Date(); final Date now = new Date();
deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames, deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames,
@ -453,7 +455,7 @@ public final class Fulltext {
deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
} }
private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set<String> hosthashes, String constraintQuery) { private static void deleteDomainWithConstraint(final SolrConnector connector, final String fieldname, final Set<String> hosthashes, final String constraintQuery) {
if (hosthashes == null || hosthashes.size() == 0) return; if (hosthashes == null || hosthashes.size() == 0) return;
final int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception final int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception
int c = 0; int c = 0;
@ -492,7 +494,7 @@ public final class Fulltext {
* @param basepath the left path of the url; at least until the end of the host * @param basepath the left path of the url; at least until the end of the host
* @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
*/ */
public int remove(final String basepath, Date freshdate) { public int remove(final String basepath, final Date freshdate) {
DigestURL uri; DigestURL uri;
try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;} try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;}
final String host = uri.getHost(); final String host = uri.getHost();
@ -690,15 +692,15 @@ public final class Fulltext {
public static enum ExportFormat { public static enum ExportFormat {
text("txt"), html("html"), rss("rss"), solr("xml"), elasticsearch("flatjson"); text("txt"), html("html"), rss("rss"), solr("xml"), elasticsearch("flatjson");
private final String ext; private final String ext;
private ExportFormat(String ext) {this.ext = ext;} private ExportFormat(final String ext) {this.ext = ext;}
public String getExt() {return this.ext;} public String getExt() {return this.ext;}
} }
public final static String yacy_dump_prefix = "yacy_dump_"; public final static String yacy_dump_prefix = "yacy_dump_";
public Export export( public Export export(
Fulltext.ExportFormat format, String filter, String query, final Fulltext.ExportFormat format, final String filter, String query,
final int maxseconds, File path, boolean dom, boolean text, final int maxseconds, final File path, final boolean dom, final boolean text,
long maxChunkSize) throws IOException { final long maxChunkSize, final boolean minified) throws IOException {
// modify query according to maxseconds // modify query according to maxseconds
final long now = System.currentTimeMillis(); final long now = System.currentTimeMillis();
@ -763,13 +765,13 @@ public final class Fulltext {
} }
} }
String filename = yacy_dump_prefix + final String filename = yacy_dump_prefix +
"f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" + "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" +
"l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" + "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" +
"n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" + "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" +
"c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created') "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created')
return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize); return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize, minified);
} }
// export methods // export methods
@ -777,17 +779,17 @@ public final class Fulltext {
final File path, final String filename, final File path, final String filename,
final String fileext, final String filter, final String query, final String fileext, final String filter, final String query,
final ExportFormat format, final boolean dom, final boolean text, final ExportFormat format, final boolean dom, final boolean text,
long maxChunkSize) { final long maxChunkSize, final boolean minified) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) { if ((this.exportthread != null) && (this.exportthread.isAlive())) {
ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
return this.exportthread; return this.exportthread;
} }
this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize); this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize, minified);
this.exportthread.start(); this.exportthread.start();
return this.exportthread; return this.exportthread;
} }
public static void main(String args[]) { public static void main(final String args[]) {
final Date firstdate = null; final Date firstdate = null;
System.out.println(GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate)); System.out.println(GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate));
} }
@ -796,6 +798,18 @@ public final class Fulltext {
return this.exportthread; return this.exportthread;
} }
// Solr field names that are kept when an export runs with the 'minified' option.
// The export loop removes every document field whose name is not in this set
// before the document is written, so a minified dump only carries these fields.
private final static Set<String> minified_keys = new HashSet<>();
static {
// Entries that are commented out below are currently NOT part of the minified set.
//minified_keys.add(CollectionSchema.id.getSolrFieldName());
minified_keys.add(CollectionSchema.sku.getSolrFieldName());
minified_keys.add(CollectionSchema.title.getSolrFieldName());
//minified_keys.add(CollectionSchema.author.getSolrFieldName());
minified_keys.add(CollectionSchema.description_txt.getSolrFieldName());
//minified_keys.add(CollectionSchema.size_i.getSolrFieldName());
// NOTE(review): presumably this is the "date" field named in the export form's minified option - confirm
minified_keys.add(CollectionSchema.last_modified.getSolrFieldName());
minified_keys.add(CollectionSchema.text_t.getSolrFieldName());
}
public class Export extends Thread { public class Export extends Thread {
private final File path; private final File path;
private final String filename, fileext; private final String filename, fileext;
@ -806,12 +820,13 @@ public final class Fulltext {
private final boolean dom, text; private final boolean dom, text;
private int docCount, chunkSize, chunkCount; private int docCount, chunkSize, chunkCount;
private final long maxChunkSize; private final long maxChunkSize;
private final boolean minified;
private Export( private Export(
final File path, final String filename, final File path, final String filename,
final String fileext, final String filter, final String query, final String fileext, final String filter, final String query,
final ExportFormat format, final boolean dom, final boolean text, final ExportFormat format, final boolean dom, final boolean text,
long maxChunkSize) { final long maxChunkSize, final boolean minified) {
super("Fulltext.Export"); super("Fulltext.Export");
// format: 0=text, 1=html, 2=rss/xml // format: 0=text, 1=html, 2=rss/xml
this.path = path; this.path = path;
@ -827,10 +842,11 @@ public final class Fulltext {
this.chunkSize = 0; // number of documents in the current chunk this.chunkSize = 0; // number of documents in the current chunk
this.chunkCount = 0; // number of chunks opened so far this.chunkCount = 0; // number of chunks opened so far
this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk this.maxChunkSize = maxChunkSize; // number of maximum document count per chunk
this.minified = minified;
//if ((dom) && (format == 2)) dom = false; //if ((dom) && (format == 2)) dom = false;
} }
private void printHead(PrintWriter pw) { private void printHead(final PrintWriter pw) {
if (this.format == ExportFormat.html) { if (this.format == ExportFormat.html) {
pw.println("<html><head></head><body>"); pw.println("<html><head></head><body>");
} }
@ -856,7 +872,7 @@ public final class Fulltext {
} }
} }
private void printTail(PrintWriter pw) { private void printTail(final PrintWriter pw) {
if (this.format == ExportFormat.html) { if (this.format == ExportFormat.html) {
pw.println("</body></html>"); pw.println("</body></html>");
} }
@ -881,9 +897,9 @@ public final class Fulltext {
} }
try { try {
docCount = 0; this.docCount = 0;
chunkSize = 0; this.chunkSize = 0;
chunkCount = 0; this.chunkCount = 0;
PrintWriter pw = getWriter(); PrintWriter pw = getWriter();
printHead(pw); printHead(pw);
if (this.dom) { if (this.dom) {
@ -902,6 +918,12 @@ public final class Fulltext {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
final String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); final String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.minified) {
final Iterator<Entry<String, Object>> i = doc.iterator();
while (i.hasNext()) {
if (!minified_keys.contains(i.next().getKey())) i.remove();
}
}
final CRIgnoreWriter sw = new CRIgnoreWriter(); final CRIgnoreWriter sw = new CRIgnoreWriter();
if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName())); if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()));
if (this.format == ExportFormat.solr) EnhancedXMLResponseWriter.writeDoc(sw, doc); if (this.format == ExportFormat.solr) EnhancedXMLResponseWriter.writeDoc(sw, doc);
@ -914,7 +936,8 @@ public final class Fulltext {
if (this.chunkSize >= this.maxChunkSize) { if (this.chunkSize >= this.maxChunkSize) {
printTail(pw); printTail(pw);
pw.close(); pw.close();
pw = getWriter(); // increases chunkCount as side-effect this.chunkCount++;
pw = getWriter();
printHead(pw); printHead(pw);
this.chunkSize = 0; this.chunkSize = 0;
} }
@ -957,7 +980,8 @@ public final class Fulltext {
if (this.chunkSize >= this.maxChunkSize) { if (this.chunkSize >= this.maxChunkSize) {
printTail(pw); printTail(pw);
pw.close(); pw.close();
pw = getWriter(); // increases chunkCount as side-effect this.chunkCount++;
pw = getWriter();
printHead(pw); printHead(pw);
this.chunkSize = 0; this.chunkSize = 0;
} }
@ -980,14 +1004,13 @@ public final class Fulltext {
} }
private PrintWriter getWriter() throws IOException { private PrintWriter getWriter() throws IOException {
File f = file(); final File f = file();
final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f); final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f);
final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os)); final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os));
this.chunkCount++;
return pw; return pw;
} }
private String chunkcount(int count) { private String chunkcount(final int count) {
if (count < 10) return "000" + count; if (count < 10) return "000" + count;
if (count < 100) return "00" + count; if (count < 100) return "00" + count;
if (count < 1000) return "0" + count; if (count < 1000) return "0" + count;

Loading…
Cancel
Save