added a 'minified' option to YaCy dumps

pull/612/head
Michael Peter Christen 1 year ago
parent c20c4b8a21
commit 3268a93019
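The commit adds an "Export Size" choice to the index export servlet (IndexExport_p): either a full dump with all Solr fields, or a minified dump restricted to a small field whitelist (sku, title, description_txt, last_modified, text_t), which keeps dump files considerably smaller. The flag travels from the HTML form through Fulltext.export() into the Export thread, which strips all non-whitelisted fields from each document before it is written. A usage sketch follows the servlet hunk below.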

@@ -32,6 +32,11 @@
     <dt class="TableCellDark">maximum number of records per chunk</dt>
     <dd><input type="text" name="maxchunksize" value="-1" size="20" maxlength="250" />&nbsp;if exceeded: several chunks are stored; -1 = unlimited (makes only one chunk)
     </dd>
+    <dt class="TableCellDark">Export Size</dt>
+    <dd>
+    full size, all fields:<input type="radio" name="minified" value="no" checked="checked">&nbsp;
+    minified; only fields sku, date, title, description, text_t<input type="radio" name="minified" value="yes" >
+    </dd>
     <dt class="TableCellDark">Export Format</dt>
     <dd>
     <dl>
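(The radio label uses shorthand: "date" and "description" correspond to the Solr fields last_modified and description_txt in the minified_keys whitelist defined in Fulltext.java below.)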

@@ -126,13 +126,14 @@ public class IndexExport_p {
         long maxChunkSize = post.getLong("maxchunksize", Long.MAX_VALUE);
         if (maxChunkSize <= 0) maxChunkSize = Long.MAX_VALUE;
         final String path = post.get("exportfilepath", "");
+        final boolean minified = post.get("minified", "no").equals("yes");

         // store this call as api call: we do this even if there is a chance that it fails, because recurring calls may not fail
         if (maxseconds != -1) sb.tables.recordAPICall(post, "IndexExport_p.html", WorkTables.TABLE_API_TYPE_DUMP, format + "-dump, q=" + query + ", maxseconds=" + maxseconds);

         // start the export
         try {
-            export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize);
+            export = sb.index.fulltext().export(format, filter, query, maxseconds, new File(path), dom, text, maxChunkSize, minified);
         } catch (final IOException e) {
             prop.put("lurlexporterror", 1);
             prop.put("lurlexporterror_exportfile", "-no export-");

@@ -34,8 +34,10 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -118,7 +120,7 @@ public final class Fulltext {
         this.writeWebgraph = false;
     }

-    public void setUseWebgraph(boolean check) {
+    public void setUseWebgraph(final boolean check) {
         this.writeWebgraph = check;
     }
@@ -142,8 +144,8 @@
         final File solrLocation = new File(this.segmentPath, SOLR_PATH);

         // migrate old solr to new
-        for (String oldVersion: SOLR_OLD_PATH) {
-            File oldLocation = new File(this.segmentPath, oldVersion);
+        for (final String oldVersion: SOLR_OLD_PATH) {
+            final File oldLocation = new File(this.segmentPath, oldVersion);
             if (oldLocation.exists()) {
                 if (!oldLocation.renameTo(solrLocation)) {
                     ConcurrentLog.severe("Fulltext", "Failed renaming old Solr location ("
@@ -183,11 +185,11 @@ public final class Fulltext {
         return this.solrInstances.getDefaultEmbeddedConnector();
     }

-    public EmbeddedSolrConnector getEmbeddedConnector(String corename) {
+    public EmbeddedSolrConnector getEmbeddedConnector(final String corename) {
         return this.solrInstances.getEmbeddedConnector(corename);
     }

-    public SolrConnector getConnectorForRead(String corename) {
+    public SolrConnector getConnectorForRead(final String corename) {
         if (this.solrInstances.isConnectedRemote()) return this.solrInstances.getRemoteConnector(corename);
         if (this.solrInstances.isConnectedEmbedded()) return this.solrInstances.getEmbeddedConnector(corename);
         return null;
@@ -315,7 +317,7 @@
     }

     private long lastCommit = 0;
-    public void commit(boolean softCommit) {
+    public void commit(final boolean softCommit) {
         final long t = System.currentTimeMillis();
         if (this.lastCommit + 10000 > t) return;
         this.lastCommit = t;
@@ -423,7 +425,7 @@
      * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
      * @throws IOException
      */
-    public void deleteStaleDomainHashes(final Set<String> hosthashes, Date freshdate) {
+    public void deleteStaleDomainHashes(final Set<String> hosthashes, final Date freshdate) {
         // delete in solr
         final Date now = new Date();
         deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes,
@@ -434,7 +436,7 @@
                 (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
     }

-    public void deleteStaleDomainNames(final Set<String> hostnames, Date freshdate) {
+    public void deleteStaleDomainNames(final Set<String> hostnames, final Date freshdate) {
         final Date now = new Date();
         deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_s.getSolrFieldName(), hostnames,
@@ -453,7 +455,7 @@
         deleteDomainWithConstraint(this.getDefaultConnector(), CollectionSchema.host_id_s.getSolrFieldName(), hosthashes, CollectionSchema.failreason_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM);
     }

-    private static void deleteDomainWithConstraint(SolrConnector connector, String fieldname, final Set<String> hosthashes, String constraintQuery) {
+    private static void deleteDomainWithConstraint(final SolrConnector connector, final String fieldname, final Set<String> hosthashes, final String constraintQuery) {
         if (hosthashes == null || hosthashes.size() == 0) return;
         final int subsetscount = 1 + (hosthashes.size() / 255); // if the list is too large, we get a "too many boolean clauses" exception
         int c = 0;
@@ -492,7 +494,7 @@
      * @param basepath the left path of the url; at least until the end of the host
      * @param freshdate either NULL or a date in the past which is the limit for deletion. Only documents older than this date are deleted
      */
-    public int remove(final String basepath, Date freshdate) {
+    public int remove(final String basepath, final Date freshdate) {
         DigestURL uri;
         try {uri = new DigestURL(basepath);} catch (final MalformedURLException e) {return 0;}
         final String host = uri.getHost();
@@ -690,15 +692,15 @@ public final class Fulltext {
     public static enum ExportFormat {
         text("txt"), html("html"), rss("rss"), solr("xml"), elasticsearch("flatjson");
         private final String ext;
-        private ExportFormat(String ext) {this.ext = ext;}
+        private ExportFormat(final String ext) {this.ext = ext;}
         public String getExt() {return this.ext;}
     }

     public final static String yacy_dump_prefix = "yacy_dump_";

     public Export export(
-            Fulltext.ExportFormat format, String filter, String query,
-            final int maxseconds, File path, boolean dom, boolean text,
-            long maxChunkSize) throws IOException {
+            final Fulltext.ExportFormat format, final String filter, String query,
+            final int maxseconds, final File path, final boolean dom, final boolean text,
+            final long maxChunkSize, final boolean minified) throws IOException {

         // modify query according to maxseconds
         final long now = System.currentTimeMillis();
@@ -763,13 +765,13 @@
             }
         }

-        String filename = yacy_dump_prefix +
+        final String filename = yacy_dump_prefix +
                 "f" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate) + "_" +
                 "l" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(lastdate) + "_" +
                 "n" + GenericFormatter.SHORT_MINUTE_FORMATTER.format(new Date(now)) + "_" +
                 "c" + String.format("%1$012d", doccount)+ "_tc"; // the name ends with the transaction token ('c' = 'created')

-        return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize);
+        return export(path, filename, format.getExt(), filter, query, format, dom, text, maxChunkSize, minified);
     }

     // export methods
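The dump filename thus encodes the date of the first (f) and last (l) exported document, the creation time (n), and a twelve-digit zero-padded document count (c), and ends with the transaction token _tc ('c' = 'created'). Assuming SHORT_MINUTE_FORMATTER renders dates as yyyyMMddHHmm, a hypothetical dump would be named yacy_dump_f202301010000_l202312312359_n202401150930_c000000012345_tc.flatjson.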
@@ -777,17 +779,17 @@
             final File path, final String filename,
             final String fileext, final String filter, final String query,
             final ExportFormat format, final boolean dom, final boolean text,
-            long maxChunkSize) {
+            final long maxChunkSize, final boolean minified) {
         if ((this.exportthread != null) && (this.exportthread.isAlive())) {
             ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running");
             return this.exportthread;
         }
-        this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize);
+        this.exportthread = new Export(path, filename, fileext, filter, query, format, dom, text, maxChunkSize, minified);
         this.exportthread.start();
         return this.exportthread;
     }

-    public static void main(String args[]) {
+    public static void main(final String args[]) {
         final Date firstdate = null;
         System.out.println(GenericFormatter.SHORT_MINUTE_FORMATTER.format(firstdate));
     }
@@ -796,6 +798,18 @@ public final class Fulltext {
         return this.exportthread;
     }

+    private final static Set<String> minified_keys = new HashSet<>();
+    static {
+        //minified_keys.add(CollectionSchema.id.getSolrFieldName());
+        minified_keys.add(CollectionSchema.sku.getSolrFieldName());
+        minified_keys.add(CollectionSchema.title.getSolrFieldName());
+        //minified_keys.add(CollectionSchema.author.getSolrFieldName());
+        minified_keys.add(CollectionSchema.description_txt.getSolrFieldName());
+        //minified_keys.add(CollectionSchema.size_i.getSolrFieldName());
+        minified_keys.add(CollectionSchema.last_modified.getSolrFieldName());
+        minified_keys.add(CollectionSchema.text_t.getSolrFieldName());
+    }
+
     public class Export extends Thread {
         private final File path;
         private final String filename, fileext;
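The whitelist is deliberately small: besides the URL (sku) and the full text (text_t), only title, description_txt and last_modified survive a minified dump. The commented-out lines document fields that were considered but left out (id, author, size_i).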
@@ -806,12 +820,13 @@
         private final boolean dom, text;
         private int docCount, chunkSize, chunkCount;
         private final long maxChunkSize;
+        private final boolean minified;

         private Export(
                 final File path, final String filename,
                 final String fileext, final String filter, final String query,
                 final ExportFormat format, final boolean dom, final boolean text,
-                long maxChunkSize) {
+                final long maxChunkSize, final boolean minified) {
             super("Fulltext.Export");
             // format: 0=text, 1=html, 2=rss/xml
             this.path = path;
@@ -827,10 +842,11 @@
             this.chunkSize = 0; // number of documents in the current chunk
             this.chunkCount = 0; // number of chunks opened so far
             this.maxChunkSize = maxChunkSize; // maximum number of documents per chunk
+            this.minified = minified;
             //if ((dom) && (format == 2)) dom = false;
         }

-        private void printHead(PrintWriter pw) {
+        private void printHead(final PrintWriter pw) {
             if (this.format == ExportFormat.html) {
                 pw.println("<html><head></head><body>");
             }
@@ -855,8 +871,8 @@
                 pw.println("<result>");
             }
         }

-        private void printTail(PrintWriter pw) {
+        private void printTail(final PrintWriter pw) {
             if (this.format == ExportFormat.html) {
                 pw.println("</body></html>");
             }
@@ -869,7 +885,7 @@
                 pw.println("</response>");
             }
         }

         @Override
         public void run() {
             try {
@@ -881,9 +897,9 @@
             }

             try {
-                docCount = 0;
-                chunkSize = 0;
-                chunkCount = 0;
+                this.docCount = 0;
+                this.chunkSize = 0;
+                this.chunkCount = 0;
                 PrintWriter pw = getWriter();
                 printHead(pw);
                 if (this.dom) {
@@ -902,6 +918,12 @@
                     while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                         final String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
                         if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
+                        if (this.minified) {
+                            final Iterator<Entry<String, Object>> i = doc.iterator();
+                            while (i.hasNext()) {
+                                if (!minified_keys.contains(i.next().getKey())) i.remove();
+                            }
+                        }
                         final CRIgnoreWriter sw = new CRIgnoreWriter();
                         if (this.text) sw.write((String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()));
                         if (this.format == ExportFormat.solr) EnhancedXMLResponseWriter.writeDoc(sw, doc);
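The removal works because SolrDocument implements Iterable<Map.Entry<String, Object>> over its field map, so Iterator.remove() deletes a field in place. A self-contained sketch of the same filtering (the helper class and method names are mine, not YaCy's):

    import java.util.Iterator;
    import java.util.Map.Entry;
    import java.util.Set;
    import org.apache.solr.common.SolrDocument;

    final class MinifyDemo { // hypothetical helper, not part of the commit
        /** Removes every field of doc whose name is not contained in keep. */
        static void minify(final SolrDocument doc, final Set<String> keep) {
            final Iterator<Entry<String, Object>> i = doc.iterator();
            while (i.hasNext()) {
                if (!keep.contains(i.next().getKey())) i.remove();
            }
        }
    }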
@@ -914,7 +936,8 @@
                         if (this.chunkSize >= this.maxChunkSize) {
                             printTail(pw);
                             pw.close();
-                            pw = getWriter(); // increases chunkCount as side-effect
+                            this.chunkCount++;
+                            pw = getWriter();
                             printHead(pw);
                             this.chunkSize = 0;
                         }
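A second, independent cleanup in this commit: getWriter() previously incremented chunkCount as a side effect, documented only in the removed comment. The increment now happens explicitly at both chunk-rollover sites (here and in the non-dom branch below) and is dropped from getWriter() in the last hunk.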
@@ -957,7 +980,8 @@
                         if (this.chunkSize >= this.maxChunkSize) {
                             printTail(pw);
                             pw.close();
-                            pw = getWriter(); // increases chunkCount as side-effect
+                            this.chunkCount++;
+                            pw = getWriter();
                             printHead(pw);
                             this.chunkSize = 0;
                         }
@@ -980,14 +1004,13 @@
         }

         private PrintWriter getWriter() throws IOException {
-            File f = file();
+            final File f = file();
            final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(f.getAbsolutePath() + ".gz") : f);
            final PrintWriter pw = new PrintWriter(new BufferedOutputStream(((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os));
-            this.chunkCount++;
             return pw;
         }

-        private String chunkcount(int count) {
+        private String chunkcount(final int count) {
             if (count < 10) return "000" + count;
             if (count < 100) return "00" + count;
             if (count < 1000) return "0" + count;
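The hand-rolled padding in chunkcount() is equivalent to a single format call for any non-negative count:

    final String chunk = String.format("%04d", count); // zero-pad to at least four digits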
