From c7576d60286ae25a26c651de14343122b9836882 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Fri, 29 May 2015 15:05:52 +0200
Subject: [PATCH] added a full solr export to the IndexControlURLs_p.html
 servlet. This export format is now also the default export option. The
 export file format for a full solr export is very similar to a solr search
 result xml, only the <lst name="responseHeader"> tag is missing.

The exported xml has a special line termination feature: each document
is exported into a single line of its own, without any line break
(CR/LF) inside it. That means that every document is completely inside
a single line. While this is not readable at all for humans, it is very
useful for linux line processing scripts, like grep. Using grep it is
easy to select the single documents which match a given pattern.

Such dumps shall be importable with the DATA/SURROGATE/in import
function, but that import is not yet adapted to the new file format.
---
 htroot/IndexControlURLs_p.html                | 33 ++++---
 htroot/IndexControlURLs_p.java                |  4 +-
 source/net/yacy/cora/util/CRIgnoreWriter.java | 97 +++++++++++++++++++
 source/net/yacy/search/index/Fulltext.java    | 94 +++++++++++-------
 4 files changed, 178 insertions(+), 50 deletions(-)
 create mode 100644 source/net/yacy/cora/util/CRIgnoreWriter.java
diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html
index e4eee30f5..0d245a6ab 100644
--- a/htroot/IndexControlURLs_p.html
+++ b/htroot/IndexControlURLs_p.html
@@ -118,7 +118,7 @@ function updatepage(str) {
     <form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
     <fieldset><legend>Statistics about top-domains in URL Database</legend>
       <dl>
-        <dt class="TableCellDark">&nbsp;</dt>
+        <dt>&nbsp;</dt>
         <dd>Show top <input type="text" name="lines" value="#[lines]#" size="6" maxlength="6" /> domains from all URLs.
             <input type="submit" name="statistics" value="Generate Statistics" class="btn btn-primary" style="width:240px;"/>
         </dd>
@@ -157,7 +157,7 @@ function updatepage(str) {
     <form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
     <fieldset><legend>Dump and Restore of Solr Index</legend>
       <dl>
-        <dt class="TableCellDark">&nbsp;</dt>
+        <dt>&nbsp;</dt>
         <dd><input type="submit" name="indexdump" value="Create Dump" class="btn btn-primary" style="width:240px;"/>
         </dd>
       </dl>
@@ -165,7 +165,7 @@ function updatepage(str) {
         <dt class="TableCellDark">Dump File</dt>
         <dd><input type="text" name="dumpfile" value="#[dumpfile]#" size="80" maxlength="250" />
         </dd>
-        <dt class="TableCellDark">&nbsp;</dt>
+        <dt>&nbsp;</dt>
         <dd><input type="submit" name="indexrestore" value="Restore Dump" class="btn btn-primary" style="width:240px;"/>
         </dd>
       </dl>
@@ -174,7 +174,7 @@ function updatepage(str) {
     <form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
     <fieldset><legend>Optimize Solr</legend>
       <dl>
-        <dt class="TableCellDark">&nbsp;</dt>
+        <dt>&nbsp;</dt>
         <dd>merge to max. <input type="text" name="optimizemax" value="#[optimizemax]#" size="6" maxlength="6" /> segments
         <input type="submit" name="optimizesolr" value="Optimize Solr" class="btn btn-primary" style="width:240px;"/>
         </dd>
@@ -184,7 +184,7 @@ function updatepage(str) {
     <form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
     <fieldset><legend>Reboot Solr Core</legend>
       <dl>
-        <dt class="TableCellDark">&nbsp;</dt>
+        <dt>&nbsp;</dt>
         <dd><input type="submit" name="rebootsolr" value="Shut Down and Re-Start Solr" class="btn btn-primary" style="width:240px;"/>
         </dd>
       </dl>
@@ -206,16 +206,19 @@ function updatepage(str) {
         <dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
         </dd>
         <dt class="TableCellDark">Export Format</dt>
-        <dd>Only Domain:
-            <input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;
-            <input type="radio" name="format" value="dom-html" checked="checked" />HTML (domains as URLs, no title)<br />
-            Full URL List:
-            <input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
-            <input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
-            <input type="radio" name="format" value="url-rss" />XML (RSS)
-            <br />
-        </dd>
-        <dt class="TableCellDark">&nbsp;</dt>
+        <dd>
+        <dl>
+        <dt>Only Domain:</dt>
+        <dd><input type="radio" name="format" value="dom-text" /> Plain Text List (domains only)<br />
+            <input type="radio" name="format" value="dom-html" /> HTML (domains as URLs, no title)</dd>
+        <dt>Full URL List:</dt>
+        <dd><input type="radio" name="format" value="url-text" /> Plain Text List (URLs only)<br />
+            <input type="radio" name="format" value="url-html" /> HTML (URLs with title)</dd>
+        <dt>Full Data Records:</dt>
+        <dd><input type="radio" name="format" value="full-rss" /> XML (RSS)<br />
+            <input type="radio" name="format" value="full-solr" checked="checked" /> XML (Rich and full Solr data using Solr Schema, can be imported with DATA/SURROGATE/in/)</dd>
+		</dl></dd>
+        <dt>&nbsp;</dt>
         <dd><input type="submit" name="lurlexport" value="Export URLs" class="btn btn-primary" style="width:240px;"/>
         </dd>
       </dl>
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java
index d0ed3cda9..bf35bda5f 100644
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -256,13 +256,15 @@ public class IndexControlURLs_p {
             if (fname.endsWith("text")) format = 0;
             if (fname.endsWith("html")) format = 1;
             if (fname.endsWith("rss")) format = 2;
+            if (fname.endsWith("solr")) format = 3;
 
             // extend export file name
             String s = post.get("exportfile", "");
             if (s.indexOf('.',0) < 0) {
                 if (format == 0) s = s + ".txt";
                 if (format == 1) s = s + ".html";
-                if (format == 2) s = s + ".xml";
+                if (format == 2 ) s = s + "_rss.xml";
+                if (format == 3) s = s + "_full.xml";
             }
             final File f = new File(s);
             f.getParentFile().mkdirs();
diff --git a/source/net/yacy/cora/util/CRIgnoreWriter.java b/source/net/yacy/cora/util/CRIgnoreWriter.java
new file mode 100644
index 000000000..e6b1d74f0
--- /dev/null
+++ b/source/net/yacy/cora/util/CRIgnoreWriter.java
@@ -0,0 +1,97 @@
+/**
+ *  CRIgnoreWriter
+ *  Copyright 29.5.2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.cora.util;
+
+import java.io.StringWriter;
+
+public class CRIgnoreWriter extends StringWriter {
+    // StringWriter variant that silently drops every char with a code below 32, so CR, LF and also TAB never reach the buffer.
+    public CRIgnoreWriter() {
+        super();
+    }
+    // initialSize is handed unchanged to the StringWriter(int) constructor.
+    public CRIgnoreWriter(final int initialSize) {
+        super(initialSize);
+    }
+    // Single char: stored only when its code is 32 or higher, otherwise discarded without any error.
+    @Override
+    public void write(int c) {
+        if (c >= 32) super.write(c);
+    }
+    // Char array slice: the range is validated first (IndexOutOfBoundsException), then the chars are copied one by one through the filter.
+    @Override
+    public void write(char cbuf[], int off, int len) {
+        if ((off < 0) || (off > cbuf.length) || (len < 0) ||
+            ((off + len) > cbuf.length) || ((off + len) < 0)) { // the last test catches int overflow of off + len
+            throw new IndexOutOfBoundsException();
+        } else if (len == 0) {
+            return;
+        }
+        int p = off;
+        char c;
+        for (int i = 0; i < len; i++) {
+            c = cbuf[p];
+            if (c >= 32) super.write(c); // c is a char, so this resolves to StringWriter.write(int)
+            p++;
+        }
+    }
+    // Whole string: every char is passed through the same filter; a null str fails with a NullPointerException at str.length().
+    @Override
+    public void write(String str) {
+        int len = str.length();
+        char c;
+        for (int i = 0; i < len; i++) {
+            c = str.charAt(i);
+            if (c >= 32) super.write(c);
+        }
+    }
+    // String slice: no up-front range check here; a bad off/len only fails inside charAt, after earlier chars may already be written.
+    @Override
+    public void write(String str, int off, int len)  {
+        int p = off;
+        char c;
+        for (int i = 0; i < len; i++) {
+            c = str.charAt(p);
+            if (c >= 32) super.write(c);
+            p++;
+        }
+    }
+    // A null sequence is appended as the text "null"; everything goes through write(String) and is therefore filtered.
+    @Override
+    public CRIgnoreWriter append(CharSequence csq) {
+        this.write(csq == null ? "null" : csq.toString());
+        return this;
+    }
+    // Appends the filtered sub-sequence [start, end) of csq; for a null csq the range is taken from the text "null".
+    @Override
+    public CRIgnoreWriter append(CharSequence csq, int start, int end) {
+        CharSequence cs = (csq == null ? "null" : csq);
+        this.write(cs.subSequence(start, end).toString());
+        return this;
+    }
+    // Single char append; write(c) resolves to write(int) above, which repeats the same >= 32 test.
+    @Override
+    public CRIgnoreWriter append(char c) {
+        if (c >= 32) write(c);
+        return this;
+    }
+
+}
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 27cd5607a..6d70297e6 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -27,6 +27,7 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.PrintWriter;
+import java.io.StringWriter;
 import java.lang.reflect.Array;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
@@ -55,11 +56,13 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
 import net.yacy.cora.federate.solr.instance.InstanceMirror;
 import net.yacy.cora.federate.solr.instance.RemoteInstance;
 import net.yacy.cora.federate.solr.instance.ShardInstance;
+import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
 import net.yacy.cora.storage.ZIPReader;
 import net.yacy.cora.storage.ZIPWriter;
+import net.yacy.cora.util.CRIgnoreWriter;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -75,6 +78,7 @@ import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.core.SolrInfoMBean;
+import org.apache.commons.io.output.StringBuilderWriter;
 import org.apache.lucene.util.Version;
 
 public final class Fulltext {
@@ -666,8 +670,11 @@ public final class Fulltext {
                     pw.println("<description></description>");
                     pw.println("<link>http://yacy.net</link>");
                 }
-                
-               
+                if (this.format == 3) {
+                    pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+                    pw.println("<response>");
+                    pw.println("<result>");
+                }
                 if (this.dom) {
                     Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
                     ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
@@ -678,40 +685,55 @@ public final class Fulltext {
                         this.count++;
                     }
                 } else {
-                    BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, 
-                            CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
-                            CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
-                    SolrDocument doc;
-                    String url, hash, title, author, description;
-                    Integer size;
-                    Date date;
-                    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
-                        hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
-                        url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
-                        title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
-                        author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName()));
-                        description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()));
-                        size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
-                        date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
-                        if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
-                        if (this.format == 0) {
-                            pw.println(url);
-                        }
-                        if (this.format == 1) {
-                            if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
+                    if (this.format < 3) {
+                        BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, 
+                                CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
+                                CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
+                        SolrDocument doc;
+                        String url, hash, title, author, description;
+                        Integer size;
+                        Date date;
+                        while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                            hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
+                            url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+                            title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
+                            author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName()));
+                            description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()));
+                            size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
+                            date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
+                            if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
+                            if (this.format == 0) {
+                                pw.println(url);
+                            }
+                            if (this.format == 1) {
+                                if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
+                            }
+                            if (this.format == 2) {
+                                pw.println("<item>");
+                                if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
+                                pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
+                                if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
+                                if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
+                                if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
+                                if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
+                                pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
+                                pw.println("</item>");
+                            }
+                            this.count++;
                         }
-                        if (this.format == 2) {
-                            pw.println("<item>");
-                            if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
-                            pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
-                            if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
-                            if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
-                            if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
-                            if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
-                            pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
-                            pw.println("</item>");
+                    } else {
+                        BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true);
+                        SolrDocument doc;
+                        while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                            String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+                            if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
+                            CRIgnoreWriter sw = new CRIgnoreWriter();
+                            EnhancedXMLResponseWriter.writeDoc(sw, doc);
+                            sw.close();
+                            String d = sw.toString();
+                            pw.println(d);
+                            this.count++;
                         }
-                        this.count++;
                     }
                 }
                 if (this.format == 1) {
@@ -721,6 +743,10 @@ public final class Fulltext {
                     pw.println("</channel>");
                     pw.println("</rss>");
                 }
+                if (this.format == 3) {
+                    pw.println("</result>");
+                    pw.println("</response>");
+                }
                 pw.close();
             } catch (final IOException e) {
                 ConcurrentLog.logException(e);