added a full solr export to the IndexControlURLs_p.html servlet. The

export function is also now the default export option. The export file
format for a full solr export is very similar to a solr search result
xml, only the <lst name="responseHeader"> tag is missing.

The exported xml has a special line termination feature: all documents
will be exported into a single line without any CR in between. That
means that every document is completely inside a single line. While this
is not readable at all for humans, it is very useful for linux line
processing scripts, like grep. Using grep it will be easy to select
single documents which match a given pattern.

Such dumps shall be importable with the DATA/SURROGATE/in import
function, but that import is not yet adapted to the new file format.
pull/8/head
Michael Peter Christen 10 years ago
parent 47682bf467
commit c7576d6028

@ -118,7 +118,7 @@ function updatepage(str) {
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Statistics about top-domains in URL Database</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dt>&nbsp;</dt>
<dd>Show top <input type="text" name="lines" value="#[lines]#" size="6" maxlength="6" /> domains from all URLs.
<input type="submit" name="statistics" value="Generate Statistics" class="btn btn-primary" style="width:240px;"/>
</dd>
@ -157,7 +157,7 @@ function updatepage(str) {
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Dump and Restore of Solr Index</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dt>&nbsp;</dt>
<dd><input type="submit" name="indexdump" value="Create Dump" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
@ -165,7 +165,7 @@ function updatepage(str) {
<dt class="TableCellDark">Dump File</dt>
<dd><input type="text" name="dumpfile" value="#[dumpfile]#" size="80" maxlength="250" />
</dd>
<dt class="TableCellDark">&nbsp;</dt>
<dt>&nbsp;</dt>
<dd><input type="submit" name="indexrestore" value="Restore Dump" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
@ -174,7 +174,7 @@ function updatepage(str) {
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Optimize Solr</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dt>&nbsp;</dt>
<dd>merge to max. <input type="text" name="optimizemax" value="#[optimizemax]#" size="6" maxlength="6" /> segments
<input type="submit" name="optimizesolr" value="Optimize Solr" class="btn btn-primary" style="width:240px;"/>
</dd>
@ -184,7 +184,7 @@ function updatepage(str) {
<form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset><legend>Reboot Solr Core</legend>
<dl>
<dt class="TableCellDark">&nbsp;</dt>
<dt>&nbsp;</dt>
<dd><input type="submit" name="rebootsolr" value="Shut Down and Re-Start Solr" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>
@ -206,16 +206,19 @@ function updatepage(str) {
<dd><input type="text" name="exportquery" value="*:*" size="20" maxlength="250" />
</dd>
<dt class="TableCellDark">Export Format</dt>
<dd>Only Domain:
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;
<input type="radio" name="format" value="dom-html" checked="checked" />HTML (domains as URLs, no title)<br />
Full URL List:
<input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-rss" />XML (RSS)
<br />
</dd>
<dt class="TableCellDark">&nbsp;</dt>
<dd>
<dl>
<dt>Only Domain:</dt>
<dd><input type="radio" name="format" value="dom-text" /> Plain Text List (domains only)<br />
<input type="radio" name="format" value="dom-html" /> HTML (domains as URLs, no title)</dd>
<dt>Full URL List:</dt>
<dd><input type="radio" name="format" value="url-text" /> Plain Text List (URLs only)<br />
<input type="radio" name="format" value="url-html" /> HTML (URLs with title)</dd>
<dt>Full Data Records:</dt>
<dd><input type="radio" name="format" value="full-rss" /> XML (RSS)<br />
<input type="radio" name="format" value="full-solr" checked="checked" /> XML (Rich and full Solr data using Solr Schema, can be imported with DATA/SURROGATE/in/)</dd>
</dl></dd>
<dt>&nbsp;</dt>
<dd><input type="submit" name="lurlexport" value="Export URLs" class="btn btn-primary" style="width:240px;"/>
</dd>
</dl>

@ -256,13 +256,15 @@ public class IndexControlURLs_p {
if (fname.endsWith("text")) format = 0;
if (fname.endsWith("html")) format = 1;
if (fname.endsWith("rss")) format = 2;
if (fname.endsWith("solr")) format = 3;
// extend export file name
String s = post.get("exportfile", "");
if (s.indexOf('.',0) < 0) {
if (format == 0) s = s + ".txt";
if (format == 1) s = s + ".html";
if (format == 2) s = s + ".xml";
if (format == 2 ) s = s + "_rss.xml";
if (format == 3) s = s + "_full.xml";
}
final File f = new File(s);
f.getParentFile().mkdirs();

@ -0,0 +1,97 @@
/**
* CRIgnoreWriter
* Copyright 29.5.2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.util;
import java.io.StringWriter;
/**
 * A {@link StringWriter} that drops control characters while writing.
 *
 * <p>Every character (or {@code int} code) below 32 is silently discarded by all
 * {@code write} and {@code append} variants. This removes CR and LF, but also
 * tabs and every other character in that range, so the collected text always
 * ends up on a single line.
 */
public class CRIgnoreWriter extends StringWriter {

    /** Creates a writer backed by a buffer of the default initial size. */
    public CRIgnoreWriter() {
        super();
    }

    /**
     * Creates a writer backed by a buffer of the given initial size.
     *
     * @param initialSize initial capacity handed to {@link StringWriter}
     */
    public CRIgnoreWriter(final int initialSize) {
        super(initialSize);
    }

    /** Stores {@code ch} in the underlying buffer unless its code is below 32. */
    private void emit(final char ch) {
        if (ch >= 32) {
            super.write(ch);
        }
    }

    /**
     * Writes a single character code; codes below 32 (including negative values)
     * are ignored.
     */
    @Override
    public void write(int c) {
        if (c < 32) {
            return;
        }
        super.write(c);
    }

    /**
     * Writes {@code len} characters of {@code cbuf} starting at {@code off},
     * skipping every character below 32.
     *
     * @throws IndexOutOfBoundsException if the requested range is not inside {@code cbuf}
     */
    @Override
    public void write(char[] cbuf, int off, int len) {
        final int end = off + len;
        if (off < 0 || off > cbuf.length || len < 0 || end > cbuf.length || end < 0) {
            throw new IndexOutOfBoundsException();
        }
        // the range is validated, so an empty range simply skips the loop
        for (int pos = off; pos < end; pos++) {
            emit(cbuf[pos]);
        }
    }

    /** Writes all characters of {@code str} except those below 32. */
    @Override
    public void write(String str) {
        for (int i = 0, n = str.length(); i < n; i++) {
            emit(str.charAt(i));
        }
    }

    /**
     * Writes {@code len} characters of {@code str} starting at {@code off},
     * skipping every character below 32. The range is not checked up front:
     * characters are copied one by one until {@code charAt} rejects an index.
     */
    @Override
    public void write(String str, int off, int len) {
        int pos = off;
        for (int remaining = len; remaining > 0; remaining--) {
            emit(str.charAt(pos));
            pos++;
        }
    }

    /**
     * Appends the filtered text of {@code csq}; a {@code null} sequence is
     * appended as the four characters {@code "null"}.
     *
     * @return this writer
     */
    @Override
    public CRIgnoreWriter append(CharSequence csq) {
        // String.valueOf(Object) yields "null" for a null reference, else toString()
        this.write(String.valueOf(csq));
        return this;
    }

    /**
     * Appends the filtered sub-sequence {@code [start, end)} of {@code csq};
     * a {@code null} sequence is treated as the text {@code "null"}.
     *
     * @return this writer
     */
    @Override
    public CRIgnoreWriter append(CharSequence csq, int start, int end) {
        final CharSequence source = (csq != null) ? csq : "null";
        final String slice = source.subSequence(start, end).toString();
        this.write(slice);
        return this;
    }

    /**
     * Appends a single character unless it is below 32.
     *
     * @return this writer
     */
    @Override
    public CRIgnoreWriter append(char c) {
        if (c >= 32) {
            write(c);
        }
        return this;
    }
}

@ -27,6 +27,7 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.lang.reflect.Array;
import java.net.MalformedURLException;
import java.util.ArrayList;
@ -55,11 +56,13 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.InstanceMirror;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.solr.instance.ShardInstance;
import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.ZIPReader;
import net.yacy.cora.storage.ZIPWriter;
import net.yacy.cora.util.CRIgnoreWriter;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@ -75,6 +78,7 @@ import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrInfoMBean;
import org.apache.commons.io.output.StringBuilderWriter;
import org.apache.lucene.util.Version;
public final class Fulltext {
@ -666,8 +670,11 @@ public final class Fulltext {
pw.println("<description></description>");
pw.println("<link>http://yacy.net</link>");
}
if (this.format == 3) {
pw.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
pw.println("<response>");
pw.println("<result>");
}
if (this.dom) {
Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName());
ReversibleScoreMap<String> stats = scores.get(CollectionSchema.host_s.getSolrFieldName());
@ -678,40 +685,55 @@ public final class Fulltext {
this.count++;
}
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
SolrDocument doc;
String url, hash, title, author, description;
Integer size;
Date date;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName()));
description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()));
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.format == 0) {
pw.println(url);
if (this.format < 3) {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
SolrDocument doc;
String url, hash, title, author, description;
Integer size;
Date date;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName()));
author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName()));
description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()));
size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName());
date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName());
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
if (this.format == 0) {
pw.println(url);
}
if (this.format == 1) {
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
}
if (this.format == 2) {
pw.println("<item>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
pw.println("</item>");
}
this.count++;
}
if (this.format == 1) {
if (title != null) pw.println("<a href=\"" + MultiProtocolURL.escape(url) + "\">" + CharacterCoding.unicode2xml(title, true) + "</a>");
}
if (this.format == 2) {
pw.println("<item>");
if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml(title, true) + "</title>");
pw.println("<link>" + MultiProtocolURL.escape(url) + "</link>");
if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
if (date != null) pw.println("<pubDate>" + HeaderFramework.formatRFC1123(date) + "</pubDate>");
if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
pw.println("</item>");
} else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true);
SolrDocument doc;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
CRIgnoreWriter sw = new CRIgnoreWriter();
EnhancedXMLResponseWriter.writeDoc(sw, doc);
sw.close();
String d = sw.toString();
pw.println(d);
this.count++;
}
this.count++;
}
}
if (this.format == 1) {
@ -721,6 +743,10 @@ public final class Fulltext {
pw.println("</channel>");
pw.println("</rss>");
}
if (this.format == 3) {
pw.println("</result>");
pw.println("</response>");
}
pw.close();
} catch (final IOException e) {
ConcurrentLog.logException(e);

Loading…
Cancel
Save