From c7576d60286ae25a26c651de14343122b9836882 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 29 May 2015 15:05:52 +0200 Subject: [PATCH] added a full solr export to the IndexControlURLs_p.html servlet. The export function is also now the default export option. The export file format for a full solr export is very similar to a solr search result xml, only the tag is missing. The exported xml has a special line termination feature: all documents will be exported into a single line without any CR in between. That means that every document is completely inside a single line. While this is not readable at all for humans, it is very useful for linux line processing scripts, like grep. Using grep it will be easy to select single documents which match a given pattern. Such dumps shall be importable with the DATA/SURROGATE/in import function, but that import is not yet adapted to the new file format. --- htroot/IndexControlURLs_p.html | 33 ++++--- htroot/IndexControlURLs_p.java | 4 +- source/net/yacy/cora/util/CRIgnoreWriter.java | 97 +++++++++++++++++++ source/net/yacy/search/index/Fulltext.java | 94 +++++++++++------- 4 files changed, 178 insertions(+), 50 deletions(-) create mode 100644 source/net/yacy/cora/util/CRIgnoreWriter.java diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index e4eee30f5..0d245a6ab 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -118,7 +118,7 @@ function updatepage(str) {
Statistics about top-domains in URL Database
-
 
+
 
Show top domains from all URLs.
@@ -157,7 +157,7 @@ function updatepage(str) {
Dump and Restore of Solr Index
-
 
+
 
@@ -165,7 +165,7 @@ function updatepage(str) {
Dump File
-
 
+
 
@@ -174,7 +174,7 @@ function updatepage(str) {
Optimize Solr
-
 
+
 
merge to max. segments
@@ -184,7 +184,7 @@ function updatepage(str) {
Reboot Solr Core
-
 
+
 
@@ -206,16 +206,19 @@ function updatepage(str) {
Export Format
-
Only Domain: - Plain Text List (domains only)   - HTML (domains as URLs, no title)
- Full URL List: - Plain Text List (URLs only)        - HTML (URLs with title)       - XML (RSS) -
-
-
 
+
+
+
Only Domain:
+
Plain Text List (domains only)
+ HTML (domains as URLs, no title)
+
Full URL List:
+
Plain Text List (URLs only)
+ HTML (URLs with title)
+
Full Data Records:
+
XML (RSS)
+ XML (Rich and full Solr data using Solr Schema, can be imported with DATA/SURROGATE/in/)
+
+
 
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index d0ed3cda9..bf35bda5f 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -256,13 +256,15 @@ public class IndexControlURLs_p { if (fname.endsWith("text")) format = 0; if (fname.endsWith("html")) format = 1; if (fname.endsWith("rss")) format = 2; + if (fname.endsWith("solr")) format = 3; // extend export file name String s = post.get("exportfile", ""); if (s.indexOf('.',0) < 0) { if (format == 0) s = s + ".txt"; if (format == 1) s = s + ".html"; - if (format == 2) s = s + ".xml"; + if (format == 2 ) s = s + "_rss.xml"; + if (format == 3) s = s + "_full.xml"; } final File f = new File(s); f.getParentFile().mkdirs(); diff --git a/source/net/yacy/cora/util/CRIgnoreWriter.java b/source/net/yacy/cora/util/CRIgnoreWriter.java new file mode 100644 index 000000000..e6b1d74f0 --- /dev/null +++ b/source/net/yacy/cora/util/CRIgnoreWriter.java @@ -0,0 +1,97 @@ +/** + * CRIgnoreWriter + * Copyright 29.5.2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + + +package net.yacy.cora.util; + +import java.io.StringWriter; + +public class CRIgnoreWriter extends StringWriter { + + public CRIgnoreWriter() { + super(); + } + + public CRIgnoreWriter(final int initialSize) { + super(initialSize); + } + + @Override + public void write(int c) { + if (c >= 32) super.write(c); + } + + @Override + public void write(char cbuf[], int off, int len) { + if ((off < 0) || (off > cbuf.length) || (len < 0) || + ((off + len) > cbuf.length) || ((off + len) < 0)) { + throw new IndexOutOfBoundsException(); + } else if (len == 0) { + return; + } + int p = off; + char c; + for (int i = 0; i < len; i++) { + c = cbuf[p]; + if (c >= 32) super.write(c); + p++; + } + } + + @Override + public void write(String str) { + int len = str.length(); + char c; + for (int i = 0; i < len; i++) { + c = str.charAt(i); + if (c >= 32) super.write(c); + } + } + + @Override + public void write(String str, int off, int len) { + int p = off; + char c; + for (int i = 0; i < len; i++) { + c = str.charAt(p); + if (c >= 32) super.write(c); + p++; + } + } + + @Override + public CRIgnoreWriter append(CharSequence csq) { + this.write(csq == null ? "null" : csq.toString()); + return this; + } + + @Override + public CRIgnoreWriter append(CharSequence csq, int start, int end) { + CharSequence cs = (csq == null ? 
"null" : csq); + this.write(cs.subSequence(start, end).toString()); + return this; + } + + @Override + public CRIgnoreWriter append(char c) { + if (c >= 32) write(c); + return this; + } + +} diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 27cd5607a..6d70297e6 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -27,6 +27,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; +import java.io.StringWriter; import java.lang.reflect.Array; import java.net.MalformedURLException; import java.util.ArrayList; @@ -55,11 +56,13 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.federate.solr.instance.InstanceMirror; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.solr.instance.ShardInstance; +import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.storage.ZIPReader; import net.yacy.cora.storage.ZIPWriter; +import net.yacy.cora.util.CRIgnoreWriter; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -75,6 +78,7 @@ import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.core.SolrInfoMBean; +import org.apache.commons.io.output.StringBuilderWriter; import org.apache.lucene.util.Version; public final class Fulltext { @@ -666,8 +670,11 @@ public final class Fulltext { pw.println(""); pw.println("http://yacy.net"); } - - + if (this.format == 3) { + pw.println(""); + pw.println(""); + pw.println(""); + } if (this.dom) { Map> scores = 
Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); @@ -678,40 +685,55 @@ public final class Fulltext { this.count++; } } else { - BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, - CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), - CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); - SolrDocument doc; - String url, hash, title, author, description; - Integer size; - Date date; - while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); - url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); - title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName())); - author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName())); - description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName())); - size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); - date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); - if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; - if (this.format == 0) { - pw.println(url); - } - if (this.format == 1) { - if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); + if (this.format < 3) { + BlockingQueue docs = 
Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, + CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), + CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); + SolrDocument doc; + String url, hash, title, author, description; + Integer size; + Date date; + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName())); + author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName())); + description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName())); + size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); + date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); + if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + if (this.format == 0) { + pw.println(url); + } + if (this.format == 1) { + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); + } + if (this.format == 2) { + pw.println(""); + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); + pw.println("" + MultiProtocolURL.escape(url) + ""); + if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); + if (description != null && !description.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(description, true) + ""); + if (date != null) pw.println("" + 
HeaderFramework.formatRFC1123(date) + ""); + if (size != null) pw.println("" + size.intValue() + ""); + pw.println("" + hash + ""); + pw.println(""); + } + this.count++; } - if (this.format == 2) { - pw.println(""); - if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); - pw.println("" + MultiProtocolURL.escape(url) + ""); - if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); - if (description != null && !description.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(description, true) + ""); - if (date != null) pw.println("" + HeaderFramework.formatRFC1123(date) + ""); - if (size != null) pw.println("" + size.intValue() + ""); - pw.println("" + hash + ""); - pw.println(""); + } else { + BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true); + SolrDocument doc; + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + CRIgnoreWriter sw = new CRIgnoreWriter(); + EnhancedXMLResponseWriter.writeDoc(sw, doc); + sw.close(); + String d = sw.toString(); + pw.println(d); + this.count++; } - this.count++; } } if (this.format == 1) { @@ -721,6 +743,10 @@ public final class Fulltext { pw.println(""); pw.println(""); } + if (this.format == 3) { + pw.println(""); + pw.println(""); + } pw.close(); } catch (final IOException e) { ConcurrentLog.logException(e);