diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index e4eee30f5..0d245a6ab 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -118,7 +118,7 @@ function updatepage(str) {
Statistics about top-domains in URL Database
-
 
+
 
Show top domains from all URLs.
@@ -157,7 +157,7 @@ function updatepage(str) {
Dump and Restore of Solr Index
-
 
+
 
@@ -165,7 +165,7 @@ function updatepage(str) {
Dump File
-
 
+
 
@@ -174,7 +174,7 @@ function updatepage(str) {
Optimize Solr
-
 
+
 
merge to max. segments
@@ -184,7 +184,7 @@ function updatepage(str) {
Reboot Solr Core
-
 
+
 
@@ -206,16 +206,19 @@ function updatepage(str) {
Export Format
-
Only Domain: - Plain Text List (domains only)   - HTML (domains as URLs, no title)
- Full URL List: - Plain Text List (URLs only)        - HTML (URLs with title)       - XML (RSS) -
-
-
 
+
+
+
Only Domain:
+
Plain Text List (domains only)
+ HTML (domains as URLs, no title)
+
Full URL List:
+
Plain Text List (URLs only)
+ HTML (URLs with title)
+
Full Data Records:
+
XML (RSS)
+ XML (Rich and full Solr data using Solr Schema, can be imported with DATA/SURROGATE/in/)
+
+
 
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index d0ed3cda9..bf35bda5f 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -256,13 +256,15 @@ public class IndexControlURLs_p { if (fname.endsWith("text")) format = 0; if (fname.endsWith("html")) format = 1; if (fname.endsWith("rss")) format = 2; + if (fname.endsWith("solr")) format = 3; // extend export file name String s = post.get("exportfile", ""); if (s.indexOf('.',0) < 0) { if (format == 0) s = s + ".txt"; if (format == 1) s = s + ".html"; - if (format == 2) s = s + ".xml"; + if (format == 2 ) s = s + "_rss.xml"; + if (format == 3) s = s + "_full.xml"; } final File f = new File(s); f.getParentFile().mkdirs(); diff --git a/source/net/yacy/cora/util/CRIgnoreWriter.java b/source/net/yacy/cora/util/CRIgnoreWriter.java new file mode 100644 index 000000000..e6b1d74f0 --- /dev/null +++ b/source/net/yacy/cora/util/CRIgnoreWriter.java @@ -0,0 +1,97 @@ +/** + * CRIgnoreWriter + * Copyright 29.5.2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>.
+ */ + + +package net.yacy.cora.util; + +import java.io.StringWriter; + +public class CRIgnoreWriter extends StringWriter { + + public CRIgnoreWriter() { + super(); + } + + public CRIgnoreWriter(final int initialSize) { + super(initialSize); + } + + @Override + public void write(int c) { + if (c >= 32) super.write(c); + } + + @Override + public void write(char cbuf[], int off, int len) { + if ((off < 0) || (off > cbuf.length) || (len < 0) || + ((off + len) > cbuf.length) || ((off + len) < 0)) { + throw new IndexOutOfBoundsException(); + } else if (len == 0) { + return; + } + int p = off; + char c; + for (int i = 0; i < len; i++) { + c = cbuf[p]; + if (c >= 32) super.write(c); + p++; + } + } + + @Override + public void write(String str) { + int len = str.length(); + char c; + for (int i = 0; i < len; i++) { + c = str.charAt(i); + if (c >= 32) super.write(c); + } + } + + @Override + public void write(String str, int off, int len) { + int p = off; + char c; + for (int i = 0; i < len; i++) { + c = str.charAt(p); + if (c >= 32) super.write(c); + p++; + } + } + + @Override + public CRIgnoreWriter append(CharSequence csq) { + this.write(csq == null ? "null" : csq.toString()); + return this; + } + + @Override + public CRIgnoreWriter append(CharSequence csq, int start, int end) { + CharSequence cs = (csq == null ? 
"null" : csq); + this.write(cs.subSequence(start, end).toString()); + return this; + } + + @Override + public CRIgnoreWriter append(char c) { + if (c >= 32) write(c); + return this; + } + +} diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 27cd5607a..6d70297e6 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -27,6 +27,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; +import java.io.StringWriter; import java.lang.reflect.Array; import java.net.MalformedURLException; import java.util.ArrayList; @@ -55,11 +56,13 @@ import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.federate.solr.instance.InstanceMirror; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.solr.instance.ShardInstance; +import net.yacy.cora.federate.solr.responsewriter.EnhancedXMLResponseWriter; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.storage.ZIPReader; import net.yacy.cora.storage.ZIPWriter; +import net.yacy.cora.util.CRIgnoreWriter; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -75,6 +78,7 @@ import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.core.SolrInfoMBean; +import org.apache.commons.io.output.StringBuilderWriter; import org.apache.lucene.util.Version; public final class Fulltext { @@ -666,8 +670,11 @@ public final class Fulltext { pw.println(""); pw.println("http://yacy.net"); } - - + if (this.format == 3) { + pw.println(""); + pw.println(""); + pw.println(""); + } if (this.dom) { Map> scores = 
Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); @@ -678,40 +685,55 @@ public final class Fulltext { this.count++; } } else { - BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, - CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), - CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); - SolrDocument doc; - String url, hash, title, author, description; - Integer size; - Date date; - while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); - url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); - title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName())); - author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName())); - description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName())); - size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); - date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); - if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; - if (this.format == 0) { - pw.println(url); - } - if (this.format == 1) { - if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); + if (this.format < 3) { + BlockingQueue docs = 
Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true, + CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), + CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); + SolrDocument doc; + String url, hash, title, author, description; + Integer size; + Date date; + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName())); + author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName())); + description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName())); + size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); + date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); + if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + if (this.format == 0) { + pw.println(url); + } + if (this.format == 1) { + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); + } + if (this.format == 2) { + pw.println(""); + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); + pw.println("" + MultiProtocolURL.escape(url) + ""); + if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); + if (description != null && !description.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(description, true) + ""); + if (date != null) pw.println("" + 
HeaderFramework.formatRFC1123(date) + ""); + if (size != null) pw.println("" + size.intValue() + ""); + pw.println("" + hash + ""); + pw.println(""); + } + this.count++; } - if (this.format == 2) { - pw.println(""); - if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); - pw.println("" + MultiProtocolURL.escape(url) + ""); - if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); - if (description != null && !description.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(description, true) + ""); - if (date != null) pw.println("" + HeaderFramework.formatRFC1123(date) + ""); - if (size != null) pw.println("" + size.intValue() + ""); - pw.println("" + hash + ""); - pw.println(""); + } else { + BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1, true); + SolrDocument doc; + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + String url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; + CRIgnoreWriter sw = new CRIgnoreWriter(); + EnhancedXMLResponseWriter.writeDoc(sw, doc); + sw.close(); + String d = sw.toString(); + pw.println(d); + this.count++; } - this.count++; } } if (this.format == 1) { @@ -721,6 +743,10 @@ public final class Fulltext { pw.println(""); pw.println(""); } + if (this.format == 3) { + pw.println(""); + pw.println(""); + } pw.close(); } catch (final IOException e) { ConcurrentLog.logException(e);