diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index b83d63754..6b96b1b17 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -193,6 +193,9 @@ function updatepage(str) {
URL Filter
+
query
+
+
Export Format
Only Domain: Plain Text List (domains only)   diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 94d46ba0e..48da0982c 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -261,7 +261,8 @@ public class IndexControlURLs_p { final File f = new File(s); f.getParentFile().mkdirs(); final String filter = post.get("exportfilter", ".*"); - final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom); + final String query = post.get("exportquery", "*:*"); + final Fulltext.Export running = segment.fulltext().export(f, filter, query, format, dom); prop.put("lurlexport_exportfile", s); prop.put("lurlexport_urlcount", running.count()); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index ea8a2bac5..cc127ecbe 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -752,12 +752,12 @@ public final class Fulltext { } // export methods - public Export export(final File f, final String filter, final int format, final boolean dom) { + public Export export(final File f, final String filter, final String query, final int format, final boolean dom) { if ((this.exportthread != null) && (this.exportthread.isAlive())) { ConcurrentLog.warn("LURL-EXPORT", "cannot start another export thread, already one running"); return this.exportthread; } - this.exportthread = new Export(f, filter, format, dom); + this.exportthread = new Export(f, filter, query, format, dom); this.exportthread.start(); return this.exportthread; } @@ -770,14 +770,15 @@ public final class Fulltext { private final File f; private final Pattern pattern; private int count; - private String failure; + private String failure, query; private final int format; private final boolean dom; - private Export(final File f, final String filter, final int format, boolean dom) { + private Export(final File f, final String filter, final String query, 
final int format, boolean dom) { // format: 0=text, 1=html, 2=rss/xml this.f = f; this.pattern = filter == null ? null : Pattern.compile(filter); + this.query = (query == null || query.trim().isEmpty()) ? "*:*" : "(" + query.trim() + ")"; /* blank falls back to match-all; parenthesised so the " AND ...:200" filter appended below applies to the whole user query, not just its last OR clause */ this.count = 0; this.failure = null; this.format = format; @@ -806,7 +807,7 @@ public final class Fulltext { if (this.dom) { - Map> scores = Fulltext.this.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); + Map> scores = Fulltext.this.getDefaultConnector().getFacets(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 100000000, CollectionSchema.host_s.getSolrFieldName()); ReversibleScoreMap stats = scores.get(CollectionSchema.host_s.getSolrFieldName()); for (final String host: stats) { if (this.pattern != null && !this.pattern.matcher(host).matches()) continue; @@ -815,21 +816,19 @@ public final class Fulltext { this.count++; } } else { - BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100, + BlockingQueue docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); SolrDocument doc; - ArrayList title; - String url, author, hash; - String[] descriptions; + String url, hash, title, author, description; Integer size; Date date; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - hash = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - url = 
(String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); - title = (ArrayList) doc.getFieldValue(CollectionSchema.title.getSolrFieldName()); - author = (String) doc.getFieldValue(CollectionSchema.author.getSolrFieldName()); - descriptions = (String[]) doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName()); + hash = getStringFrom(doc.getFieldValue(CollectionSchema.id.getSolrFieldName())); + url = getStringFrom(doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + title = getStringFrom(doc.getFieldValue(CollectionSchema.title.getSolrFieldName())); + author = getStringFrom(doc.getFieldValue(CollectionSchema.author.getSolrFieldName())); + description = getStringFrom(doc.getFieldValue(CollectionSchema.description_txt.getSolrFieldName())); size = (Integer) doc.getFieldValue(CollectionSchema.size_i.getSolrFieldName()); date = (Date) doc.getFieldValue(CollectionSchema.last_modified.getSolrFieldName()); if (this.pattern != null && !this.pattern.matcher(url).matches()) continue; @@ -837,16 +836,14 @@ public final class Fulltext { pw.println(url); } if (this.format == 1) { - if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); } if (this.format == 2) { pw.println(""); - if (title != null) pw.println("" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + ""); + if (title != null) pw.println("" + CharacterCoding.unicode2xml(title, true) + ""); pw.println("" + MultiProtocolURL.escape(url) + ""); if (author != null && !author.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(author, true) + ""); - if (descriptions != null && descriptions.length > 0) { - for (String d: descriptions) pw.println("" + CharacterCoding.unicode2xml(d, true) + ""); - } + if (description != null && !description.isEmpty()) pw.println("" + CharacterCoding.unicode2xml(description, true) + ""); if (date != 
null) pw.println("" + HeaderFramework.formatRFC1123(date) + ""); if (size != null) pw.println("" + size.intValue() + ""); pw.println("" + hash + ""); @@ -884,6 +881,13 @@ public final class Fulltext { public int count() { return this.count; } + + @SuppressWarnings("unchecked") + private String getStringFrom(final Object o) { /* first value of a single- or list-valued field as String; "" when the field is absent */ + if (o == null) return ""; + if (o instanceof ArrayList) { final ArrayList<?> values = (ArrayList<?>) o; return values.isEmpty() ? "" : (String) values.get(0); } /* empty list: return "" instead of throwing IndexOutOfBoundsException */ + return (String) o; + } } diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 745bbb2ac..50861de59 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -242,7 +242,8 @@ public class QueryGoal { // add filter to prevent that results come from failed urls q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND ("); q.append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).append(":[* TO *] OR "); - q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif))"); + q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif) OR "); /* trailing space keeps the OR operator separate from the field name appended next */ + q.append(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*))"); // parse special requests if (isCatchall()) return q;