Migrated the index export methods from the old metadata to Solr. Now

exports are done using Solr queries. Removed superfluous methods and servlets.
12 years ago · 0fe7b6fd3b
parent 1768c82010
commit 0fe7b6fd3b
10 changed files with 95 additions and 317 deletions
--- a/htroot/CrawlResults.java
+++ b/htroot/CrawlResults.java
@@ -124,10 +124,9 @@ public class CrawlResults {

            if (post.containsKey("deletedomain")) {
                final String domain = post.get("domain", null);
-                final String hashpart = domain == null ? null : DigestURI.hosthash6(domain);
-                if (hashpart != null) {
-                    sb.index.fulltext().deleteDomain(hashpart, null, false);
-                    ResultURLs.deleteDomain(tabletype, domain, hashpart);
+                if (domain != null) {
+                    sb.index.fulltext().deleteDomainHostname(domain, null, false);
+                    ResultURLs.deleteDomain(tabletype, domain);
                }
            }

--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -294,7 +294,7 @@ public class Crawler_p {
                        siteFilter = CrawlProfile.siteFilter(rootURLs);
                        if (deleteold) {
                            for (DigestURI u: rootURLs) {
-                                int count = sb.index.fulltext().deleteDomain(u.hosthash(), deleteageDate, rootURLs.size() > 1);
+                                int count = sb.index.fulltext().deleteDomainHashpart(u.hosthash(), deleteageDate, rootURLs.size() > 1);
                                if (count > 0) Log.logInfo("Crawler_p", "deleted " + count + " documents for host " + u.getHost());
                            }
                        }
--- a/htroot/IndexControlURLs_p.html
+++ b/htroot/IndexControlURLs_p.html
@@ -77,7 +77,6 @@ function updatepage(str) {
        <dt class="TableCellDark">Retrieve by URL-Hash:</dt>
        <dd><input type="text" name="urlhash" value="#[urlhash]#" size="40" maxlength="12" />
            <input type="submit" name="urlhashsearch" value="Show Details for URL-Hash" class="submitready" style="width:240px;"/>
-            <input type="submit" name="urlhashsimilar" value="Generate List" class="submitready" style="width:240px;"/>
        </dd>
      </dl>
    </fieldset>
@@ -132,7 +131,7 @@ function updatepage(str) {
        <td>
          <form action="IndexControlURLs_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
            <div>
-              <input type="hidden" name="hashpart" value="#[hashpart]#" />
+              <input type="hidden" name="domain" value="#[domain]#" />
              <input type="hidden" name="lines" value="#[lines]#" />
              <input type="submit" name="deletedomain" value="delete all" class="submitready" style="width:240px;"/>
            </div>
@@ -206,13 +205,6 @@ function updatepage(str) {
    <div class="commit">Stored a solr dump to file #[dumpfile]#</div>::
    #(/indexdump)#
    
-    #(urlhashsimilar)#::<p>Sequential List of URL-Hashes:<br />
-    #{rows}# 
-    #{cols}#<a href="/IndexControlURLs_p.html?urlhash=#[urlHash]#&amp;urlhashsearch=1" class="tt">#[urlHash]#</a> #{/cols}#<br />
-    #{/rows}#
-    </p>
-    #(/urlhashsimilar)#
-    
    #(genUrlProfile)#
    ::No entry found for URL-hash #[urlhash]#
    ::<iframe src="/api/yacydoc.html?urlhash=#[urlhash]#" width="100%" height="420" frameborder="0" scrolling="no"></iframe><br />
--- a/htroot/IndexControlURLs_p.java
+++ b/htroot/IndexControlURLs_p.java
@@ -30,13 +30,15 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;

 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.ASCII;
+import net.yacy.cora.federate.solr.YaCySchema;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.lod.JenaTripleStore;
-import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.ResultURLs;
 import net.yacy.data.WorkTables;
@@ -44,7 +46,6 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.util.RotateIterator;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Fulltext;
 import net.yacy.search.index.Segment;
@@ -236,30 +237,6 @@ public class IndexControlURLs_p {
            }
        }

-        // generate list
-        if (post.containsKey("urlhashsimilar")) {
-            final Iterator<DigestURI> entryIt = new RotateIterator<DigestURI>(segment.fulltext().urls(), ASCII.String(Base64Order.zero((urlhash == null ? 0 : urlhash.length()))), (int) segment.RWICount());
-			final StringBuilder result = new StringBuilder("Sequential List of URL-Hashes:<br />");
-			DigestURI entry;
-			int i = 0, rows = 0, cols = 0;
-			prop.put("urlhashsimilar", "1");
-			while (entryIt.hasNext() && i < 256) {
-			    entry = entryIt.next();
-			    if (entry == null) break;
-			    prop.put("urlhashsimilar_rows_"+rows+"_cols_"+cols+"_urlHash", ASCII.String(entry.hash()));
-			    cols++;
-			    if (cols==8) {
-			        prop.put("urlhashsimilar_rows_"+rows+"_cols", cols);
-			        cols = 0;
-			        rows++;
-			    }
-			    i++;
-			}
-			prop.put("statistics", 0);
-			prop.put("urlhashsimilar_rows", rows);
-			prop.put("result", result.toString());
-        }
-
        if (post.containsKey("lurlexport")) {
            // parse format
            int format = 0;
@@ -279,7 +256,7 @@ public class IndexControlURLs_p {
            final File f = new File(s);
            f.getParentFile().mkdirs();
            final String filter = post.get("exportfilter", ".*");
-            final Fulltext.Export running = segment.fulltext().export(f, filter, null, format, dom);
+            final Fulltext.Export running = segment.fulltext().export(f, filter, format, dom);

            prop.put("lurlexport_exportfile", s);
            prop.put("lurlexport_urlcount", running.count());
@@ -301,29 +278,29 @@ public class IndexControlURLs_p {
        }

        if (post.containsKey("deletedomain")) {
-            final String hp = post.get("hashpart");
-            segment.fulltext().deleteDomain(hp, null, false);
+            final String domain = post.get("domain");
+            segment.fulltext().deleteDomainHostname(domain, null, false);
            // trigger the loading of the table
            post.put("statistics", "");
        }

        if (post.containsKey("statistics")) {
            final int count = post.getInt("lines", 100);
-            Iterator<Fulltext.HostStat> statsiter;
            prop.put("statistics_lines", count);
            int cnt = 0;
            try {
                final Fulltext metadata = segment.fulltext();
-                statsiter = metadata.statistics(count, metadata.urlSampleScores(metadata.domainSampleCollector()));
+                Map<String, ReversibleScoreMap<String>> scores = metadata.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", count, YaCySchema.host_s.getSolrFieldName());
+                ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
+                Iterator<String> statsiter = stats.keys(false);
                boolean dark = true;
-                Fulltext.HostStat hs;
+                String hostname;
+                prop.put("statisticslines_domains_" + cnt + "lines", count);
                while (statsiter.hasNext() && cnt < count) {
-                    hs = statsiter.next();
+                    hostname = statsiter.next();
                    prop.put("statisticslines_domains_" + cnt + "_dark", (dark) ? "1" : "0");
-                    prop.put("statisticslines_domains_" + cnt + "_domain", hs.hostname + ((hs.port == 80) ? "" : ":" + hs.port));
-                    prop.put("statisticslines_domains_" + cnt + "lines", count);
-                    prop.put("statisticslines_domains_" + cnt + "_hashpart", hs.hosthash);
-                    prop.put("statisticslines_domains_" + cnt + "_count", hs.count);
+                    prop.put("statisticslines_domains_" + cnt + "_domain", hostname);
+                    prop.put("statisticslines_domains_" + cnt + "_count", stats.get(hostname));
                    dark = !dark;
                    cnt++;
                }
--- a/htroot/IndexControlURLs_p.xml
+++ b/htroot/IndexControlURLs_p.xml
@@ -13,13 +13,4 @@
    #(indexdump)#::
    <dumpfile>#[dumpfile]#</dumpfile>::
    #(/indexdump)#
-    #(urlhashsimilar)#::
-    <urls>
-    #{rows}# 
-    #{cols}#
-    <urlhash>#[urlHash]#</urlhash>
-    #{/cols}#
-    #{/rows}#
-    </urls>
-    #(/urlhashsimilar)#
 </data>
--- a/htroot/YBRFetch_p.html
+++ b/htroot/YBRFetch_p.html
--- a/htroot/YBRFetch_p.java
+++ b/htroot/YBRFetch_p.java
@@ -1,70 +0,0 @@
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.rwi.ReferenceContainerCache;
-import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.peers.graphics.WebStructureGraph.HostReference;
-import net.yacy.search.Switchboard;
-import net.yacy.search.index.Fulltext;
-import net.yacy.search.index.Fulltext.HostStat;
-import net.yacy.search.index.Segment;
-import net.yacy.search.ranking.BlockRank;
-import net.yacy.server.serverObjects;
-import net.yacy.server.serverSwitch;
-import net.yacy.server.servletProperties;
-
-public class YBRFetch_p
-{
-
-    public static servletProperties respond(
-        @SuppressWarnings("unused") final RequestHeader requestHeader,
-        final serverObjects post,
-        final serverSwitch env) {
-        final servletProperties prop = new servletProperties();
-        final Switchboard sb = (Switchboard) env;
-
-        if ( post == null || !post.containsKey("ghrt4") || MemoryControl.available() < 1024L * 1024L * 1024L ) {
-            return prop;
-        }
-        final File hostIndexFile = new File(sb.queuesRoot, "hostIndex.blob");
-
-        ReferenceContainerCache<HostReference> hostIndex; // this will get large, more than 0.5 million entries by now
-        if ( !hostIndexFile.exists() ) {
-            hostIndex = BlockRank.collect(sb.peers, sb.webStructure, Integer.MAX_VALUE);
-            BlockRank.saveHostIndex(hostIndex, hostIndexFile);
-        } else {
-            hostIndex = BlockRank.loadHostIndex(hostIndexFile);
-        }
-
-        // use an index segment to find hosts for given host hashes
-        final Segment segment = sb.index;
-        final Fulltext metadata = segment.fulltext();
-        Map<String, HostStat> hostHashResolver;
-        try {
-            hostHashResolver = metadata.domainHashResolver(metadata.domainSampleCollector());
-        } catch ( final IOException e ) {
-            hostHashResolver = new HashMap<String, HostStat>();
-        }
-
-        // recursively compute a new ranking table
-        Log.logInfo("BLOCK RANK", "computing new ranking tables...");
-        BlockRank.ybrTables = BlockRank.evaluate(hostIndex, hostHashResolver, null, 0);
-        hostIndex = null; // we don't need that here any more, so free the memory
-
-        // use the web structure and the hostHash resolver to analyse the ranking table
-        Log.logInfo("BLOCK RANK", "analysis of " + BlockRank.ybrTables.length + " tables...");
-        BlockRank.analyse(sb.webStructure, hostHashResolver);
-        // store the new table
-        Log.logInfo("BLOCK RANK", "storing fresh table...");
-        final File rankingPath = new File(sb.appPath, "ranking/YBR".replace('/', File.separatorChar));
-        BlockRank.storeBlockRankTable(rankingPath);
-        BlockRank.loadBlockRankTable(rankingPath, 16);
-
-        return prop;
-    }
-
-}
--- a/source/net/yacy/crawler/data/ResultURLs.java
+++ b/source/net/yacy/crawler/data/ResultURLs.java
@@ -143,17 +143,8 @@ public final class ResultURLs {
        return getDomains(stack).keys(false);
    }

-    public static int deleteDomain(final EventOrigin stack, final String host, final String hosthash) {
+    public static int deleteDomain(final EventOrigin stack, final String host) {
        assert host != null : "host = null";
-        assert hosthash.length() == 6;
-        final Iterator<Map.Entry<String, InitExecEntry>> i = results(stack);
-        Map.Entry<String, InitExecEntry> w;
-        String urlhash;
-        while (i.hasNext()) {
-            w = i.next();
-            urlhash = w.getKey();
-            if (urlhash == null || urlhash.substring(6).equals(hosthash)) i.remove();
-        }
        assert getDomains(stack) != null : "getDomains(" + stack + ") = null";
        return getDomains(stack).delete(host);
    }
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -34,9 +34,9 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
-import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;

 import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.date.ISO8601Formatter;
@@ -49,8 +49,8 @@ import net.yacy.cora.federate.solr.connector.MirrorSolrConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.order.CloneableIterator;
 import net.yacy.cora.sorting.ConcurrentScoreMap;
+import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.cora.sorting.ScoreMap;
-import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.storage.ZIPReader;
 import net.yacy.cora.storage.ZIPWriter;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -64,15 +64,15 @@ import net.yacy.kelondro.index.Row;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.table.SplitTable;
 import net.yacy.kelondro.util.MemoryControl;
-import net.yacy.kelondro.util.MergeIterator;
 import net.yacy.search.Switchboard;

+import org.apache.commons.httpclient.util.DateUtil;
 import org.apache.lucene.util.Version;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;

-public final class Fulltext implements Iterable<byte[]> {
+public final class Fulltext {

    private static final String SOLR_PATH = "solr_40"; // the number should be identical to the number in the property luceneMatchVersion in solrconfig.xml
    private static final String SOLR_OLD_PATH[] = new String[]{"solr_36"};
@@ -359,7 +359,7 @@ public final class Fulltext implements Iterable<byte[]> {
     * @return number of deleted domains
     * @throws IOException
     */
-    public int deleteDomain(final String hosthash, Date freshdate, boolean concurrent) {
+    public int deleteDomainHashpart(final String hosthash, Date freshdate, boolean concurrent) {
        // first collect all url hashes that belong to the domain
        assert hosthash.length() == 6;
        final String q = YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\"" +
@@ -412,6 +412,38 @@ public final class Fulltext implements Iterable<byte[]> {
        return count.get();
    }

+    public int deleteDomainHostname(final String hostname, Date freshdate, boolean concurrent) {
+        // first collect all url hashes that belong to the domain
+        final String q = YaCySchema.host_s.getSolrFieldName() + ":\"" + hostname + "\"" +
+                ((freshdate != null && freshdate.before(new Date())) ? (" AND " + YaCySchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
+        final AtomicInteger count = new AtomicInteger(0);
+        Thread t = new Thread() {
+            public void run() {
+                // delete in solr
+                synchronized (Fulltext.this.solr) {
+                    try {
+                        count.addAndGet(Fulltext.this.solr.deleteByQuery(q));
+                        if (count.get() > 0) Fulltext.this.solr.commit(true);
+                    } catch (IOException e) {}
+                }
+                // finally remove the line with statistics
+                if (Fulltext.this.statsDump != null) {
+                    final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
+                    HostStat hs;
+                    while (hsi.hasNext()) {
+                        hs = hsi.next();
+                        if (hs.hostname.equals(hostname)) {
+                            hsi.remove();
+                            break;
+                        }
+                    }
+                }
+            }
+        };
+        if (concurrent) t.start(); else t.run();
+        return count.get();
+    }
+
    /**
     * remove a full subpath from the index
     * @param subpath the left path of the url; at least until the end of the host
@@ -511,96 +543,6 @@ public final class Fulltext implements Iterable<byte[]> {
        return reason == null ? null : reason.length() == 0 ? null : reason;
    }
    
-    @Override
-    public Iterator<byte[]> iterator() {
-    	CloneableIterator<byte[]> a = null;
-    	if (this.urlIndexFile != null) try {a = this.urlIndexFile.keys(true, null);} catch (IOException e) {}
-    	final Iterator<String> idi = this.solr.iterator();
-    	CloneableIterator<byte[]> b = new CloneableIterator<byte[]>() {
-			@Override
-			public boolean hasNext() {
-				return idi.hasNext();
-			}
-			@Override
-			public byte[] next() {
-				String s = idi.next();
-				return s == null ? null : ASCII.getBytes(s);
-			}
-			@Override
-			public void remove() {
-				throw new UnsupportedOperationException();
-			}
-			@Override
-			public CloneableIterator<byte[]> clone(Object modifier) {
-				return this;
-			}
-			@Override
-			public void close() {
-			}
-    	};
-    	if (a == null) return b;
-        return new MergeIterator<byte[]>(a, b,
-                URIMetadataRow.rowdef.objectOrder,
-                MergeIterator.simpleMerge,
-                true);
-    }
-
-    public CloneableIterator<DigestURI> urls() {
-        // enumerates entry elements
-        final Iterator<byte[]> ids = iterator();
-        return new CloneableIterator<DigestURI>() {
-            @Override
-            public CloneableIterator<DigestURI> clone(final Object secondHash) {
-                return this;
-            }
-            @Override
-            public final boolean hasNext() {
-                return ids.hasNext();
-            }
-            @Override
-            public final DigestURI next() {
-                byte[] id = ids.next();
-                if (id == null) return null;
-                return getURL(id);
-            }
-            @Override
-            public final void remove() {
-                ids.remove();
-            }
-            @Override
-            public void close() {
-            }
-        };
-    }
-
-    public CloneableIterator<URIMetadataNode> entries() {
-        // enumerates entry elements
-        final Iterator<byte[]> ids = iterator();
-        return new CloneableIterator<URIMetadataNode>() {
-            @Override
-            public CloneableIterator<URIMetadataNode> clone(final Object secondHash) {
-                return this;
-            }
-            @Override
-            public final boolean hasNext() {
-                return ids.hasNext();
-            }
-            @Override
-            public final URIMetadataNode next() {
-                byte[] id = ids.next();
-                if (id == null) return null;
-                return getMetadata(id);
-            }
-            @Override
-            public final void remove() {
-                ids.remove();
-            }
-            @Override
-            public void close() {
-            }
-        };
-    }
-    
    public List<File> dumpFiles() {
        EmbeddedSolrConnector esc = (EmbeddedSolrConnector) this.solr.getSolr0();
        ArrayList<File> zips = new ArrayList<File>();
@@ -675,12 +617,12 @@ public final class Fulltext implements Iterable<byte[]> {
    }
    
    // export methods
-    public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) {
+    public Export export(final File f, final String filter, final int format, final boolean dom) {
        if ((this.exportthread != null) && (this.exportthread.isAlive())) {
            Log.logWarning("LURL-EXPORT", "cannot start another export thread, already one running");
            return this.exportthread;
        }
-        this.exportthread = new Export(f, filter, set, format, dom);
+        this.exportthread = new Export(f, filter, format, dom);
        this.exportthread.start();
        return this.exportthread;
    }
@@ -691,22 +633,20 @@ public final class Fulltext implements Iterable<byte[]> {

    public class Export extends Thread {
        private final File f;
-        private final String filter;
+        private final Pattern pattern;
        private int count;
        private String failure;
        private final int format;
        private final boolean dom;
-        private final HandleSet set;

-        private Export(final File f, final String filter, final HandleSet set, final int format, boolean dom) {
+        private Export(final File f, final String filter, final int format, boolean dom) {
            // format: 0=text, 1=html, 2=rss/xml
            this.f = f;
-            this.filter = filter;
+            this.pattern = filter == null ? null : Pattern.compile(filter);
            this.count = 0;
            this.failure = null;
            this.format = format;
            this.dom = dom;
-            this.set = set;
            if ((dom) && (format == 2)) dom = false;
        }

@@ -724,43 +664,54 @@ public final class Fulltext implements Iterable<byte[]> {
                    pw.println("<?xml-stylesheet type='text/xsl' href='/yacysearch.xsl' version='1.0'?>");
                    pw.println("<rss version=\"2.0\" xmlns:yacy=\"http://www.yacy.net/\" xmlns:opensearch=\"http://a9.com/-/spec/opensearch/1.1/\" xmlns:atom=\"http://www.w3.org/2005/Atom\">");
                    pw.println("<channel>");
-                    pw.println("<title>YaCy Peer-to-Peer - Web-Search LURL Export</title>");
+                    pw.println("<title>YaCy Peer-to-Peer - Web-Search URL Export</title>");
                    pw.println("<description></description>");
                    pw.println("<link>http://yacy.net</link>");
                }
                
+               
                if (this.dom) {
-                    final TreeSet<String> set = domainNameCollector(-1, domainSampleCollector());
-                    for (final String host: set) {
-                        if (!host.matches(this.filter)) continue;
+                    Map<String, ReversibleScoreMap<String>> scores = Fulltext.this.getSolr().getFacets(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 100000, YaCySchema.host_s.getSolrFieldName());
+                    ReversibleScoreMap<String> stats = scores.get(YaCySchema.host_s.getSolrFieldName());
+                    for (final String host: stats) {
+                        if (this.pattern != null && !this.pattern.matcher(host).matches()) continue;
                        if (this.format == 0) pw.println(host);
                        if (this.format == 1) pw.println("<a href=\"http://" + host + "\">" + host + "</a><br>");
                        this.count++;
                    }
                } else {
-                    final Iterator<URIMetadataNode> i = entries(); // iterates indexURLEntry objects
-                    URIMetadataNode entry;
-                    String url;
-                    while (i.hasNext()) {
-                        entry = i.next();
-                        if (this.set != null && !this.set.has(entry.hash())) continue;
-                        url = entry.url().toNormalform(true);
-                        if (!url.matches(this.filter)) continue;
+                    BlockingQueue<SolrDocument> docs = Fulltext.this.getSolr().concurrentQuery(YaCySchema.httpstatus_i.getSolrFieldName() + ":200", 0, 100000000, 10 * 60 * 60 * 1000, 100,
+                            YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName(), YaCySchema.title.getSolrFieldName(),
+                            YaCySchema.author.getSolrFieldName(), YaCySchema.description.getSolrFieldName(), YaCySchema.size_i.getSolrFieldName(), YaCySchema.last_modified.getSolrFieldName());
+                    SolrDocument doc;
+                    ArrayList<?> title;
+                    String url, author, description, hash;
+                    Integer size;
+                    Date date;
+                    while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                        hash = (String) doc.getFieldValue(YaCySchema.id.getSolrFieldName());
+                        url = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
+                        title = (ArrayList<?>) doc.getFieldValue(YaCySchema.title.getSolrFieldName());
+                        author = (String) doc.getFieldValue(YaCySchema.author.getSolrFieldName());
+                        description = (String) doc.getFieldValue(YaCySchema.description.getSolrFieldName());
+                        size = (Integer) doc.getFieldValue(YaCySchema.size_i.getSolrFieldName());
+                        date = (Date) doc.getFieldValue(YaCySchema.last_modified.getSolrFieldName());
+                        if (this.pattern != null && !this.pattern.matcher(url).matches()) continue;
                        if (this.format == 0) {
                            pw.println(url);
                        }
                        if (this.format == 1) {
-                            pw.println("<a href=\"" + url + "\">" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</a><br>");
+                            if (title != null) pw.println("<a href=\"" + MultiProtocolURI.escape(url) + "\">" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</a>");
                        }
                        if (this.format == 2) {
                            pw.println("<item>");
-                            pw.println("<title>" + CharacterCoding.unicode2xml(entry.dc_title(), true) + "</title>");
+                            if (title != null) pw.println("<title>" + CharacterCoding.unicode2xml((String) title.iterator().next(), true) + "</title>");
                            pw.println("<link>" + MultiProtocolURI.escape(url) + "</link>");
-                            if (!entry.dc_creator().isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(entry.dc_creator(), true) + "</author>");
-                            if (!entry.dc_subject().isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(entry.dc_subject(), true) + "</description>");
-                            pw.println("<pubDate>" + entry.moddate().toString() + "</pubDate>");
-                            pw.println("<yacy:size>" + entry.size() + "</yacy:size>");
-                            pw.println("<guid isPermaLink=\"false\">" + ASCII.String(entry.hash()) + "</guid>");
+                            if (author != null && !author.isEmpty()) pw.println("<author>" + CharacterCoding.unicode2xml(author, true) + "</author>");
+                            if (description != null && !description.isEmpty()) pw.println("<description>" + CharacterCoding.unicode2xml(description, true) + "</description>");
+                            if (date != null) pw.println("<pubDate>" + DateUtil.formatDate(date) + "</pubDate>");
+                            if (size != null) pw.println("<yacy:size>" + size.intValue() + "</yacy:size>");
+                            pw.println("<guid isPermaLink=\"false\">" + hash + "</guid>");
                            pw.println("</item>");
                        }
                        this.count++;
@@ -798,60 +749,6 @@ public final class Fulltext implements Iterable<byte[]> {

    }

-    /**
-     * collect domain samples: all url hashes from the metadata database is listed and the domain part
-     * of the url hashes is used to count how many of these domain hashes appear
-     * @return a map from domain hashes to hash statistics
-     * @throws IOException
-     */
-    public Map<String, URLHashCounter> domainSampleCollector() throws IOException {
-        final Map<String, URLHashCounter> map = new HashMap<String, URLHashCounter>();
-        // first collect all domains and calculate statistics about it
-        synchronized (this) {
-            final Iterator<byte[]> i = this.iterator();
-            String hosthash;
-            byte[] urlhashb;
-            URLHashCounter ds;
-            if (i != null) while (i.hasNext()) {
-                urlhashb = i.next();
-                hosthash = ASCII.String(urlhashb, 6, 6);
-                ds = map.get(hosthash);
-                if (ds == null) {
-                    ds = new URLHashCounter(urlhashb);
-                    map.put(hosthash, ds);
-                } else {
-                    ds.count++;
-                }
-            }
-        }
-        return map;
-    }
-
-    /**
-     * create a list of domain names in this database
-     * @param count number of entries or -1 for all
-     * @param domainSamples a map from domain hashes to hash statistics
-     * @return a set of domain names, ordered by name of the domains
-     */
-    private TreeSet<String> domainNameCollector(int count, final Map<String, URLHashCounter> domainSamples) {
-        // collect hashes from all domains
-
-        // fetch urls from the database to determine the host in clear text
-        DigestURI url;
-        if (count < 0 || count > domainSamples.size()) count = domainSamples.size();
-        this.statsDump = new ArrayList<HostStat>();
-        final TreeSet<String> set = new TreeSet<String>();
-        for (final URLHashCounter hs: domainSamples.values()) {
-            if (hs == null) continue;
-            url = this.getURL(hs.urlhashb);
-            if (url == null || url.getHost() == null) continue;
-            set.add(url.getHost());
-            count--;
-            if (count == 0) break;
-        }
-        return set;
-    }
-
    /**
     * calculate a score map for url hash samples: each sample is a single url hash
     * that stands for all entries for the corresponding domain. The map counts the number
--- a/source/net/yacy/search/query/QueryGoal.java
+++ b/source/net/yacy/search/query/QueryGoal.java
@@ -246,7 +246,8 @@ public class QueryGoal {
        q.append(')');

        // add filter to prevent that results come from failed urls
-        q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");
+        q.append(" AND ").append(YaCySchema.httpstatus_i.getSolrFieldName()).append(":200");
+        //q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]");

        return q;
    }