update to HostBrowser:

- time out after 3 seconds to speed up the display (the result may be incomplete)
- also show all links from the balancer queue, both in the host list (after
  the '/') and in the result browser view with the tag 'loading'
Michael Peter Christen 12 years ago
parent e2c4c3c7d3
commit 75dd706e1b
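
The 3-second time-out below works by draining a poison-terminated BlockingQueue only until a deadline passes. A minimal standalone sketch of that pattern, with assumed names (DeadlineDrain and POISON are illustrative, not YaCy code):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class DeadlineDrain {
    // sentinel terminating the stream, compared by identity like POISON_DOCUMENT
    static final String POISON = new String("POISON");

    static List<String> drain(BlockingQueue<String> queue, long maxtime) throws InterruptedException {
        final List<String> results = new ArrayList<String>();
        final long timeout = System.currentTimeMillis() + maxtime;
        String element;
        while ((element = queue.take()) != POISON) {
            results.add(element);
            // the deadline is only checked after each received element; if the
            // producer stalls without sending the poison, take() still blocks
            if (System.currentTimeMillis() > timeout) break; // result may be incomplete
        }
        return results;
    }

    public static void main(String[] args) throws InterruptedException {
        BlockingQueue<String> q = new LinkedBlockingQueue<String>();
        q.put("a"); q.put("b"); q.put(POISON);
        System.out.println(drain(q, 3000)); // prints [a, b]
    }
}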

@@ -77,7 +77,7 @@ function updatepage(str) {
 #{list}#
 <div style="float:left; padding:1px 5px 1px 5px;">
 <div style="width:160px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><div id="info"><a href="/HostBrowser.html?path=#[host]#">#[host]#</a><span>browse #[host]#</span></div></div>
-<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]# URLs</div>
+<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]##(crawler)#::/#[pending]##(/crawler)# URLs</div>
 </div>
 #{/list}#
 </fieldset>

@@ -41,6 +41,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.sorting.ClusteredScoreMap;
 import net.yacy.cora.sorting.ReversibleScoreMap;
+import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -130,8 +131,17 @@ public class HostBrowser {
 if (post.containsKey("hosts")) {
 // generate host list
 try {
-int maxcount = 200;
+int maxcount = 360; // == 6!/2 which makes nice matrices for 3, 4, 5, 6 rows/columns
+// collect from index
 ReversibleScoreMap<String> score = fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount);
+// collect from crawler
+final Map<String, Integer[]> crawler = (admin) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();
+for (Map.Entry<String, Integer[]> host: crawler.entrySet()) {
+score.inc(host.getKey(), host.getValue()[0]);
+}
 int c = 0;
 Iterator<String> i = score.keys(false);
 String host;
@@ -139,6 +149,9 @@
 host = i.next();
 prop.put("hosts_list_" + c + "_host", host);
 prop.put("hosts_list_" + c + "_count", score.get(host));
+boolean inCrawler = crawler.containsKey(host);
+prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
+if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
 c++;
 }
 prop.put("hosts_list", c);
@@ -166,9 +179,8 @@
 if (p < 8) {
 prop.put("files_root", 1);
 } else {
-path = path.substring(0, p + 1);
 prop.put("files_root", 0);
-prop.put("files_root_path", path);
+prop.put("files_root_path", path.substring(0, p + 1));
 }
 try {
 // generate file list from path
@@ -179,13 +191,14 @@
 String hosthash = ASCII.String(uri.hash(), 6, 6);
 // get all files for a specific host from the index
-BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 3000, 100);
 SolrDocument doc;
 Set<String> storedDocs = new HashSet<String>();
 Set<String> inboundLinks = new HashSet<String>();
 Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
 int hostsize = 0;
 final List<byte[]> deleteIDs = new ArrayList<byte[]>();
+long timeout = System.currentTimeMillis() + 3000;
 while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
 String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
 hostsize++;
@@ -221,10 +234,16 @@
 }
 } catch (MalformedURLException e) {}
 }
+if (System.currentTimeMillis() > timeout) break;
 }
 if (deleteIDs.size() > 0) sb.index.fulltext().remove(deleteIDs, true);
-// now combine both lists into one
+// collect from crawler
+List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000);
+Set<String> loadingLinks = new HashSet<String>();
+for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));
+// now combine all lists into one
 Map<String, Boolean> files = new HashMap<String, Boolean>();
 for (String u: storedDocs) files.put(u, true);
 for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, false);
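
Three link sources are combined for the browser view: stored documents, inbound links not yet stored, and links still waiting in the balancer queue. A standalone sketch of the merge and of the stored/loading/link state written to the template in the next hunk; note that the line merging loadingLinks into the files map is not visible in this (possibly truncated) diff and is assumed here:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class FileListMerge {
    public static void main(String[] args) {
        Set<String> storedDocs = new HashSet<String>();
        storedDocs.add("http://example.org/a.html");
        Set<String> inboundLinks = new HashSet<String>();
        inboundLinks.add("http://example.org/a.html"); // already stored
        inboundLinks.add("http://example.org/b.html"); // known link only
        Set<String> loadingLinks = new HashSet<String>();
        loadingLinks.add("http://example.org/c.html"); // still in the balancer queue

        Map<String, Boolean> files = new HashMap<String, Boolean>(); // url -> stored?
        for (String u : storedDocs) files.put(u, Boolean.TRUE);
        for (String u : inboundLinks) if (!storedDocs.contains(u)) files.put(u, Boolean.FALSE);
        for (String u : loadingLinks) if (!files.containsKey(u)) files.put(u, Boolean.FALSE); // assumed merge step

        // 1 = stored, 2 = loading, 0 = link only (the values the next hunk writes)
        for (Map.Entry<String, Boolean> e : files.entrySet()) {
            boolean indexed = e.getValue().booleanValue();
            boolean loading = loadingLinks.contains(e.getKey());
            System.out.println(e.getKey() + " -> " + (indexed ? 1 : loading ? 2 : 0));
        }
    }
}
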
@@ -268,8 +287,7 @@
 prop.put("files_list_" + c + "_type_url", entry.getKey());
 boolean indexed = ((Boolean) entry.getValue()).booleanValue();
 try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
-boolean loading = load.equals(entry.getKey()) ||
-(uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
+boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
 //String failr = fulltext.failReason(ASCII.String(uri.hash()));
 prop.put("files_list_" + c + "_type_stored", indexed ? 1 : loading ? 2 : 0);
 prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);

@@ -131,7 +131,7 @@ public class IndexCreateQueues_p {
 prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
 prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
 prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1] == Integer.MIN_VALUE ? "not accessed" : Integer.toString(host.getValue()[1]));
-List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
+List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost, 10000);
 Seed initiator;
 String profileHandle;

@@ -22,6 +22,7 @@ package net.yacy.cora.federate.solr.connector;
 import java.io.IOException;
 import java.util.Iterator;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
@@ -67,10 +68,21 @@ public abstract class AbstractSolrConnector implements SolrConnector {
 return false;
 }
 }
+/**
+ * Get a query result from solr as a stream of documents.
+ * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
+ * The method returns immediately and feeds the search results into the queue
+ * @param querystring the solr query string
+ * @param offset first result offset
+ * @param maxcount the maximum number of results
+ * @param maxtime the maximum time in milliseconds
+ * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+ * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+ */
 @Override
-public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime) {
-final BlockingQueue<SolrDocument> queue = new LinkedBlockingQueue<SolrDocument>();
+public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize) {
+final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(buffersize);
 final long endtime = System.currentTimeMillis() + maxtime;
 final Thread t = new Thread() {
 @Override
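
The new buffersize parameter trades memory for backpressure: a bounded ArrayBlockingQueue makes the producer thread block once the consumer falls behind, while buffersize <= 0 keeps the previous unbounded LinkedBlockingQueue behaviour. A standalone sketch of that contract (names are illustrative, not YaCy code):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class QueueChoice {
    static <T> BlockingQueue<T> makeQueue(int buffersize) {
        // bounded queue = backpressure and bounded memory; <= 0 = unbounded
        return buffersize <= 0 ? new LinkedBlockingQueue<T>() : new ArrayBlockingQueue<T>(buffersize);
    }

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = makeQueue(100);
        final String poison = new String("POISON"); // sentinel, compared by identity
        new Thread() {
            public void run() {
                try {
                    for (int i = 0; i < 1000; i++) queue.put("doc-" + i); // blocks at 100 pending
                    queue.put(poison);
                } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
            }
        }.start();
        String doc;
        int n = 0;
        while ((doc = queue.take()) != poison) n++;
        System.out.println(n + " documents consumed");
    }
}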

@@ -146,12 +146,14 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
 * Get a query result from solr as a stream of documents.
 * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
 * The method returns immediately and feeds the search results into the queue
-* @param querystring
-* @param offset
-* @param maxcount
-* @return
-*/
-public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime);
+* @param querystring the solr query string
+* @param offset first result offset
+* @param maxcount the maximum number of results
+* @param maxtime the maximum time in milliseconds
+* @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+* @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+*/
+public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize);
 /**
 * get a document id result stream from a solr query.

@@ -339,16 +339,18 @@ public class Balancer {
 * get lists of crawl request entries for a specific host
 * @param host
 * @param maxcount
+* @param maxtime
 * @return a list of crawl loader requests
 */
-public List<Request> getDomainStackReferences(String host, int maxcount) {
-HostHandles hh = this.domainStacks.get(host);
+public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime) {
+final HostHandles hh = this.domainStacks.get(host);
 if (hh == null) return new ArrayList<Request>(0);
-HandleSet domainList = hh.handleSet;
+final HandleSet domainList = hh.handleSet;
 if (domainList.isEmpty()) return new ArrayList<Request>(0);
-ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+maxcount = Math.min(maxcount, domainList.size());
+final ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+long timeout = System.currentTimeMillis() + maxtime;
 for (int i = 0; i < maxcount; i++) {
-if (domainList.size() <= i) break;
 final byte[] urlhash = domainList.getOne(i);
 if (urlhash == null) continue;
 Row.Entry rowEntry;
@@ -365,6 +367,7 @@ public class Balancer {
 continue;
 }
 cel.add(crawlEntry);
+if (System.currentTimeMillis() > timeout) break;
 }
 return cel;
 }
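
getDomainStackReferences is now bounded twice, by entry count and by wall-clock time, with the deadline checked after each accepted entry so at least one request can be returned. A self-contained sketch of this selection loop with illustrative types:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class BoundedSelect {
    static List<String> select(List<String> queue, int maxcount, long maxtime) {
        maxcount = Math.min(maxcount, queue.size());
        final List<String> out = new ArrayList<String>(maxcount);
        final long timeout = System.currentTimeMillis() + maxtime;
        for (int i = 0; i < maxcount; i++) {
            out.add(queue.get(i));
            // checked after each add: a huge per-host queue cannot stall the
            // caller, but at least one entry is always returned
            if (System.currentTimeMillis() > timeout) break;
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(select(Arrays.asList("a", "b", "c"), 2, 1000)); // [a, b]
    }
}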

@@ -248,12 +248,12 @@ public class NoticedURL {
 * @param maxcount
 * @return a list of crawl loader requests
 */
-public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
+public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount, final long maxtime) {
 switch (stackType) {
-case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount);
-case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
-case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
-case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
+case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount, maxtime);
+case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount, maxtime);
+case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount, maxtime);
+case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount, maxtime);
 default: return null;
 }
 }

@@ -318,7 +318,7 @@ public final class Fulltext implements Iterable<byte[]> {
 final String host = uri.getHost();
 Thread t = new Thread(){
 public void run() {
-final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 1000000, 600000, -1);
 try {
 SolrDocument doc;
 boolean removed = false;
@@ -342,12 +342,25 @@ public final class Fulltext implements Iterable<byte[]> {
 * @param concurrently if true, then the method returns immediately and runs concurrently
 */
 public void remove(final List<byte[]> deleteIDs, final boolean concurrently) {
+if (deleteIDs == null || deleteIDs.size() == 0) return;
 Thread t = new Thread() {
 public void run() {
-for (byte[] id: deleteIDs) {remove(id);}
-Fulltext.this.solr.commit();
-}
-};
+try {
+synchronized (Fulltext.this.solr) {
+for (byte[] urlHash: deleteIDs) {
+Fulltext.this.solr.delete(ASCII.String(urlHash));
+}
+}
+} catch (final Throwable e) {
+Log.logException(e);
+}
+if (Fulltext.this.urlIndexFile != null) try {
+for (byte[] urlHash: deleteIDs) {
+final Row.Entry r = Fulltext.this.urlIndexFile.remove(urlHash);
+if (r != null) Fulltext.this.statsDump = null;
+}
+} catch (final IOException e) {}
+}};
 if (concurrently) t.start(); else t.run();
 }
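
remove(...) keeps the run-or-start idiom: the deletion work is wrapped in a Thread, then either executed inline or dispatched according to the concurrently flag. A minimal sketch of the idiom with stand-in work (not the real solr/urlIndexFile deletes):

import java.util.Arrays;
import java.util.List;

public class RunOrStart {
    static void remove(final List<String> ids, boolean concurrently) {
        Thread t = new Thread() {
            public void run() {
                for (String id : ids) System.out.println("delete " + id); // stand-in for the removal work
            }
        };
        if (concurrently) t.start(); else t.run(); // run() executes in the calling thread, no new thread
    }

    public static void main(String[] args) {
        remove(Arrays.asList("hash1", "hash2"), false); // synchronous: blocks until done
        remove(Arrays.asList("hash3"), true);           // concurrent: returns immediately
    }
}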
