From 75dd706e1b7e1e3044daa3882cd167d7c23a709d Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Fri, 2 Nov 2012 13:57:43 +0100
Subject: [PATCH] update to HostBrowser:

- time-out after 3 seconds to speed up display (may be incomplete)
- also showing all links from the balancer queue in the host list (after
  the '/') and in the result browser view with tag 'loading'
---
 htroot/HostBrowser.html                            |  2 +-
 htroot/HostBrowser.java                            | 32 +++++++++++++++----
 htroot/IndexCreateQueues_p.java                    |  2 +-
 .../solr/connector/AbstractSolrConnector.java      | 18 +++++++++--
 .../solr/connector/SolrConnector.java              | 14 ++++----
 source/net/yacy/crawler/Balancer.java              | 13 +++++---
 source/net/yacy/crawler/data/NoticedURL.java       | 10 +++---
 source/net/yacy/search/index/Fulltext.java         | 23 ++++++++++---
 8 files changed, 81 insertions(+), 33 deletions(-)

diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html
index eb6247135..df8e51868 100644
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@@ -77,7 +77,7 @@ function updatepage(str) {
   #{list}#
   #[host]#browse #[host]#
-  #[count]# URLs
+  #[count]##(crawler)#::/#[pending]##(/crawler)# URLs
   #{/list}#
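The #(crawler)# block added above is driven by the servlet change later in this patch. Assuming the usual YaCy template semantics (#[x]# is replaced by the property value, #(x)#a::b#(/x)# picks branch a or b depending on whether the property is 0 or 1), these are the HostBrowser.java lines that feed it:

    prop.put("hosts_list_" + c + "_count", score.get(host));                  // fills #[count]#
    prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);              // selects the #(crawler)# branch
    if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]); // fills #[pending]#

A host with pending entries in the local crawler queue is therefore rendered as "#[count]#/#[pending]# URLs", which is the count "after the '/'" mentioned in the commit message.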
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index 15adf415b..568088d07 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -41,6 +41,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.sorting.ClusteredScoreMap;
 import net.yacy.cora.sorting.ReversibleScoreMap;
+import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -130,8 +131,17 @@ public class HostBrowser {
         if (post.containsKey("hosts")) {
             // generate host list
             try {
-                int maxcount = 200;
+                int maxcount = 360; // == 6!/2 which makes nice matrices for 3, 4, 5, 6 rows/columns
+
+                // collect from index
                 ReversibleScoreMap<String> score = fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount);
+
+                // collect from crawler
+                final Map<String, Integer[]> crawler = (admin) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();
+                for (Map.Entry<String, Integer[]> host: crawler.entrySet()) {
+                    score.inc(host.getKey(), host.getValue()[0]);
+                }
+
                 int c = 0;
                 Iterator<String> i = score.keys(false);
                 String host;
@@ -139,6 +149,9 @@ public class HostBrowser {
                     host = i.next();
                     prop.put("hosts_list_" + c + "_host", host);
                     prop.put("hosts_list_" + c + "_count", score.get(host));
+                    boolean inCrawler = crawler.containsKey(host);
+                    prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
+                    if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
                     c++;
                 }
                 prop.put("hosts_list", c);
@@ -166,9 +179,8 @@ public class HostBrowser {
             if (p < 8) {
                 prop.put("files_root", 1);
             } else {
-                path = path.substring(0, p + 1);
                 prop.put("files_root", 0);
-                prop.put("files_root_path", path);
+                prop.put("files_root_path", path.substring(0, p + 1));
             }
             try {
                 // generate file list from path
@@ -179,13 +191,14 @@ public class HostBrowser {
                 String hosthash = ASCII.String(uri.hash(), 6, 6);

                 // get all files for a specific host from the index
-                BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+                BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 3000, 100);
                 SolrDocument doc;
                 Set<String> storedDocs = new HashSet<String>();
                 Set<String> inboundLinks = new HashSet<String>();
                 Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
                 int hostsize = 0;
                 final List<byte[]> deleteIDs = new ArrayList<byte[]>();
+                long timeout = System.currentTimeMillis() + 3000;
                 while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                     String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
                     hostsize++;
@@ -221,10 +234,16 @@ public class HostBrowser {
                             }
                         } catch (MalformedURLException e) {}
                     }
+                    if (System.currentTimeMillis() > timeout) break;
                 }
                 if (deleteIDs.size() > 0) sb.index.fulltext().remove(deleteIDs, true);

-                // now combine both lists into one
+                // collect from crawler
+                List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000);
+                Set<String> loadingLinks = new HashSet<String>();
+                for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));
+
+                // now combine all lists into one
                 Map<String, Boolean> files = new HashMap<String, Boolean>();
                 for (String u: storedDocs) files.put(u, true);
                 for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, false);
@@ -268,8 +287,7 @@ public class HostBrowser {
                         prop.put("files_list_" + c + "_type_url", entry.getKey());
                         boolean indexed = ((Boolean) entry.getValue()).booleanValue();
                         try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
-                        boolean loading = load.equals(entry.getKey()) ||
-                            (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
+                        boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
                         //String failr = fulltext.failReason(ASCII.String(uri.hash()));
                         prop.put("files_list_" + c + "_type_stored", indexed ? 1 : loading ? 2 : 0);
                         prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
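The 3-second cut-off above boils down to one pattern: drain the streaming Solr result queue until either the POISON_DOCUMENT terminator arrives or a wall-clock deadline passes, and accept a possibly incomplete file list. A minimal sketch of that consumer loop with illustrative names (the real code above does its URL and link bookkeeping inside the loop):

    // Sketch: consume a poison-terminated BlockingQueue<SolrDocument> under a deadline.
    // 'docs' stands for the queue returned by SolrConnector.concurrentQuery(...).
    void drainWithDeadline(final BlockingQueue<SolrDocument> docs, final long maxtime) throws InterruptedException {
        final long deadline = System.currentTimeMillis() + maxtime;
        SolrDocument doc;
        while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
            // ... per-document work: record the URL, inbound/outbound links, stale IDs ...
            if (System.currentTimeMillis() > deadline) break; // stop after maxtime; the listing may be incomplete
        }
    }

Breaking out early leaves the producer thread inside concurrentQuery running; the short maxtime (3000 ms) passed to concurrentQuery above is meant to bound that side as well.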
diff --git a/htroot/IndexCreateQueues_p.java b/htroot/IndexCreateQueues_p.java
index ce8da5799..fefebcf5a 100644
--- a/htroot/IndexCreateQueues_p.java
+++ b/htroot/IndexCreateQueues_p.java
@@ -131,7 +131,7 @@ public class IndexCreateQueues_p {
                 prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
                 prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
                 prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1] == Integer.MIN_VALUE ? "not accessed" : Integer.toString(host.getValue()[1]));
-                List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
+                List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost, 10000);

                 Seed initiator;
                 String profileHandle;
diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
index d59e38243..a5222bcc5 100644
--- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java
@@ -22,6 +22,7 @@ package net.yacy.cora.federate.solr.connector;

 import java.io.IOException;
 import java.util.Iterator;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
@@ -67,10 +68,21 @@ public abstract class AbstractSolrConnector implements SolrConnector {
             return false;
         }
     }
-
+
+    /**
+     * Get a query result from solr as a stream of documents.
+     * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
+     * The method returns immediately and feeds the search results into the queue
+     * @param querystring the solr query string
+     * @param offset first result offset
+     * @param maxcount the maximum number of results
+     * @param maxtime the maximum time in milliseconds
+     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+     * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+     */
     @Override
-    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime) {
-        final BlockingQueue<SolrDocument> queue = new LinkedBlockingQueue<SolrDocument>();
+    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize) {
+        final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(buffersize);
         final long endtime = System.currentTimeMillis() + maxtime;
         final Thread t = new Thread() {
             @Override
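With the extended signature the caller chooses the queue type: a positive buffersize yields a bounded ArrayBlockingQueue, so a slow consumer applies backpressure to the producer thread, while buffersize <= 0 keeps the previous unbounded LinkedBlockingQueue. Two call sketches matching the call sites touched in this patch ('connector' stands for any SolrConnector instance):

    // HostBrowser: fast, possibly partial host listing -> short time budget, small bounded buffer.
    BlockingQueue<SolrDocument> fast = connector.concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 3000, 100);

    // Fulltext host deletion (last file in this patch): completeness matters -> large limits, -1 selects the unbounded queue.
    BlockingQueue<SolrDocument> full = connector.concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 1000000, 600000, -1);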
diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
index d02597183..78b7ddfe5 100644
--- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
+++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java
@@ -146,12 +146,14 @@ public interface SolrConnector extends Iterable /* Iterable of document
      * Get a query result from solr as a stream of documents.
      * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
      * The method returns immediately and feeds the search results into the queue
-     * @param querystring
-     * @param offset
-     * @param maxcount
-     * @return
-     */
-    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime);
+     * @param querystring the solr query string
+     * @param offset first result offset
+     * @param maxcount the maximum number of results
+     * @param maxtime the maximum time in milliseconds
+     * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+     * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+     */
+    public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize);

     /**
      * get a document id result stream from a solr query.
diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index 8f9ad115f..4d80b6296 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -339,16 +339,18 @@ public class Balancer {
      * get lists of crawl request entries for a specific host
      * @param host
      * @param maxcount
+     * @param maxtime
      * @return a list of crawl loader requests
      */
-    public List<Request> getDomainStackReferences(String host, int maxcount) {
-        HostHandles hh = this.domainStacks.get(host);
+    public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime) {
+        final HostHandles hh = this.domainStacks.get(host);
         if (hh == null) return new ArrayList<Request>(0);
-        HandleSet domainList = hh.handleSet;
+        final HandleSet domainList = hh.handleSet;
         if (domainList.isEmpty()) return new ArrayList<Request>(0);
-        ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+        maxcount = Math.min(maxcount, domainList.size());
+        final ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+        long timeout = System.currentTimeMillis() + maxtime;
         for (int i = 0; i < maxcount; i++) {
-            if (domainList.size() <= i) break;
             final byte[] urlhash = domainList.getOne(i);
             if (urlhash == null) continue;
             Row.Entry rowEntry;
@@ -365,6 +367,7 @@ public class Balancer {
                 continue;
             }
             cel.add(crawlEntry);
+            if (System.currentTimeMillis() > timeout) break;
         }
         return cel;
     }
diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java
index 40084e90e..8a4f78db6 100644
--- a/source/net/yacy/crawler/data/NoticedURL.java
+++ b/source/net/yacy/crawler/data/NoticedURL.java
@@ -248,12 +248,12 @@ public class NoticedURL {
      * @param maxcount
      * @return a list of crawl loader requests
      */
-    public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
+    public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount, final long maxtime) {
         switch (stackType) {
-            case LOCAL:  return this.coreStack.getDomainStackReferences(host, maxcount);
-            case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
-            case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
-            case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
+            case LOCAL:  return this.coreStack.getDomainStackReferences(host, maxcount, maxtime);
+            case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount, maxtime);
+            case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount, maxtime);
+            case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount, maxtime);
             default: return null;
         }
     }
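getDomainStackReferences() is now bounded in both dimensions: maxcount is clamped to the stack size up front (replacing the per-iteration size check) and maxtime breaks the loop once the deadline has passed, after the current entry was added. The two call sites updated in this patch use it like this:

    // IndexCreateQueues_p: at most urlsPerHost entries, but never more than 10 seconds per host.
    List<Request> refs = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost, 10000);

    // HostBrowser: up to 1000 pending URLs of the browsed host, capped at 3 seconds; these get the 'loading' tag.
    List<Request> pending = sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000);
    for (Request crawlEntry : pending) loadingLinks.add(crawlEntry.url().toNormalform(true));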
diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java
index 4d2de4288..1042e432a 100644
--- a/source/net/yacy/search/index/Fulltext.java
+++ b/source/net/yacy/search/index/Fulltext.java
@@ -318,7 +318,7 @@ public final class Fulltext implements Iterable<byte[]> {
         final String host = uri.getHost();
         Thread t = new Thread(){
             public void run() {
-                final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+                final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 1000000, 600000, -1);
                 try {
                     SolrDocument doc;
                     boolean removed = false;
@@ -342,12 +342,25 @@ public final class Fulltext implements Iterable<byte[]> {
      * @param concurrently if true, then the method returns immediately and runs concurrently
      */
     public void remove(final List<byte[]> deleteIDs, final boolean concurrently) {
+        if (deleteIDs == null || deleteIDs.size() == 0) return;
         Thread t = new Thread() {
             public void run() {
-                for (byte[] id: deleteIDs) {remove(id);}
-                Fulltext.this.solr.commit();
-            }
-        };
+                try {
+                    synchronized (Fulltext.this.solr) {
+                        for (byte[] urlHash: deleteIDs) {
+                            Fulltext.this.solr.delete(ASCII.String(urlHash));
+                        }
+                    }
+                } catch (final Throwable e) {
+                    Log.logException(e);
+                }
+                if (Fulltext.this.urlIndexFile != null) try {
+                    for (byte[] urlHash: deleteIDs) {
+                        final Row.Entry r = Fulltext.this.urlIndexFile.remove(urlHash);
+                        if (r != null) Fulltext.this.statsDump = null;
+                    }
+                } catch (final IOException e) {}
+            }};
         if (concurrently) t.start(); else t.run();
     }
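The reworked remove(deleteIDs, concurrently) short-circuits on an empty list, deletes the whole batch from Solr inside one synchronized block, and also drops the hashes from the URL metadata index; the old per-id remove() calls and the explicit solr.commit() are gone, presumably leaving the commit to the connector's normal commit cycle. Callers use it as HostBrowser does earlier in this patch:

    // Non-blocking: hand the batch to a background thread (concurrently == true), as HostBrowser does.
    if (deleteIDs.size() > 0) sb.index.fulltext().remove(deleteIDs, true);

    // Blocking variant: run the same deletion in the calling thread before continuing.
    sb.index.fulltext().remove(deleteIDs, false);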