update to HostBrowser:

- time out after 3 seconds to speed up the display (the result may be incomplete)
- also show all links from the balancer queue, both in the host list (after
  the '/') and in the result browser view with the tag 'loading'
Michael Peter Christen 12 years ago
parent e2c4c3c7d3
commit 75dd706e1b
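
The 3-second time-out below works by draining a poison-terminated BlockingQueue only until a deadline passes. A minimal standalone sketch of that pattern, with assumed names (DeadlineDrain and POISON are illustrative, not YaCy code):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class DeadlineDrain {
    // sentinel terminating the stream, compared by identity like POISON_DOCUMENT
    static final String POISON = new String("POISON");

    static List<String> drain(BlockingQueue<String> queue, long maxtime) throws InterruptedException {
        final List<String> results = new ArrayList<String>();
        final long timeout = System.currentTimeMillis() + maxtime;
        String element;
        while ((element = queue.take()) != POISON) {
            results.add(element);
            // the deadline is only checked after each received element; if the
            // producer stalls without sending the poison, take() still blocks
            if (System.currentTimeMillis() > timeout) break; // result may be incomplete
        }
        return results;
    }

    public static void main(String[] args) throws InterruptedException {
        BlockingQueue<String> q = new LinkedBlockingQueue<String>();
        q.put("a"); q.put("b"); q.put(POISON);
        System.out.println(drain(q, 3000)); // prints [a, b]
    }
}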

@@ -77,7 +77,7 @@ function updatepage(str) {
 #{list}#
 <div style="float:left; padding:1px 5px 1px 5px;">
 <div style="width:160px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><div id="info"><a href="/HostBrowser.html?path=#[host]#">#[host]#</a><span>browse #[host]#</span></div></div>
-<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]# URLs</div>
+<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]##(crawler)#::/#[pending]##(/crawler)# URLs</div>
 </div>
 #{/list}#
 </fieldset>

@@ -41,6 +41,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.sorting.ClusteredScoreMap;
 import net.yacy.cora.sorting.ReversibleScoreMap;
+import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -130,8 +131,17 @@ public class HostBrowser {
 if (post.containsKey("hosts")) {
 // generate host list
 try {
-int maxcount = 200;
+int maxcount = 360; // == 6!/2 which makes nice matrices for 3, 4, 5, 6 rows/columns
+// collect from index
 ReversibleScoreMap<String> score = fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount);
+// collect from crawler
+final Map<String, Integer[]> crawler = (admin) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();
+for (Map.Entry<String, Integer[]> host: crawler.entrySet()) {
+score.inc(host.getKey(), host.getValue()[0]);
+}
 int c = 0;
 Iterator<String> i = score.keys(false);
 String host;
@@ -139,6 +149,9 @@
 host = i.next();
 prop.put("hosts_list_" + c + "_host", host);
 prop.put("hosts_list_" + c + "_count", score.get(host));
+boolean inCrawler = crawler.containsKey(host);
+prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
+if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
 c++;
 }
 prop.put("hosts_list", c);
@@ -166,9 +179,8 @@
 if (p < 8) {
 prop.put("files_root", 1);
 } else {
-path = path.substring(0, p + 1);
 prop.put("files_root", 0);
-prop.put("files_root_path", path);
+prop.put("files_root_path", path.substring(0, p + 1));
 }
 try {
 // generate file list from path
@@ -179,13 +191,14 @@
 String hosthash = ASCII.String(uri.hash(), 6, 6);
 // get all files for a specific host from the index
-BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 3000, 100);
 SolrDocument doc;
 Set<String> storedDocs = new HashSet<String>();
 Set<String> inboundLinks = new HashSet<String>();
 Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
 int hostsize = 0;
 final List<byte[]> deleteIDs = new ArrayList<byte[]>();
+long timeout = System.currentTimeMillis() + 3000;
 while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
 String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
 hostsize++;
@@ -221,10 +234,16 @@
 }
 } catch (MalformedURLException e) {}
 }
+if (System.currentTimeMillis() > timeout) break;
 }
 if (deleteIDs.size() > 0) sb.index.fulltext().remove(deleteIDs, true);
-// now combine both lists into one
+// collect from crawler
+List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000);
+Set<String> loadingLinks = new HashSet<String>();
+for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));
+// now combine all lists into one
 Map<String, Boolean> files = new HashMap<String, Boolean>();
 for (String u: storedDocs) files.put(u, true);
 for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, false);
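
Three link sources are combined for the browser view: stored documents, inbound links not yet stored, and links still waiting in the balancer queue. A standalone sketch of the merge and of the stored/loading/link state written to the template in the next hunk; note that the line merging loadingLinks into the files map is not visible in this (possibly truncated) diff and is assumed here:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class FileListMerge {
    public static void main(String[] args) {
        Set<String> storedDocs = new HashSet<String>();
        storedDocs.add("http://example.org/a.html");
        Set<String> inboundLinks = new HashSet<String>();
        inboundLinks.add("http://example.org/a.html"); // already stored
        inboundLinks.add("http://example.org/b.html"); // known link only
        Set<String> loadingLinks = new HashSet<String>();
        loadingLinks.add("http://example.org/c.html"); // still in the balancer queue

        Map<String, Boolean> files = new HashMap<String, Boolean>(); // url -> stored?
        for (String u : storedDocs) files.put(u, Boolean.TRUE);
        for (String u : inboundLinks) if (!storedDocs.contains(u)) files.put(u, Boolean.FALSE);
        for (String u : loadingLinks) if (!files.containsKey(u)) files.put(u, Boolean.FALSE); // assumed merge step

        // 1 = stored, 2 = loading, 0 = link only (the values the next hunk writes)
        for (Map.Entry<String, Boolean> e : files.entrySet()) {
            boolean indexed = e.getValue().booleanValue();
            boolean loading = loadingLinks.contains(e.getKey());
            System.out.println(e.getKey() + " -> " + (indexed ? 1 : loading ? 2 : 0));
        }
    }
}
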
@@ -268,8 +287,7 @@
 prop.put("files_list_" + c + "_type_url", entry.getKey());
 boolean indexed = ((Boolean) entry.getValue()).booleanValue();
 try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
-boolean loading = load.equals(entry.getKey()) ||
-(uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
+boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
 //String failr = fulltext.failReason(ASCII.String(uri.hash()));
 prop.put("files_list_" + c + "_type_stored", indexed ? 1 : loading ? 2 : 0);
 prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);

@@ -131,7 +131,7 @@ public class IndexCreateQueues_p {
 prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
 prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
 prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1] == Integer.MIN_VALUE ? "not accessed" : Integer.toString(host.getValue()[1]));
-List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
+List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost, 10000);
 Seed initiator;
 String profileHandle;

@@ -22,6 +22,7 @@ package net.yacy.cora.federate.solr.connector;
 import java.io.IOException;
 import java.util.Iterator;
+import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
@@ -67,10 +68,21 @@ public abstract class AbstractSolrConnector implements SolrConnector {
 return false;
 }
 }
+/**
+ * Get a query result from solr as a stream of documents.
+ * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
+ * The method returns immediately and feeds the search results into the queue
+ * @param querystring the solr query string
+ * @param offset first result offset
+ * @param maxcount the maximum number of results
+ * @param maxtime the maximum time in milliseconds
+ * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+ * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+ */
 @Override
-public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime) {
-final BlockingQueue<SolrDocument> queue = new LinkedBlockingQueue<SolrDocument>();
+public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize) {
+final BlockingQueue<SolrDocument> queue = buffersize <= 0 ? new LinkedBlockingQueue<SolrDocument>() : new ArrayBlockingQueue<SolrDocument>(buffersize);
 final long endtime = System.currentTimeMillis() + maxtime;
 final Thread t = new Thread() {
 @Override
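
The new buffersize parameter trades memory for backpressure: a bounded ArrayBlockingQueue makes the producer thread block once the consumer falls behind, while buffersize <= 0 keeps the previous unbounded LinkedBlockingQueue behaviour. A standalone sketch of that contract (names are illustrative, not YaCy code):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class QueueChoice {
    static <T> BlockingQueue<T> makeQueue(int buffersize) {
        // bounded queue = backpressure and bounded memory; <= 0 = unbounded
        return buffersize <= 0 ? new LinkedBlockingQueue<T>() : new ArrayBlockingQueue<T>(buffersize);
    }

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = makeQueue(100);
        final String poison = new String("POISON"); // sentinel, compared by identity
        new Thread() {
            public void run() {
                try {
                    for (int i = 0; i < 1000; i++) queue.put("doc-" + i); // blocks at 100 pending
                    queue.put(poison);
                } catch (InterruptedException e) { Thread.currentThread().interrupt(); }
            }
        }.start();
        String doc;
        int n = 0;
        while ((doc = queue.take()) != poison) n++;
        System.out.println(n + " documents consumed");
    }
}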

@@ -146,12 +146,14 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
 * Get a query result from solr as a stream of documents.
 * The result queue is considered as terminated if AbstractSolrConnector.POISON_DOCUMENT is returned.
 * The method returns immediately and feeds the search results into the queue
-* @param querystring
-* @param offset
-* @param maxcount
-* @return
-*/
-public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime);
+* @param querystring the solr query string
+* @param offset first result offset
+* @param maxcount the maximum number of results
+* @param maxtime the maximum time in milliseconds
+* @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used
+* @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element
+*/
+public BlockingQueue<SolrDocument> concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize);
 /**
 * get a document id result stream from a solr query.

@@ -339,16 +339,18 @@ public class Balancer {
 * get lists of crawl request entries for a specific host
 * @param host
 * @param maxcount
+* @param maxtime
 * @return a list of crawl loader requests
 */
-public List<Request> getDomainStackReferences(String host, int maxcount) {
-HostHandles hh = this.domainStacks.get(host);
+public List<Request> getDomainStackReferences(final String host, int maxcount, final long maxtime) {
+final HostHandles hh = this.domainStacks.get(host);
 if (hh == null) return new ArrayList<Request>(0);
-HandleSet domainList = hh.handleSet;
+final HandleSet domainList = hh.handleSet;
 if (domainList.isEmpty()) return new ArrayList<Request>(0);
-ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+maxcount = Math.min(maxcount, domainList.size());
+final ArrayList<Request> cel = new ArrayList<Request>(maxcount);
+long timeout = System.currentTimeMillis() + maxtime;
 for (int i = 0; i < maxcount; i++) {
-if (domainList.size() <= i) break;
 final byte[] urlhash = domainList.getOne(i);
 if (urlhash == null) continue;
 Row.Entry rowEntry;
@@ -365,6 +367,7 @@ public class Balancer {
 continue;
 }
 cel.add(crawlEntry);
+if (System.currentTimeMillis() > timeout) break;
 }
 return cel;
 }
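
getDomainStackReferences is now bounded twice, by entry count and by wall-clock time, with the deadline checked after each accepted entry so at least one request can be returned. A self-contained sketch of this selection loop with illustrative types:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class BoundedSelect {
    static List<String> select(List<String> queue, int maxcount, long maxtime) {
        maxcount = Math.min(maxcount, queue.size());
        final List<String> out = new ArrayList<String>(maxcount);
        final long timeout = System.currentTimeMillis() + maxtime;
        for (int i = 0; i < maxcount; i++) {
            out.add(queue.get(i));
            // checked after each add: a huge per-host queue cannot stall the
            // caller, but at least one entry is always returned
            if (System.currentTimeMillis() > timeout) break;
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(select(Arrays.asList("a", "b", "c"), 2, 1000)); // [a, b]
    }
}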

@@ -248,12 +248,12 @@ public class NoticedURL {
 * @param maxcount
 * @return a list of crawl loader requests
 */
-public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
+public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount, final long maxtime) {
 switch (stackType) {
-case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount);
-case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
-case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
-case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
+case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount, maxtime);
+case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount, maxtime);
+case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount, maxtime);
+case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount, maxtime);
 default: return null;
 }
 }

@@ -318,7 +318,7 @@ public final class Fulltext implements Iterable<byte[]> {
 final String host = uri.getHost();
 Thread t = new Thread(){
 public void run() {
-final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+final BlockingQueue<SolrDocument> docs = getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 1000000, 600000, -1);
 try {
 SolrDocument doc;
 boolean removed = false;
@@ -342,12 +342,25 @@ public final class Fulltext implements Iterable<byte[]> {
 * @param concurrently if true, then the method returns immediately and runs concurrently
 */
 public void remove(final List<byte[]> deleteIDs, final boolean concurrently) {
+if (deleteIDs == null || deleteIDs.size() == 0) return;
 Thread t = new Thread() {
 public void run() {
-for (byte[] id: deleteIDs) {remove(id);}
-Fulltext.this.solr.commit();
-}
-};
+try {
+synchronized (Fulltext.this.solr) {
+for (byte[] urlHash: deleteIDs) {
+Fulltext.this.solr.delete(ASCII.String(urlHash));
+}
+}
+} catch (final Throwable e) {
+Log.logException(e);
+}
+if (Fulltext.this.urlIndexFile != null) try {
+for (byte[] urlHash: deleteIDs) {
+final Row.Entry r = Fulltext.this.urlIndexFile.remove(urlHash);
+if (r != null) Fulltext.this.statsDump = null;
+}
+} catch (final IOException e) {}
+}};
 if (concurrently) t.start(); else t.run();
 }
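
remove(...) keeps the run-or-start idiom: the deletion work is wrapped in a Thread, then either executed inline or dispatched according to the concurrently flag. A minimal sketch of the idiom with stand-in work (not the real solr/urlIndexFile deletes):

import java.util.Arrays;
import java.util.List;

public class RunOrStart {
    static void remove(final List<String> ids, boolean concurrently) {
        Thread t = new Thread() {
            public void run() {
                for (String id : ids) System.out.println("delete " + id); // stand-in for the removal work
            }
        };
        if (concurrently) t.start(); else t.run(); // run() executes in the calling thread, no new thread
    }

    public static void main(String[] args) {
        remove(Arrays.asList("hash1", "hash2"), false); // synchronous: blocks until done
        remove(Arrays.asList("hash3"), true);           // concurrent: returns immediately
    }
}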
