added a hack which makes the HostBrowser more performant when the given

host has a lot of URLs. If the number of URLs is > 1000 and the root
path is selected, then the list of documents is restricted to those
which have no subpath. However, this can cause a problem if no documents
exist on the root path itself but only on paths below that root path.
pull/1/head
Michael Peter Christen 12 years ago
parent 619bf7e875
commit c6a6f4c4e6

@ -76,7 +76,7 @@ function updatepage(str) {
<fieldset><legend>Host List</legend>
#{list}#
<div style="float:left; padding:1px 5px 1px 5px;">
<div style="width:160px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><div id="info"><a href="/HostBrowser.html?path=#[host]#">#[host]#</a><span>browse #[host]#</span></div></div>
<div style="width:160px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><div id="info"><a href="/HostBrowser.html?path=#[host]#&facetcount=#[count]#">#[host]#</a><span>browse #[host]#</span></div></div>
<div style="width:80px; text-align:right; float: left; white-space:nowrap; overflow:hidden;">#[count]##(crawler)#::/#[pending]##(/crawler)# URLs</div>
</div>
#{/list}#

@ -167,7 +167,7 @@ public class HostBrowser {
// delete the complete path!! That includes everything that matches with this prefix.
delete = true;
}
int facetcount=post.getInt("facetcount", 0);
boolean complete = post.getBoolean("complete");
if (complete) { // we want only root paths for complete lists
p = path.indexOf('/', 10);
@ -192,9 +192,17 @@ public class HostBrowser {
String[] pathparts = uri.getPaths();
// get all files for a specific host from the index
String query = YaCySchema.host_s.name() + ":" + host;
for (String pe: pathparts) if (pe.length() > 0) query += " AND " + YaCySchema.url_paths_sxt.name() + ":" + pe;
BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(query, 0, 100000, 3000, 100);
StringBuilder q = new StringBuilder();
q.append(YaCySchema.host_s.name()).append(':').append(host);
if (pathparts.length > 0 && pathparts[0].length() > 0) {
for (String pe: pathparts) {
if (pe.length() > 0) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(':').append(pe);
}
} else {
if (facetcount > 1000 && !post.containsKey("nepr")) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(":[* TO *]");
}
q.append(" AND -").append(YaCySchema.failreason_t.name()).append(":[* TO *]");
BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, 3000, 100);
SolrDocument doc;
Set<String> storedDocs = new HashSet<String>();
Set<String> inboundLinks = new HashSet<String>();

Loading…
Cancel
Save