From c6a6f4c4e6472766229cd29db7f6677ac0a3fe68 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 5 Nov 2012 18:57:21 +0100 Subject: [PATCH] Added a hack which makes the HostBrowser more performant when the given host has a lot of URLs. If the number of URLs is > 1000 and the root path is selected, then the list of documents is restricted to those which have no subpath. However, this can cause a problem if no documents exist on the root path but only on paths below that root path. --- htroot/HostBrowser.html | 2 +- htroot/HostBrowser.java | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html index df8e51868..80bbfbaaf 100644 --- a/htroot/HostBrowser.html +++ b/htroot/HostBrowser.html @@ -76,7 +76,7 @@ function updatepage(str) {
Host List #{list}#
-
#[host]#browse #[host]#
+
#[host]#browse #[host]#
#[count]##(crawler)#::/#[pending]##(/crawler)# URLs
#{/list}# diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 18d3e68f9..39d0dea0d 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -167,7 +167,7 @@ public class HostBrowser { // delete the complete path!! That includes everything that matches with this prefix. delete = true; } - + int facetcount=post.getInt("facetcount", 0); boolean complete = post.getBoolean("complete"); if (complete) { // we want only root paths for complete lists p = path.indexOf('/', 10); @@ -192,9 +192,17 @@ public class HostBrowser { String[] pathparts = uri.getPaths(); // get all files for a specific host from the index - String query = YaCySchema.host_s.name() + ":" + host; - for (String pe: pathparts) if (pe.length() > 0) query += " AND " + YaCySchema.url_paths_sxt.name() + ":" + pe; - BlockingQueue docs = fulltext.getSolr().concurrentQuery(query, 0, 100000, 3000, 100); + StringBuilder q = new StringBuilder(); + q.append(YaCySchema.host_s.name()).append(':').append(host); + if (pathparts.length > 0 && pathparts[0].length() > 0) { + for (String pe: pathparts) { + if (pe.length() > 0) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(':').append(pe); + } + } else { + if (facetcount > 1000 && !post.containsKey("nepr")) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(":[* TO *]"); + } + q.append(" AND -").append(YaCySchema.failreason_t.name()).append(":[* TO *]"); + BlockingQueue docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, 3000, 100); SolrDocument doc; Set storedDocs = new HashSet(); Set inboundLinks = new HashSet();