From cc98496ff37c1a3170e5d4485c2518117bea344e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 16 Oct 2012 17:13:18 +0200 Subject: [PATCH] enhanced the HostBrowser: - showing also outbound links to other domains if there are any - the outbound links browser shows also the link structure image - showing even inbound links if the web structure graph has information about that - removed the left menu and made the HostBrowser a part of the top menu for search - moved the file search also to the top menu - added hover information in the HostBrowser to explain what the click means - because the HostBrowser also links to the Metadata viewer ViewFile, there should be a button to switch back to the HostBrowser: added that also. --- htroot/HostBrowser.html | 31 ++++++- htroot/HostBrowser.java | 89 +++++++++++++++---- htroot/ViewFile.html | 5 +- htroot/env/templates/header.template | 2 +- htroot/env/templates/simpleheader.template | 12 +-- htroot/yacyinteractive.html | 2 +- .../yacy/search/index/SolrConfiguration.java | 20 +++++ 7 files changed, 134 insertions(+), 27 deletions(-) diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html index 67e709594..e46b41792 100644 --- a/htroot/HostBrowser.html +++ b/htroot/HostBrowser.html @@ -59,7 +59,7 @@ function updatepage(str) { - #%env/templates/header.template%# + #%env/templates/simpleheader.template%#

Host Browser

Browse the index of #[ucount]# documents. Enter a host or an URL for a file list or select one of a list of hosts.

@@ -77,7 +77,7 @@ function updatepage(str) {
Host List #{list}#
-
#[host]#
+
#[host]#browse #[host]#
#[count]# URLs
#{/list}# @@ -103,7 +103,7 @@ function updatepage(str) { #{list}# #(type)# - +
Show Metadata
#[file]#  #(stored)# #(load)#::load & index#(/load)#:: @@ -122,7 +122,32 @@ function updatepage(str) {
#(/files)# + + #(outbound)#:: +
Outbound Links, outgoing from #[host]# - Host List + #{list}# +
+
#[host]#browse #[host]#
+
#[count]# URLs
+
+ #{/list}# +


+ +

+
+ #(/outbound)# + #(inbound)#:: +
Inbound Links, incoming to #[host]# - Host List + #{list}# +
+
#[host]#browse #[host]#
+
#[count]# URLs
+
+ #{/list}# +
+ #(/inbound)# + #(admin)#::
Administration Options diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 0cc31934d..7f77b8c6a 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.net.MalformedURLException; -import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; @@ -32,13 +31,18 @@ import java.util.concurrent.BlockingQueue; import org.apache.solr.common.SolrDocument; +import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.peers.graphics.WebStructureGraph.StructureEntry; import net.yacy.search.Switchboard; import net.yacy.search.index.Fulltext; import net.yacy.search.index.SolrConfiguration; @@ -152,35 +156,49 @@ public class HostBrowser { // generate file list from path DigestURI uri = new DigestURI(path); String host = uri.getHost(); + prop.putHTML("outbound_host", host); + prop.putHTML("inbound_host", host); + String hosthash = ASCII.String(uri.hash(), 6, 12); // get all files for a specific host from the index BlockingQueue docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000); SolrDocument doc; Set storedDocs = new HashSet(); - Set linkedDocs = new HashSet(); + Set inboundLinks = new HashSet(); + Map> outboundHosts = new HashMap>(); int hostsize = 0; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(YaCySchema.sku.name()); hostsize++; if (u.startsWith(path)) storedDocs.add(u); - Collection urlstub = doc.getFieldValues(YaCySchema.inboundlinks_urlstub_txt.name()); - Collection urlprot = urlstub == null ? null : SolrConfiguration.indexedList2protocolList(doc.getFieldValues(YaCySchema.inboundlinks_protocol_sxt.name()), urlstub.size()); - if (urlprot != null && urlstub != null) { - assert urlprot.size() == urlstub.size(); - Object[] urlprota = urlprot.toArray(); - Object[] urlstuba = urlstub.toArray(); - for (int i = 0; i < urlprota.length; i++) { - u = ((String) urlprota[i]) + "://" + ((String) urlstuba[i]); - int hp = u.indexOf('#'); - if (hp > 0) u = u.substring(0, hp); - if (u.startsWith(path) && !storedDocs.contains(u)) linkedDocs.add(u); - } + // collect inboundlinks to browse the host + Iterator links = SolrConfiguration.getLinks(doc, true); + while (links.hasNext()) { + u = links.next(); + if (u.startsWith(path) && !storedDocs.contains(u)) inboundLinks.add(u); + } + + // collect outboundlinks to browse to the outbound + links = SolrConfiguration.getLinks(doc, false); + while (links.hasNext()) { + u = links.next(); + try { + MultiProtocolURI mu = new MultiProtocolURI(u); + if (mu.getHost() != null) { + ReversibleScoreMap lks = outboundHosts.get(mu.getHost()); + if (lks == null) { + lks = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); + outboundHosts.put(mu.getHost(), lks); + } + lks.set(u, u.length()); + } + } catch (MalformedURLException e) {} } } // now combine both lists into one Map files = new HashMap(); for (String u: storedDocs) files.put(u, true); - for (String u: linkedDocs) if (!storedDocs.contains(u)) files.put(u, false); + for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, false); // distinguish files and folders Map list = new TreeMap(); @@ -235,6 +253,47 @@ public class HostBrowser { prop.put("files_hostsize", hostsize); prop.put("files_subpathsize", storedDocs.size()); prop.put("files", 1); + + // generate inbound-links table + StructureEntry struct = sb.webStructure.incomingReferences(hosthash); + if (struct != null && struct.references.size() > 0) { + maxcount = 200; + ReversibleScoreMap score = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); + for (Map.Entry entry: struct.references.entrySet()) score.set(entry.getKey(), entry.getValue()); + c = 0; + Iterator i = score.keys(false); + while (i.hasNext() && c < maxcount) { + host = i.next(); + prop.put("inbound_list_" + c + "_host", sb.webStructure.hostHash2hostName(host)); + prop.put("inbound_list_" + c + "_count", score.get(host)); + c++; + } + prop.put("inbound_list", c); + prop.put("inbound", 1); + } else { + prop.put("inbound", 0); + } + + // generate outbound-links table + if (outboundHosts.size() > 0) { + maxcount = 200; + ReversibleScoreMap score = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); + for (Map.Entry> entry: outboundHosts.entrySet()) score.set(entry.getKey(), entry.getValue().size()); + c = 0; + Iterator i = score.keys(false); + while (i.hasNext() && c < maxcount) { + host = i.next(); + prop.put("outbound_list_" + c + "_host", host); + prop.put("outbound_list_" + c + "_count", score.get(host)); + prop.put("outbound_list_" + c + "_link", outboundHosts.get(host).getMinKey()); + c++; + } + prop.put("outbound_list", c); + prop.put("outbound", 1); + } else { + prop.put("outbound", 0); + } + } catch (Throwable e) { Log.logException(e); } diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index d72d53d3b..19071955a 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -80,8 +80,9 @@ function updatepage(str) {
URL:
- - + + +
diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index a68bbdf6a..e08caf966 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -9,7 +9,7 @@