From f45f7fc12e18ae6936b62767803035097c8f1ef8 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 28 Sep 2012 22:45:16 +0200 Subject: [PATCH] added new Host Browser to main menu: this new search interface is something completely new for search, but completely common on desktops: browse a web space like one would browse a file system in a file browser. The file listing is created using the search index and a faceted restriction to specific domains. --- defaults/solr.keys.list | 10 +- htroot/HostBrowser.html | 106 ++++---- htroot/HostBrowser.java | 238 ++++++++++++------ htroot/Ranking_p.html | 6 +- htroot/env/templates/header.template | 1 + .../yacy/cora/document/MultiProtocolURI.java | 30 ++- .../solr/connector/SolrServerConnector.java | 3 +- source/net/yacy/search/Switchboard.java | 7 +- 8 files changed, 253 insertions(+), 148 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index e3d951a6d..67f463977 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -178,10 +178,10 @@ h6_txt #inboundlinks_tag_txt ## internal links, only the protocol -#inboundlinks_protocol_sxt +inboundlinks_protocol_sxt ## internal links, the url only without the protocol -#inboundlinks_urlstub_txt +inboundlinks_urlstub_txt ## internal links, the name property of the a-tag #inboundlinks_name_txt @@ -208,10 +208,10 @@ h6_txt #outboundlinks_tag_txt ## external links, only the protocol -#outboundlinks_protocol_sxt +outboundlinks_protocol_sxt ## external links, the url only without the protocol -#outboundlinks_urlstub_txt +outboundlinks_urlstub_txt ## external links, the name property of the a-tag #outboundlinks_name_txt @@ -325,7 +325,7 @@ host_s #host_dnc_s ## either the second level domain or, if a ccSLD is used, the third level domain -#host_organization_s +host_organization_s ## the organization and dnc concatenated with '.' 
#host_organizationdnc_s diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html index 036867997..4cd36e60c 100644 --- a/htroot/HostBrowser.html +++ b/htroot/HostBrowser.html @@ -8,7 +8,7 @@ //URL from index (total results = " + totalResults + ")<\/td>"; for (var i = 0; i < firstChannel.items.length; i++) { item = firstChannel.items[i]; - html += "" + item.link + "<\/a><\/td>"; + html += "" + item.link + "<\/a><\/td>"; } html += "<\/table>"; } @@ -60,58 +60,62 @@ function updatepage(str) { #%env/templates/header.template%# - #%env/templates/submenuIndexControl.template%# -

URL References Administration

-

The local index currently contains #[ucount]# URL references

-
-
URL Retrieval -
- -
Retrieve by URL:
-
-
-
-
- -
+

Host Browser

+

Browse the index of #[ucount]# documents. Enter a host or a URL for a file list or select one of a list of hosts.

+ +
+ Host/URL: + +
+
-
+ #[result]# - #(statisticslines)#:: -

Statistics about the top-#[domains]# domains in the database:

- - - - - - #{domains}# - - - - - #{/domains}# -
DomainURLs
#[domain]##[count]#
- #(/statisticslines)# - - - #(genUrlProfile)# - ::No entry found for URL-hash #[urlhash]# - ::
-
- - API - These document details can be retrieved as XHTML+RDFa - document containg RDF annotations in Dublin Core vocabulary. - The XHTML+RDFa data format is both a XML content format and a HTML display format and is considered as an important Semantic Web content format. - The same content can also be retrieved as pure XML metadata with DC tag name vocabulary. - Click the API icon to see an example call to the search rss API. - To see a list of all APIs, please visit the API wiki page. -
- #(/genUrlProfile)# + #(hosts)#:: +
Host List + #{list}# +
+ +
#[count]# URLs
+
+ #{/list}# +
+ #(/hosts)# - #[result]# + #(files)#:: +
Files in #[path]# +

Documents in domain: #[hostsize]#; Documents in subpath: #[subpathsize]#

+ + + + + + + #(root)# + + + + + :: + #(/root)# + #{list}# + #(type)# + + + + :: + + + + + + #(/type)# + #{/list}# +
PathDocuments
..
#[file]#
#[file]##[count]#
+
+ #(/files)# #%env/templates/footer.template%# diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 5072cf25b..ab4f68784 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -1,107 +1,197 @@ +/** + * HostBrowser + * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 27.09.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ -import java.net.MalformedURLException; +import java.io.IOException; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.atomic.AtomicInteger; -import net.yacy.cora.document.ASCII; +import org.apache.solr.common.SolrDocument; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.federate.solr.YaCySchema; +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadata; -import net.yacy.kelondro.data.word.Word; +import net.yacy.cora.sorting.ReversibleScoreMap; +import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; -import net.yacy.search.index.Segment; +import net.yacy.search.index.Fulltext; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; public class HostBrowser { - public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; + Fulltext fulltext = sb.index.fulltext(); + final boolean searchAllowed = sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header); final serverObjects prop = new serverObjects(); - - Segment segment = sb.index; - + // set default values - prop.put("urlstring", ""); - prop.put("urlhash", ""); + prop.put("path", ""); prop.put("result", ""); - prop.putNum("ucount", segment.fulltext().size()); - prop.put("otherHosts", ""); - prop.put("genUrlProfile", 0); - prop.put("statistics", 1); - prop.put("statistics_lines", 100); - 
prop.put("statisticslines", 0); + prop.putNum("ucount", fulltext.size()); + prop.put("hosts", 0); + prop.put("files", 0); + if (!searchAllowed) { + prop.put("result", "You are not allowed to use this page. Please ask an administrator for permission."); + return prop; + } + if (post == null || env == null) { - return prop; // nothing to do + return prop; } - // post values that are set on numerous input fields with same name - String urlstring = post.get("urlstring", "").trim(); + String path = post.get("path", "").trim(); + int p = path.lastIndexOf('/'); + if (p < 0 && path.length() > 0) path = path + "/"; else if (p > 7) path = path.substring(0, p + 1); // the search path shall always end with "/" + if (path.length() > 0 && ( + !path.startsWith("http://") && + !path.startsWith("https://") && + !path.startsWith("ftp://") && + !path.startsWith("smb://") && + !path.startsWith("file://"))) { path = "http://" + path; } + prop.putHTML("path", path); - if (!urlstring.startsWith("http://") && - !urlstring.startsWith("https://") && - !urlstring.startsWith("ftp://") && - !urlstring.startsWith("smb://") && - !urlstring.startsWith("file://")) { urlstring = "http://" + urlstring; } - - prop.putHTML("urlstring", urlstring); - prop.put("result", " "); + if (post.containsKey("hosts")) { + // generate host list + try { + int maxcount = 200; + ReversibleScoreMap score = fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount); + int c = 0; + Iterator i = score.keys(false); + String host; + while (i.hasNext() && c < maxcount) { + host = i.next(); + prop.put("hosts_list_" + c + "_host", host); + prop.put("hosts_list_" + c + "_count", score.get(host)); + c++; + } + prop.put("hosts_list", c); + prop.put("hosts", 1); + } catch (IOException e) { + Log.logException(e); + } + } + + if (path.length() > 0) { - if (post.containsKey("urlstringsearch")) { + p = path.substring(0, path.length() - 1).lastIndexOf('/'); + if (p < 8) { + prop.put("files_root", 1); + } else { + 
prop.put("files_root", 0); + prop.put("files_root_path", path.substring(0, p + 1)); + } try { - final DigestURI url = new DigestURI(urlstring); - String urlhash = ASCII.String(url.hash()); - prop.put("urlhash", urlhash); - final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash)); - if (entry == null) { - prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true)); - prop.putHTML("urlstring", urlstring); - prop.put("urlhash", ""); - } else { - prop.putAll(genUrlProfile(segment, entry, urlhash)); - prop.put("statistics", 0); + // generate file list from path + MultiProtocolURI uri = new MultiProtocolURI(path); + String host = uri.getHost(); + + // get all files for a specific host from the index + BlockingQueue docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000); + SolrDocument doc; + Set storedDocs = new HashSet(); + Set linkedDocs = new HashSet(); + int hostsize = 0; + while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + String u = (String) doc.getFieldValue(YaCySchema.sku.name()); + hostsize++; + if (u.startsWith(path)) storedDocs.add(u); + Collection urlprot = doc.getFieldValues(YaCySchema.inboundlinks_protocol_sxt.name()); + Collection urlstub = doc.getFieldValues(YaCySchema.inboundlinks_urlstub_txt.name()); + if (urlprot != null && urlstub != null) { + assert urlprot.size() == urlstub.size(); + Object[] urlprota = urlprot.toArray(); + Object[] urlstuba = urlstub.toArray(); + for (int i = 0; i < urlprota.length; i++) { + u = ((String) urlprota[i]) + "://" + ((String) urlstuba[i]); + if (u.startsWith(path) && !storedDocs.contains(u)) linkedDocs.add(u); + } + } + } + // now combine both lists into one + Map files = new HashMap(); + for (String u: storedDocs) files.put(u, true); + for (String u: linkedDocs) if (!storedDocs.contains(u)) files.put(u, false); + + // distinguish files and folders + Map list = new TreeMap(); + for (String url: files.keySet()) { + 
String file = url.substring(path.length()); + p = file.indexOf('/'); + if (p < 0) { + // this is a file in the root path + list.put(url, files.get(url)); // Boolean value: this is a file + } else { + // this is a directory path + String dir = path + file.substring(0, p + 1); + Object c = list.get(dir); + if (c == null) { + list.put(dir, new AtomicInteger(1)); + } else if (c instanceof AtomicInteger) { + ((AtomicInteger) c).incrementAndGet(); + } + } + } + + int maxcount = 1000; + int c = 0; + for (Map.Entry entry: list.entrySet()) { + if (entry.getValue() instanceof Boolean) { + // this is a file + prop.put("files_list_" + c + "_type", 0); + prop.put("files_list_" + c + "_type_file", entry.getKey()); + prop.put("files_list_" + c + "_type_stored", ((Boolean) entry.getValue()).booleanValue() ? 1 : 0); + } else { + // this is a folder + prop.put("files_list_" + c + "_type", 1); + prop.put("files_list_" + c + "_type_file", entry.getKey()); + prop.put("files_list_" + c + "_type_count", ((AtomicInteger) entry.getValue()).intValue()); + } + if (++c >= maxcount) break; } - } catch (final MalformedURLException e) { - prop.putHTML("result", "bad url: " + urlstring); - prop.put("urlhash", ""); + prop.put("files_list", c); + prop.putHTML("files_path", path); + prop.put("files_hostsize", hostsize); + prop.put("files_subpathsize", storedDocs.size()); + prop.put("files", 1); + } catch (Throwable e) { + Log.logException(e); } - prop.put("lurlexport", 0); } // insert constants - prop.putNum("ucount", segment.fulltext().size()); + prop.putNum("ucount", fulltext.size()); // return rewrite properties return prop; } - private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) { - final serverObjects prop = new serverObjects(); - if (entry == null) { - prop.put("genUrlProfile", "1"); - prop.put("genUrlProfile_urlhash", urlhash); - return prop; - } - final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length 
!= Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash()); - if (entry.url() == null) { - prop.put("genUrlProfile", "1"); - prop.put("genUrlProfile_urlhash", urlhash); - return prop; - } - prop.put("genUrlProfile", "2"); - prop.putHTML("genUrlProfile_urlNormalform", entry.url().toNormalform(false, true)); - prop.put("genUrlProfile_urlhash", urlhash); - prop.put("genUrlProfile_urlDescr", entry.dc_title()); - prop.put("genUrlProfile_moddate", entry.moddate().toString()); - prop.put("genUrlProfile_loaddate", entry.loaddate().toString()); - prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1); - prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "" : le.url().toNormalform(false, true)); - prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : ASCII.String(le.hash())); - prop.put("genUrlProfile_doctype", String.valueOf(entry.doctype())); - prop.put("genUrlProfile_language", entry.language()); - prop.put("genUrlProfile_size", entry.size()); - prop.put("genUrlProfile_wordCount", entry.wordCount()); - return prop; - } } diff --git a/htroot/Ranking_p.html b/htroot/Ranking_p.html index fba2ac1c8..f506672fc 100644 --- a/htroot/Ranking_p.html +++ b/htroot/Ranking_p.html @@ -7,7 +7,7 @@ $(function() { $("select").each(function(){ var name = $(this).attr("name"); - $("
").insertAfter($(this)).slider({ + $("
").insertAfter($(this)).slider({ min: 0, max: 15, range: "min", @@ -38,8 +38,8 @@ Pre-Ranking
#{attrPre}#
 info#[info]#
-
- #{select}# #{/select}#
#{/attrPre}# diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index 4678f2aa6..d07c6e4b4 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -9,6 +9,7 @@