this new search interface is something completely new for search, but completely common on desktops: browse a web space like one would browse a file system in a file browser. The file listing is created using the search index and a faceted restriction to specific domains.
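The "faceted restriction" mentioned here is an ordinary Solr facet over the host field of the index. The host overview that the servlet builds through YaCy's own wrapper (fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount) in the diff below) can be sketched with plain SolrJ. This is illustrative only: it assumes a reachable Solr core (the URL is hypothetical) and uses the modern HttpSolrClient API rather than YaCy's embedded connector; "host_s" is the actual host field name from YaCy's Solr schema.

import java.util.List;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;

public class HostFacetSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical core URL; YaCy talks to an embedded Solr instead
        SolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr/yacy").build();

        // match everything, return no documents: only the facet counts matter
        SolrQuery q = new SolrQuery("*:*");
        q.setRows(0);
        q.setFacet(true);
        q.addFacetField("host_s");  // host field of YaCy's schema
        q.setFacetLimit(200);       // same cap as maxcount in the servlet

        QueryResponse rsp = client.query(q);
        FacetField hosts = rsp.getFacetField("host_s");
        List<FacetField.Count> counts = hosts.getValues();
        // one line per host, analogous to hosts_list_N_host / hosts_list_N_count
        for (FacetField.Count c : counts) {
            System.out.println(c.getName() + "\t" + c.getCount());
        }
        client.close();
    }
}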
parent 8556a3d521
commit f45f7fc12e

@@ -1,107 +1,197 @@
 /**
  * HostBrowser
  * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
  * First released 27.09.2012 at http://yacy.net
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License
  * along with this program in the file lgpl21.txt
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-import java.net.MalformedURLException;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
 
-import net.yacy.cora.document.ASCII;
+import org.apache.solr.common.SolrDocument;
 
+import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.federate.solr.YaCySchema;
+import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.protocol.RequestHeader;
-import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.meta.URIMetadata;
-import net.yacy.kelondro.data.word.Word;
+import net.yacy.cora.sorting.ReversibleScoreMap;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
+import net.yacy.search.index.Fulltext;
 import net.yacy.server.serverObjects;
 import net.yacy.server.serverSwitch;
 
 public class HostBrowser {
 
-    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
+    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
         // return variable that accumulates replacements
         final Switchboard sb = (Switchboard) env;
+        Fulltext fulltext = sb.index.fulltext();
+        final boolean searchAllowed = sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header);
 
         final serverObjects prop = new serverObjects();
 
         Segment segment = sb.index;
 
         // set default values
-        prop.put("urlstring", "");
-        prop.put("urlhash", "");
+        prop.put("path", "");
         prop.put("result", "");
-        prop.putNum("ucount", segment.fulltext().size());
-        prop.put("otherHosts", "");
-        prop.put("genUrlProfile", 0);
-        prop.put("statistics", 1);
-        prop.put("statistics_lines", 100);
-        prop.put("statisticslines", 0);
+        prop.putNum("ucount", fulltext.size());
+        prop.put("hosts", 0);
+        prop.put("files", 0);
 
+        if (!searchAllowed) {
+            prop.put("result", "You are not allowed to use this page. Please ask an administrator for permission.");
+            return prop;
+        }
 
         if (post == null || env == null) {
-            return prop; // nothing to do
+            return prop;
         }
 
         // post values that are set on numerous input fields with same name
-        String urlstring = post.get("urlstring", "").trim();
+        String path = post.get("path", "").trim();
+        int p = path.lastIndexOf('/');
+        if (p < 0 && path.length() > 0) path = path + "/"; else if (p > 7) path = path.substring(0, p + 1); // the search path shall always end with "/"
+        if (path.length() > 0 && (
+                !path.startsWith("http://") &&
+                !path.startsWith("https://") &&
+                !path.startsWith("ftp://") &&
+                !path.startsWith("smb://") &&
+                !path.startsWith("file://"))) { path = "http://" + path; }
+        prop.putHTML("path", path);
 
-        if (!urlstring.startsWith("http://") &&
-                !urlstring.startsWith("https://") &&
-                !urlstring.startsWith("ftp://") &&
-                !urlstring.startsWith("smb://") &&
-                !urlstring.startsWith("file://")) { urlstring = "http://" + urlstring; }
-
-        prop.putHTML("urlstring", urlstring);
-        prop.put("result", " ");
+        if (post.containsKey("hosts")) {
+            // generate host list
+            try {
+                int maxcount = 200;
+                ReversibleScoreMap<String> score = fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount);
+                int c = 0;
+                Iterator<String> i = score.keys(false);
+                String host;
+                while (i.hasNext() && c < maxcount) {
+                    host = i.next();
+                    prop.put("hosts_list_" + c + "_host", host);
+                    prop.put("hosts_list_" + c + "_count", score.get(host));
+                    c++;
+                }
+                prop.put("hosts_list", c);
+                prop.put("hosts", 1);
+            } catch (IOException e) {
+                Log.logException(e);
+            }
+        }
 
+        if (path.length() > 0) {
-        if (post.containsKey("urlstringsearch")) {
+            p = path.substring(0, path.length() - 1).lastIndexOf('/');
+            if (p < 8) {
+                prop.put("files_root", 1);
+            } else {
+                prop.put("files_root", 0);
+                prop.put("files_root_path", path.substring(0, p + 1));
+            }
             try {
-                final DigestURI url = new DigestURI(urlstring);
-                String urlhash = ASCII.String(url.hash());
-                prop.put("urlhash", urlhash);
-                final URIMetadata entry = segment.fulltext().getMetadata(ASCII.getBytes(urlhash));
-                if (entry == null) {
-                    prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
-                    prop.putHTML("urlstring", urlstring);
-                    prop.put("urlhash", "");
-                } else {
-                    prop.putAll(genUrlProfile(segment, entry, urlhash));
-                    prop.put("statistics", 0);
-                }
+                // generate file list from path
+                MultiProtocolURI uri = new MultiProtocolURI(path);
+                String host = uri.getHost();
+
+                // get all files for a specific host from the index
+                BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(YaCySchema.host_s.name() + ":" + host, 0, 100000, 60000);
+                SolrDocument doc;
+                Set<String> storedDocs = new HashSet<String>();
+                Set<String> linkedDocs = new HashSet<String>();
+                int hostsize = 0;
+                while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                    String u = (String) doc.getFieldValue(YaCySchema.sku.name());
+                    hostsize++;
+                    if (u.startsWith(path)) storedDocs.add(u);
+                    Collection<Object> urlprot = doc.getFieldValues(YaCySchema.inboundlinks_protocol_sxt.name());
+                    Collection<Object> urlstub = doc.getFieldValues(YaCySchema.inboundlinks_urlstub_txt.name());
+                    if (urlprot != null && urlstub != null) {
+                        assert urlprot.size() == urlstub.size();
+                        Object[] urlprota = urlprot.toArray();
+                        Object[] urlstuba = urlstub.toArray();
+                        for (int i = 0; i < urlprota.length; i++) {
+                            u = ((String) urlprota[i]) + "://" + ((String) urlstuba[i]);
+                            if (u.startsWith(path) && !storedDocs.contains(u)) linkedDocs.add(u);
+                        }
+                    }
+                }
+                // now combine both lists into one
+                Map<String, Boolean> files = new HashMap<String, Boolean>();
+                for (String u: storedDocs) files.put(u, true);
+                for (String u: linkedDocs) if (!storedDocs.contains(u)) files.put(u, false);
+
+                // distinguish files and folders
+                Map<String, Object> list = new TreeMap<String, Object>();
+                for (String url: files.keySet()) {
+                    String file = url.substring(path.length());
+                    p = file.indexOf('/');
+                    if (p < 0) {
+                        // this is a file in the root path
+                        list.put(url, files.get(url)); // Boolean value: this is a file
+                    } else {
+                        // this is a directory path
+                        String dir = path + file.substring(0, p + 1);
+                        Object c = list.get(dir);
+                        if (c == null) {
+                            list.put(dir, new AtomicInteger(1));
+                        } else if (c instanceof AtomicInteger) {
+                            ((AtomicInteger) c).incrementAndGet();
+                        }
+                    }
+                }
+
+                int maxcount = 1000;
+                int c = 0;
+                for (Map.Entry<String, Object> entry: list.entrySet()) {
+                    if (entry.getValue() instanceof Boolean) {
+                        // this is a file
+                        prop.put("files_list_" + c + "_type", 0);
+                        prop.put("files_list_" + c + "_type_file", entry.getKey());
+                        prop.put("files_list_" + c + "_type_stored", ((Boolean) entry.getValue()).booleanValue() ? 1 : 0);
+                    } else {
+                        // this is a folder
+                        prop.put("files_list_" + c + "_type", 1);
+                        prop.put("files_list_" + c + "_type_file", entry.getKey());
+                        prop.put("files_list_" + c + "_type_count", ((AtomicInteger) entry.getValue()).intValue());
+                    }
+                    if (++c >= maxcount) break;
+                }
-            } catch (final MalformedURLException e) {
-                prop.putHTML("result", "bad url: " + urlstring);
-                prop.put("urlhash", "");
+                prop.put("files_list", c);
+                prop.putHTML("files_path", path);
+                prop.put("files_hostsize", hostsize);
+                prop.put("files_subpathsize", storedDocs.size());
+                prop.put("files", 1);
+            } catch (Throwable e) {
+                Log.logException(e);
             }
-            prop.put("lurlexport", 0);
         }
 
         // insert constants
-        prop.putNum("ucount", segment.fulltext().size());
+        prop.putNum("ucount", fulltext.size());
         // return rewrite properties
         return prop;
     }
 
-    private static serverObjects genUrlProfile(final Segment segment, final URIMetadata entry, final String urlhash) {
-        final serverObjects prop = new serverObjects();
-        if (entry == null) {
-            prop.put("genUrlProfile", "1");
-            prop.put("genUrlProfile_urlhash", urlhash);
-            return prop;
-        }
-        final URIMetadata le = (entry.referrerHash() == null || entry.referrerHash().length != Word.commonHashLength) ? null : segment.fulltext().getMetadata(entry.referrerHash());
-        if (entry.url() == null) {
-            prop.put("genUrlProfile", "1");
-            prop.put("genUrlProfile_urlhash", urlhash);
-            return prop;
-        }
-        prop.put("genUrlProfile", "2");
-        prop.putHTML("genUrlProfile_urlNormalform", entry.url().toNormalform(false, true));
-        prop.put("genUrlProfile_urlhash", urlhash);
-        prop.put("genUrlProfile_urlDescr", entry.dc_title());
-        prop.put("genUrlProfile_moddate", entry.moddate().toString());
-        prop.put("genUrlProfile_loaddate", entry.loaddate().toString());
-        prop.put("genUrlProfile_referrer", (le == null) ? 0 : 1);
-        prop.putHTML("genUrlProfile_referrer_url", (le == null) ? "<unknown>" : le.url().toNormalform(false, true));
-        prop.put("genUrlProfile_referrer_hash", (le == null) ? "" : ASCII.String(le.hash()));
-        prop.put("genUrlProfile_doctype", String.valueOf(entry.doctype()));
-        prop.put("genUrlProfile_language", entry.language());
-        prop.put("genUrlProfile_size", entry.size());
-        prop.put("genUrlProfile_wordCount", entry.wordCount());
-        return prop;
-    }
-
 }
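The file listing streams its documents out of fulltext.getSolr().concurrentQuery(...), which fills a BlockingQueue in the background and terminates the stream with a sentinel object, AbstractSolrConnector.POISON_DOCUMENT, that the while loop detects by reference comparison. A minimal, self-contained sketch of this poison-pill pattern, with hypothetical names standing in for the YaCy classes:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PoisonPillSketch {
    // unique sentinel instance (stand-in for POISON_DOCUMENT); the explicit
    // constructor avoids the interned literal, so '!=' is a safe test
    private static final String POISON = new String("POISON");

    public static void main(String[] args) throws InterruptedException {
        BlockingQueue<String> queue = new ArrayBlockingQueue<String>(16);

        // producer: delivers results, then the pill to signal completion
        Thread producer = new Thread(() -> {
            try {
                for (int i = 0; i < 5; i++) queue.put("doc-" + i);
                queue.put(POISON);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        producer.start();

        // consumer: the same loop shape as the servlet's docs.take() loop
        String doc;
        while ((doc = queue.take()) != POISON) {
            System.out.println("got " + doc);
        }
    }
}

The advantage over collecting a complete result list first is that the servlet can process documents while Solr is still delivering them; the arguments 0, 100000, 60000 in the servlet's call are the offset, the maximum count, and (presumably) a timeout in milliseconds.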
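The browser's central data structure is the TreeMap assembled from the combined URL sets: a Boolean value marks a file (true if the document is stored in the index, false if it is only linked), and an AtomicInteger value marks a subfolder carrying the number of documents found beneath it; the TreeMap keeps the listing sorted. The same aggregation logic, distilled into a runnable sketch over hypothetical sample URLs:

import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

public class FileFolderSketch {
    public static void main(String[] args) {
        String path = "http://example.org/docs/";
        // url -> stored in the index (true) or merely linked (false)
        Map<String, Boolean> files = Map.of(
                "http://example.org/docs/a.html", true,
                "http://example.org/docs/b.html", false,
                "http://example.org/docs/sub/c.html", true,
                "http://example.org/docs/sub/d.html", true);

        // Boolean value = file, AtomicInteger value = folder with a count
        Map<String, Object> list = new TreeMap<String, Object>();
        for (String url : files.keySet()) {
            String file = url.substring(path.length());
            int p = file.indexOf('/');
            if (p < 0) {
                list.put(url, files.get(url)); // file directly in this path
            } else {
                // fold everything below the first '/' into one folder entry
                String dir = path + file.substring(0, p + 1);
                Object c = list.get(dir);
                if (c == null) list.put(dir, new AtomicInteger(1));
                else ((AtomicInteger) c).incrementAndGet();
            }
        }
        // prints, abbreviated: ...a.html -> true, ...b.html -> false, ...sub/ -> 2
        list.forEach((k, v) -> System.out.println(k + " -> " + v));
    }
}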