From c0d9a3e9a782607acbe90f70abb5c09eb83fea41 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Fri, 11 Dec 2020 00:50:52 +0100
Subject: [PATCH] turned HostBrowser into an admin-only page, now called
 IndexBrowser

This was required because spiders and bots crawled through this page and
created load on the peer without benefit to the user or the YaCy network.
---
 defaults/solr.collection.schema | 2 +-
 defaults/yacy.init | 4 +-
 htroot/ConfigSearchPage_p.html | 4 +-
 htroot/ConfigSearchPage_p.java | 6 +-
 htroot/HostBrowserAdmin_p.html | 8 -
 .../{HostBrowser.html => IndexBrowser_p.html} | 151 ++++-----
 .../{HostBrowser.java => IndexBrowser_p.java} | 300 ++++++++----------
 .../{HostBrowser.xml => IndexBrowser_p.xml} | 4 +-
 htroot/ViewFile.html | 2 +-
 htroot/env/templates/header.template | 2 +-
 .../env/templates/simpleSearchHeader.template | 1 -
 htroot/env/templates/simpleheader.template | 1 -
 .../templates/submenuWebStructure.template | 2 +-
 htroot/robots.txt | 2 +-
 htroot/yacysearchitem.html | 2 +-
 htroot/yacysearchitem.java | 4 +-
 locales/de.lng | 6 +-
 locales/es.lng | 6 +-
 locales/fr.lng | 6 +-
 locales/it.lng | 6 +-
 locales/ja.lng | 6 +-
 locales/master.lng.xlf | 7 +-
 locales/ru.lng | 6 +-
 locales/zh.lng | 6 +-
 .../yacy/search/schema/CollectionSchema.java | 2 +-
 25 files changed, 223 insertions(+), 323 deletions(-)
 delete mode 100644 htroot/HostBrowserAdmin_p.html
 rename htroot/{HostBrowser.html => IndexBrowser_p.html} (68%)
 rename htroot/{HostBrowser.java => IndexBrowser_p.java} (78%)
 rename htroot/{HostBrowser.xml => IndexBrowser_p.xml} (95%)

diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index 14cd15268..89b4dd45c 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -51,7 +51,7 @@ url_file_ext_s
 ## either the second level domain or, if a ccSLD is used, the third level domain. Needed to search in the url
 host_organization_s
 
-## internal links, only the protocol. Needed for HostBrowser
+## internal links, only the protocol. Needed for IndexBrowser
 inboundlinks_protocol_sxt
 
 ## internal links, the url only without the protocol.
For correct assembly of inboundlinks inboundlinks_protocol_sxt + inboundlinks_urlstub_sxt is needed diff --git a/defaults/yacy.init b/defaults/yacy.init index e78583475..f6330b1e5 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -939,7 +939,7 @@ search.result.show.citation = true search.result.show.pictures = false search.result.show.cache = true search.result.show.proxy = false -search.result.show.hostbrowser = true +search.result.show.indexbrowser = true search.result.show.vocabulary = false # Set of comma separated vocabulary names not to be used as search results facets search.result.show.vocabulary.omit = @@ -1142,7 +1142,7 @@ svnRevision=0 currentSkin=pdblue # flag to show if pages shall be usable for non-admin users -# this can be applied to the Surftips.html, yacysearch.html and HostBrowser.html pages +# this can be applied to the Surftips.html, yacysearch.html and IndexBrowser_p.html pages publicSurftips = true publicSearchpage = true diff --git a/htroot/ConfigSearchPage_p.html b/htroot/ConfigSearchPage_p.html index 4d591002a..00e8fef59 100644 --- a/htroot/ConfigSearchPage_p.html +++ b/htroot/ConfigSearchPage_p.html @@ -294,7 +294,7 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=  | Pictures  | Cache  | View via Proxy -  | Browse index +  | Browse index  | JPG Snapshot #(search.result.show.ranking)#:: | Ranking: 1.12195955E9#(/search.result.show.ranking)# @@ -314,7 +314,7 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows= menu: System Administration > Advanced Settings - + #(search.result.show.ranking)#:: info diff --git a/htroot/ConfigSearchPage_p.java b/htroot/ConfigSearchPage_p.java index be7e58660..09967e60f 100644 --- a/htroot/ConfigSearchPage_p.java +++ b/htroot/ConfigSearchPage_p.java @@ -97,7 +97,7 @@ public class ConfigSearchPage_p { sb.setConfig("search.result.show.pictures", post.getBoolean("search.result.show.pictures")); sb.setConfig("search.result.show.cache", post.getBoolean("search.result.show.cache")); sb.setConfig("search.result.show.proxy", post.getBoolean("search.result.show.proxy")); - sb.setConfig("search.result.show.hostbrowser", post.getBoolean("search.result.show.hostbrowser")); + sb.setConfig("search.result.show.indexbrowser", post.getBoolean("search.result.show.indexbrowser")); sb.setConfig("search.result.show.snapshots", post.getBoolean("search.result.show.snapshots")); // construct navigation String @@ -187,7 +187,7 @@ public class ConfigSearchPage_p { sb.setConfig("search.result.show.pictures", config.getProperty("search.result.show.pictures","false")); sb.setConfig("search.result.show.cache", config.getProperty("search.result.show.cache","true")); sb.setConfig("search.result.show.proxy", config.getProperty("search.result.show.proxy","false")); - sb.setConfig("search.result.show.hostbrowser", config.getProperty("search.result.show.hostbrowser","true")); + sb.setConfig("search.result.show.indexbrowser", config.getProperty("search.result.show.indexbrowser","true")); sb.setConfig("search.result.show.snapshots", config.getProperty("search.result.show.snapshots","true")); sb.setConfig(SwitchboardConstants.SEARCH_NAVIGATION_MAXCOUNT, config.getProperty(SwitchboardConstants.SEARCH_NAVIGATION_MAXCOUNT, @@ -247,7 +247,7 @@ public class ConfigSearchPage_p { prop.put("search.result.show.pictures", sb.getConfigBool("search.result.show.pictures", false) ? 1 : 0); prop.put("search.result.show.cache", sb.getConfigBool("search.result.show.cache", false) ? 
1 : 0); prop.put("search.result.show.proxy", sb.getConfigBool("search.result.show.proxy", false) ? 1 : 0); - prop.put("search.result.show.hostbrowser", sb.getConfigBool("search.result.show.hostbrowser", false) ? 1 : 0); + prop.put("search.result.show.indexbrowser", sb.getConfigBool("search.result.show.indexbrowser", false) ? 1 : 0); prop.put("search.result.show.snapshots", sb.getConfigBool("search.result.show.snapshots", false) ? 1 : 0); prop.put("search.result.show.ranking", sb.getConfigBool(SwitchboardConstants.SEARCH_RESULT_SHOW_RANKING, SwitchboardConstants.SEARCH_RESULT_SHOW_RANKING_DEFAULT) ? 1 : 0); diff --git a/htroot/HostBrowserAdmin_p.html b/htroot/HostBrowserAdmin_p.html deleted file mode 100644 index 069480dbd..000000000 --- a/htroot/HostBrowserAdmin_p.html +++ /dev/null @@ -1,8 +0,0 @@ - - #(hosts)#:: -
-
Administration Options -
Delete all Load Errors from index
-
-
- #(/hosts)# diff --git a/htroot/HostBrowser.html b/htroot/IndexBrowser_p.html similarity index 68% rename from htroot/HostBrowser.html rename to htroot/IndexBrowser_p.html index dd540b19c..237fb5310 100644 --- a/htroot/HostBrowser.html +++ b/htroot/IndexBrowser_p.html @@ -47,7 +47,7 @@ function updatepage(str) { html += ""; for (var i = 0; i < firstChannel.items.length; i++) { item = firstChannel.items[i]; - html += "" + item.link + "<\/a><\/td>"; + html += "" + item.link + "<\/a><\/td>"; } html += "<\/table>"; } @@ -60,27 +60,12 @@ function updatepage(str) { - #(topmenu)# - - #%env/templates/embeddedheader.template%# - :: - #%env/templates/simpleheader.template%# - - :: - #%env/templates/header.template%# + #%env/templates/header.template%# #%env/templates/submenuWebStructure.template%# - #(/topmenu)#

Index Browser

-

Browse the index of #[ucount]# documents. Enter a host or an URL for a file list or view a list of all hosts#(authorized)#::, only hosts with urls pending in the crawler or only with load errors#(/authorized)#.

- #[result]# - + #(hosts)#::
Host List
    #{list}#
  1. - +
    #[count]##(crawler)#::/#[pending]##(/crawler)##(errors)#::/#[exclcount]#/#[failcount]##(/errors)# URLs
  2. #{/list}# -
+
Count Colors:    Documents without Errors @@ -117,7 +102,7 @@ function updatepage(str) { #(/authorized)#
- + @@ -125,100 +110,100 @@ function updatepage(str) {
#(/hosts)# - + #(hostanalysis)#::
Host Analysis #{facets}# - + #{facet}# - + #{/facet}#
#[facetname]###
#[key]##[count]##[count]#
   #{/facets}#
#(/hostanalysis)# - + #(files)#::
Browser for #[path]# -

documents stored for host: #[hostsize]#; documents stored for subpath: #[subpathloadsize]#; unloaded documents detected in subpath: #[subpathdetectedsize]# +

documents stored for host: #[hostsize]#; documents stored for subpath: #[subpathloadsize]#; unloaded documents detected in subpath: #[subpathdetectedsize]#

- - - - - - - - - + + + + + + + + + #(root)# - + :: #(/root)# #{list}# #(type)# - + #(stored)# - #(load)#:::: + #(load)#:::: :: - :: - + :: + #(/stored)# :: - + @@ -231,7 +216,7 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
Pathstoredlinkedpendingexcludedfailed
Pathstoredlinkedpendingexcludedfailed
....
Show Metadata
#[url]#link, detected from contextload & index#(/load)#link, detected from contextload & index#(/load)#indexed#[comment]#loading#[error]#loading#[error]#
Directory#[url]##[url]# #[stored]# #[linked]# #[pending]#
- #(linkgraph)#
:: + #(linkgraph)#
::
@@ -246,38 +231,38 @@ var solr= $.getJSON("solr/collection1/select?q=*:*&defType=edismax&start=0&rows=
    #{list}#
  1. - +
    #[count]# URLs
  2. #{/list}#
- #(admin)#::


- #(/admin)# #(/outbound)# #(inbound)#::
Inbound Links, incoming to #[host]# - Host List
    - #{list}# -
  1. - -
    #[count]# URLs
    -
  2. + #{list}# +
  3. + +
    #[count]# URLs
    +
  4. #{/list}#
#(/inbound)# - #(authorized)#:: - #(admin)#:: - #%HostBrowserAdmin_p.html%# - #(/admin)# - #(/authorized)# - + #(hosts)#:: +
+
Administration Options +
Delete all Load Errors from index
+
+
+ #(/hosts)# + #%env/templates/footer.template%# diff --git a/htroot/HostBrowser.java b/htroot/IndexBrowser_p.java similarity index 78% rename from htroot/HostBrowser.java rename to htroot/IndexBrowser_p.java index 99c27812e..e0135a469 100644 --- a/htroot/HostBrowser.java +++ b/htroot/IndexBrowser_p.java @@ -1,5 +1,5 @@ /** - * HostBrowser + * IndexBrowser * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany * First released 27.09.2012 at http://yacy.net * @@ -71,14 +71,14 @@ import net.yacy.server.serverSwitch; /** * Browser for indexed resources */ -public class HostBrowser { - +public class IndexBrowser_p { + final static long TIMEOUT = 10000L; - + public static enum StoreType { LINK, INDEX, EXCLUDED, FAILED, RELOAD; } - + /** *

Retrieve local index entries for a path, or for hosts with the most references. Also allow some maintenance operations on entries with load errors.

*

Some parameters need administrator authentication or unauthenticated local host requests to be allowed : load, deleteLoadErrors, delete, reload404, @@ -87,36 +87,34 @@ public class HostBrowser { *

* Configuration settings : *

    - *
  • browser.autoload : allow the administrator to stack URLs to the local crawl queue, manually with the "load" parameter, - * or automatically when the "path" parameter is filled with an unknown URL
  • + *
  • browser.autoload : allow the administrator to stack URLs to the local crawl queue, manually with the "load" parameter, + * or automatically when the "path" parameter is filled with an unknown URL
  • *
  • browser.load4everyone : allow everyone to stack URLs to the local crawl queue. - * "browser.autoload" has also to be set to true to enable automatic loading on an unknown path
  • + * "browser.autoload" has also to be set to true to enable automatic loading on an unknown path *
  • publicSearchpage : set to false to restrict use of this servlet to authenticated administrator only
  • - *
  • publicTopmenu : set to false to hide the top navigation bar to non authenticated users
  • *
  • decoration.hostanalysis : add supplementary hosts information for debug/analysis purpose
  • *
  • decoration.grafics.linkstructure : display a link structure graph when the path parameter is filled
  • *
*

* @param header servlet request header * @param post request parameters. Supported keys :
    - *
  • admin : when "true", display in the html page render the administration context (menu and top navbar)
  • - *
  • path : root URL or host name to browse (ignored when the hosts parameter is filled). When not yet locally indexed, this URL can be automatically crawled and indexed - * when "browser.autoload" or "browser.load4everyone" configuration settings are set to true.
  • - *
  • load : URL to crawl and index.
  • - *
  • deleteLoadErrors : delete from the local index documents with load error (HTTP status different from 200 or any other failure).
  • - *
  • hosts : generate hosts with most references list. Supported values : - *
      - *
    • "crawling" : restrict to host currently crawled
    • - *
    • "error" : restrict to hosts with having at least one resource load error
    • - *
    - *
  • - *
  • delete : delete from the index whole documents tree matching the path prefix
  • - *
  • reload404 : reload documents matching the path prefix and which previously failed to load due to a network error
  • - *
  • facetcount :
  • - *
  • complete : we want only root paths for complete lists
  • - *
  • nepr :
  • - *
  • showlinkstructure : when present, display a link graph for path
  • - *
+ *
  • path : root URL or host name to browse (ignored when the hosts parameter is filled). When not yet locally indexed, this URL can be automatically crawled and indexed + * when "browser.autoload" or "browser.load4everyone" configuration settings are set to true.
  • + *
  • load : URL to crawl and index.
  • + *
  • deleteLoadErrors : delete from the local index documents with load error (HTTP status different from 200 or any other failure).
  • + *
  • hosts : generate hosts with most references list. Supported values : + *
      + *
    • "crawling" : restrict to host currently crawled
    • + *
    • "error" : restrict to hosts with having at least one resource load error
    • + *
    + *
  • + *
  • delete : delete from the index whole documents tree matching the path prefix
  • + *
  • reload404 : reload documents matching the path prefix and which previously failed to load due to a network error
  • + *
  • facetcount :
  • + *
  • complete : we want only root paths for complete lists
  • + *
  • nepr :
  • + *
  • showlinkstructure : when present, display a link graph for path
  • + * * @param env server environment * @return the servlet answer object */ @@ -125,53 +123,23 @@ public class HostBrowser { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; Fulltext fulltext = sb.index.fulltext(); - final boolean authorized = sb.verifyAuthentication(header); - final boolean autoload = authorized && sb.getConfigBool("browser.autoload", true); + final boolean autoload = sb.getConfigBool("browser.autoload", true); final boolean load4everyone = sb.getConfigBool("browser.load4everyone", false); final boolean loadRight = autoload || load4everyone; // add config later - final boolean searchAllowed = sb.getConfigBool(SwitchboardConstants.PUBLIC_SEARCHPAGE, true) || authorized; final serverObjects prop = new serverObjects(); - + // set default values prop.put("path", ""); prop.put("result", ""); prop.put("hosts", 0); prop.put("files", 0); prop.put("hostanalysis", 0); - - prop.put("admin", "false"); - boolean admin = false; String referer = header.get("Referer", ""); - if ((post != null && post.getBoolean("admin")) || referer.contains("HostBrowser.html?admin=true")) { - prop.put("topmenu", 2); - prop.put("admin", "true"); - admin = true; - } else if (authorized) { // show top nav to admins - prop.put("topmenu", 1); - } else { // for other respect setting in Search Design Configuration - prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0); - } - final String promoteSearchPageGreeting = - (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) ? - env.getConfig("network.unit.description", "") : - env.getConfig(SwitchboardConstants.GREETING, ""); - prop.put("topmenu_promoteSearchPageGreeting", promoteSearchPageGreeting); - - if (!searchAllowed) { - prop.put("result", "You are not allowed to use this page. Please ask an administrator for permission."); - prop.putNum("ucount", 0); - return prop; - } - - if(authorized) { - /* Fill the "admin" parameter for authorized links */ - prop.put("authorized_admin", Boolean.toString(admin)); - } String path = post == null ? "" : post.get("path", "").trim(); - if (authorized) sb.index.fulltext().commit(true); + sb.index.fulltext().commit(true); if (post == null || env == null) { prop.putNum("ucount", fulltext.collectionSize()); return prop; @@ -186,8 +154,8 @@ public class HostBrowser { !path.startsWith("smb://") && !path.startsWith("file://"))) { path = "http://" + path; } prop.putHTML("path", path); - prop.put("delete", authorized && path.length() > 0 ? 1 : 0); - + prop.put("delete", path.length() > 0 ? 
1 : 0); + DigestURL pathURI = null; try {pathURI = new DigestURL(path);} catch (final MalformedURLException e) {} @@ -231,61 +199,60 @@ public class HostBrowser { } } - if (authorized && post.containsKey("deleteLoadErrors")) { + if (post.containsKey("deleteLoadErrors")) { try { fulltext.getDefaultConnector().deleteByQuery("-" + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM); // make sure field exists - ConcurrentLog.info ("HostBrowser:", "delete documents with httpstatus_i <> 200"); + ConcurrentLog.info ("IndexBrowser_p:", "delete documents with httpstatus_i <> 200"); fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.fail.name() + "\"" ); - ConcurrentLog.info ("HostBrowser:", "delete documents with failtype_s = fail"); + ConcurrentLog.info ("IndexBrowser_p:", "delete documents with failtype_s = fail"); fulltext.getDefaultConnector().deleteByQuery(CollectionSchema.failtype_s.getSolrFieldName() + ":\"" + FailType.excl.name() + "\"" ); - ConcurrentLog.info ("HostBrowser:", "delete documents with failtype_s = excl"); + ConcurrentLog.info ("IndexBrowser_p:", "delete documents with failtype_s = excl"); prop.putNum("ucount", fulltext.collectionSize()); return prop; } catch (final IOException ex) { ConcurrentLog.logException(ex); } } - + if (post.containsKey("hosts")) { // generate host list try { boolean onlyCrawling = "crawling".equals(post.get("hosts", "")); boolean onlyErrors = "error".equals(post.get("hosts", "")); - - int maxcount = authorized ? 2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // which makes nice matrixes for 2, 3, 4, 5, 6, 7, 8, 9 rows/colums - + + int maxcount = 2 * 3 * 2 * 5 * 7 * 2 * 3; // which makes nice matrixes for 2, 3, 4, 5, 6, 7, 8, 9 rows/colums + // collect hosts from index ReversibleScoreMap hostscore = fulltext.getDefaultConnector().getFacets(CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", maxcount, CollectionSchema.host_s.getSolrFieldName()).get(CollectionSchema.host_s.getSolrFieldName()); if (hostscore == null) hostscore = new ClusteredScoreMap(true); - + // collect hosts from crawler - final Map crawler = (authorized) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap(); + final Map crawler = sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots); final Map hostNameToPendingCount = new HashMap<>(); for(EntrycrawlerEntry: crawler.entrySet()) { /* The local stack returns keys composed of "hostname:port" : we now sum pending URLs counts by host name */ - String hostName = Domains.stripToHostName(crawlerEntry.getKey()); - Integer pendingCount = hostNameToPendingCount.get(hostName); - if(pendingCount == null) { - pendingCount = 0; - } - pendingCount += crawlerEntry.getValue()[0]; - hostNameToPendingCount.put(hostName, pendingCount); + String hostName = Domains.stripToHostName(crawlerEntry.getKey()); + Integer pendingCount = hostNameToPendingCount.get(hostName); + if(pendingCount == null) { + pendingCount = 0; + } + pendingCount += crawlerEntry.getValue()[0]; + hostNameToPendingCount.put(hostName, pendingCount); } - + // collect the errorurls - Map> exclfacets = authorized ? 
fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null; + Map> exclfacets = fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()); ReversibleScoreMap exclscore = exclfacets == null ? new ClusteredScoreMap(true) : exclfacets.get(CollectionSchema.host_s.getSolrFieldName()); - Map> failfacets = authorized ? fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.fail.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null; + Map> failfacets = fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.fail.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()); ReversibleScoreMap failscore = failfacets == null ? new ClusteredScoreMap(true) : failfacets.get(CollectionSchema.host_s.getSolrFieldName()); - + int c = 0; Iterator i = hostscore.keys(false); String host; while (i.hasNext() && c < maxcount) { host = i.next(); - prop.put("hosts_list_" + c + "_admin", admin ? "true" : "false"); prop.putHTML("hosts_list_" + c + "_host", host); boolean inCrawler = hostNameToPendingCount.containsKey(host); int exclcount = exclscore.get(host); @@ -294,7 +261,7 @@ public class HostBrowser { prop.put("hosts_list_" + c + "_count", hostscore.get(host)); prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0); if (inCrawler) { - prop.put("hosts_list_" + c + "_crawler_pending", hostNameToPendingCount.get(host)); + prop.put("hosts_list_" + c + "_crawler_pending", hostNameToPendingCount.get(host)); } prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0); if (errors > 0) { @@ -311,18 +278,18 @@ public class HostBrowser { } } prop.put("hosts_list", c); - prop.put("hosts_authorized", authorized ? 1 : 0); + prop.put("hosts_authorized", 1); prop.put("hosts", 1); } catch (final IOException e) { ConcurrentLog.logException(e); } } - + if (path.length() > 0) { try { DigestURL uri = new DigestURL(path); String host = uri.getHost(); - + // write host analysis if path after host is empty if (uri.getPath().length() <= 1 && host != null && host.length() > 0 && sb.getConfigBool("decoration.hostanalysis", false)) { //how many documents per crawldepth_i; get crawldepth_i facet for host @@ -364,27 +331,25 @@ public class HostBrowser { prop.put("hostanalysis_facets", fc); prop.put("hostanalysis", 1); } - - + // write file list for subpath boolean delete = false; boolean reload404 = false; - if (authorized && post.containsKey("delete")) { + if (post.containsKey("delete")) { // delete the complete path!! That includes everything that matches with this prefix. delete = true; } - if (authorized && post.containsKey("reload404")) { + if (post.containsKey("reload404")) { // try to re-load all urls that have load errors and matches with this prefix. reload404 = true; } - int facetcount=post.getInt("facetcount", 0); + int facetcount = post.getInt("facetcount", 0); boolean complete = post.getBoolean("complete"); if (complete) { // we want only root paths for complete lists p = path.indexOf('/', 10); if (p > 0) path = path.substring(0, p + 1); } prop.put("files_complete", complete ? 1 : 0); - prop.put("files_complete_admin", admin ? 
"true" : "false"); prop.putHTML("files_complete_path", path); p = path.substring(0, path.length() - 1).lastIndexOf('/'); if (p < 8) { @@ -392,15 +357,13 @@ public class HostBrowser { } else { prop.put("files_root", 0); prop.putHTML("files_root_path", path.substring(0, p + 1)); - prop.put("files_root_admin", admin ? "true" : "false"); } // generate file list from path prop.putHTML("outbound_host", host); - if (authorized) prop.putHTML("outbound_admin_host", host); //used for WebStructurePicture_p link prop.putHTML("inbound_host", host); String hosthash = uri.hosthash(); String[] pathparts = uri.getPaths(); - + // get all files for a specific host from the index StringBuilder q = new StringBuilder(); if (host == null) { @@ -438,7 +401,7 @@ public class HostBrowser { CollectionSchema.references_external_i.getSolrFieldName(), CollectionSchema.references_exthosts_i.getSolrFieldName(), CollectionSchema.cr_host_chance_d.getSolrFieldName(), - CollectionSchema.cr_host_norm_i.getSolrFieldName() + CollectionSchema.cr_host_norm_i.getSolrFieldName() )); solrQueryTask.start(); Set storedDocs = new HashSet(); @@ -455,68 +418,68 @@ public class HostBrowser { long timeoutReferences = System.currentTimeMillis() + 6000; ReferenceReportCache rrCache = sb.index.getReferenceReportCache(); try { - SolrDocument doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS); - while (doc != AbstractSolrConnector.POISON_DOCUMENT && doc != null) { - String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); - String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); - FailType error = errortype == null ? null : FailType.valueOf(errortype); - String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); - infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences)); - if (u.startsWith(path)) { - if (delete) { - deleteIDs.add(ids); - } else { - if (error == null) storedDocs.add(u); else { - if (reload404 && error == FailType.fail) { - ArrayList collections = (ArrayList) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName()); - if (collections != null) reloadURLCollection.addAll(collections); - reloadURLs.add(u); - } - if (authorized) errorDocs.put(u, error); - } - } - } else if (complete) { - if (error == null) storedDocs.add(u); else { - if (authorized) errorDocs.put(u, error); - } - } - if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link - if (error == null) { - hostsize++; - // collect inboundlinks to browse the host - Iterator links = URIMetadataNode.getLinks(doc, true); - while (links.hasNext()) { - u = links.next(); - if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); - } - - // collect referrer links - links = URIMetadataNode.getLinks(doc, false); - while (links.hasNext()) { - u = links.next(); - try { - MultiProtocolURL mu = new MultiProtocolURL(u); - if (mu.getHost() != null) { - ReversibleScoreMap lks = outboundHosts.get(mu.getHost()); - if (lks == null) { - lks = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); - outboundHosts.put(mu.getHost(), lks); - } - lks.set(u, u.length()); - } - } catch (final MalformedURLException e) {} - } - } - - remainingTime = timeoutList - System.currentTimeMillis(); - if (remainingTime <= 0) { - break; - } - doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS); - } + SolrDocument doc = docs.poll(remainingTime, 
TimeUnit.MILLISECONDS); + while (doc != AbstractSolrConnector.POISON_DOCUMENT && doc != null) { + String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); + String errortype = (String) doc.getFieldValue(CollectionSchema.failtype_s.getSolrFieldName()); + FailType error = errortype == null ? null : FailType.valueOf(errortype); + String ids = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); + infoCache.put(ids, new InfoCacheEntry(sb.index.fulltext(), rrCache, doc, ids, System.currentTimeMillis() < timeoutReferences)); + if (u.startsWith(path)) { + if (delete) { + deleteIDs.add(ids); + } else { + if (error == null) storedDocs.add(u); else { + if (reload404 && error == FailType.fail) { + ArrayList collections = (ArrayList) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName()); + if (collections != null) reloadURLCollection.addAll(collections); + reloadURLs.add(u); + } + errorDocs.put(u, error); + } + } + } else if (complete) { + if (error == null) storedDocs.add(u); else { + errorDocs.put(u, error); + } + } + if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link + if (error == null) { + hostsize++; + // collect inboundlinks to browse the host + Iterator links = URIMetadataNode.getLinks(doc, true); + while (links.hasNext()) { + u = links.next(); + if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); + } + + // collect referrer links + links = URIMetadataNode.getLinks(doc, false); + while (links.hasNext()) { + u = links.next(); + try { + MultiProtocolURL mu = new MultiProtocolURL(u); + if (mu.getHost() != null) { + ReversibleScoreMap lks = outboundHosts.get(mu.getHost()); + if (lks == null) { + lks = new ClusteredScoreMap(UTF8.insensitiveUTF8Comparator); + outboundHosts.put(mu.getHost(), lks); + } + lks.set(u, u.length()); + } + } catch (final MalformedURLException e) {} + } + } + + remainingTime = timeoutList - System.currentTimeMillis(); + if (remainingTime <= 0) { + break; + } + doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS); + } } finally { - /* Ensure termination and proper resources release of the query thread */ - solrQueryTask.interrupt(); + /* Ensure termination and proper resources release of the query thread */ + solrQueryTask.interrupt(); } if (deleteIDs.size() > 0) sb.remove(deleteIDs); if (reloadURLs.size() > 0) { @@ -524,19 +487,19 @@ public class HostBrowser { for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern); sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false); } - + // collect from crawler - List domainStackReferences = (authorized) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList(0); + List domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000); Set loadingLinks = new HashSet(); for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true)); - + // now combine all lists into one Map files = new HashMap(); for (String u: storedDocs) files.put(u, StoreType.INDEX); for (Map.Entry e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? 
StoreType.FAILED : StoreType.EXCLUDED); for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK); for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK); - ConcurrentLog.info("HostBrowser", "collected " + files.size() + " urls for path " + path); + ConcurrentLog.info("IndexBrowser_p", "collected " + files.size() + " urls for path " + path); // distinguish files and folders Map list = new TreeMap(); // a directory list; if object is boolean, its a file; if its a int[], then its a folder @@ -575,7 +538,7 @@ public class HostBrowser { } } } - + int maxcount = 1000; int c = 0; // first list only folders @@ -587,7 +550,6 @@ public class HostBrowser { // this is a folder prop.put("files_list_" + c + "_type", 1); prop.putHTML("files_list_" + c + "_type_url", entry.getKey()); - prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false"); int linked = ((int[]) entry.getValue())[0]; int stored = ((int[]) entry.getValue())[1]; int crawler = ((int[]) entry.getValue())[2]; @@ -610,7 +572,6 @@ public class HostBrowser { // this is a file prop.put("files_list_" + c + "_type", 0); prop.putHTML("files_list_" + c + "_type_url", entry.getKey()); - prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false"); StoreType type = (StoreType) entry.getValue(); try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;} HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash()); // todo: cannot identify errors @@ -640,7 +601,6 @@ public class HostBrowser { if (loadRight) { prop.putHTML("files_list_" + c + "_type_stored_load_url", entry.getKey()); prop.putHTML("files_list_" + c + "_type_stored_load_path", path); - prop.putHTML("files_list_" + c + "_type_stored_load_admin", Boolean.toString(admin)); } if (++c >= maxcount) break; } @@ -669,7 +629,6 @@ public class HostBrowser { Iterator i = score.keys(false); while (i.hasNext() && c < maxcount) { host = i.next(); - prop.put("inbound_list_" + c + "_admin", admin ? "true" : "false"); prop.putHTML("inbound_list_" + c + "_host", sb.webStructure.hostHash2hostName(host)); prop.put("inbound_list_" + c + "_count", score.get(host)); c++; @@ -679,7 +638,7 @@ public class HostBrowser { } else { prop.put("inbound", 0); } - + // generate outbound-links table if (outboundHosts.size() > 0) { maxcount = 200; @@ -692,7 +651,6 @@ public class HostBrowser { prop.putHTML("outbound_list_" + c + "_host", host); prop.put("outbound_list_" + c + "_count", score.get(host)); prop.put("outbound_list_" + c + "_link", outboundHosts.get(host).getMinKey()); - prop.put("outbound_list_" + c + "_admin", admin ? "true" : "false"); c++; } prop.put("outbound_list", c); @@ -700,7 +658,7 @@ public class HostBrowser { } else { prop.put("outbound", 0); } - + } catch (final Throwable e) { ConcurrentLog.logException(e); } diff --git a/htroot/HostBrowser.xml b/htroot/IndexBrowser_p.xml similarity index 95% rename from htroot/HostBrowser.xml rename to htroot/IndexBrowser_p.xml index e56fa856a..5be6b5671 100644 --- a/htroot/HostBrowser.xml +++ b/htroot/IndexBrowser_p.xml @@ -1,5 +1,5 @@ - + #(hosts)#:: #{list}# @@ -29,4 +29,4 @@ #{/list}# #(/inbound)# - \ No newline at end of file + \ No newline at end of file diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index cb46b1ca0..cbc0a0f5d 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -84,7 +84,7 @@ function updatepage(str) {
    - #(moar)#::#(/moar)# + #(moar)#::#(/moar)#
    #(moar)#:: diff --git a/htroot/env/templates/header.template b/htroot/env/templates/header.template index b9d823b4c..2ce0e2d3a 100644 --- a/htroot/env/templates/header.template +++ b/htroot/env/templates/header.template @@ -191,7 +191,7 @@
  • Monitoring

  • System Status
  • #(navigation-p2p)#::
  • Peer-to-Peer Network
  • #(/navigation-p2p)# - #(navigation-crawlmonitor)#::
  • Index Browser
  • #(/navigation-crawlmonitor)# + #(navigation-crawlmonitor)#::
  • Index Browser
  • #(/navigation-crawlmonitor)#
  • Network Access
  • #(navigation-crawlmonitor)#::
  • Crawler Monitor
  • #(/navigation-crawlmonitor)# diff --git a/htroot/env/templates/simpleSearchHeader.template b/htroot/env/templates/simpleSearchHeader.template index 65793ea27..91a86b3c3 100644 --- a/htroot/env/templates/simpleSearchHeader.template +++ b/htroot/env/templates/simpleSearchHeader.template @@ -40,7 +40,6 @@
  • Web Search
  • File Search
  • Compare Search
  • -
  • Index Browser
  • URL Viewer
  • diff --git a/htroot/env/templates/simpleheader.template b/htroot/env/templates/simpleheader.template index ad7adfbd9..1544df208 100644 --- a/htroot/env/templates/simpleheader.template +++ b/htroot/env/templates/simpleheader.template @@ -20,7 +20,6 @@
  • Web Search
  • File Search
  • Compare Search
  • -
  • Index Browser
  • URL Viewer
  • diff --git a/htroot/env/templates/submenuWebStructure.template b/htroot/env/templates/submenuWebStructure.template index 71ca72893..5f15a2ebd 100644 --- a/htroot/env/templates/submenuWebStructure.template +++ b/htroot/env/templates/submenuWebStructure.template @@ -1,7 +1,7 @@