Introduced more structure in HostBrowser: table view, better counting,

and a distinction between error cases (fail/excluded)
pull/1/head
Michael Peter Christen 12 years ago
parent efd2c4622d
commit bf42179982

@ -55,6 +55,7 @@ function updatepage(str) {
}
//]]>
</script>
<script type="text/javascript" src="/js/sorttable.js"></script>
</head>
<body id="IndexControl">
#%env/templates/simpleheader.template%#
@ -77,51 +78,59 @@ function updatepage(str) {
#{list}#
<div style="float:left; padding:1px 5px 1px 5px;">
<div style="width:180px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><div id="info"><img src="/env/grafics/#(type)#invisible.gif::burn-e.gif::construction.gif#(/type)#" align="left" width="12" height="8">&nbsp;<a href="/HostBrowser.html?path=#[host]#&facetcount=#[count]#">#[host]#</a></div></div>
<div style="width:100px; text-align:right; float: left; white-space:nowrap; overflow:hidden;"><span class="commit">#[count]#</span>#(crawler)#::/<span class="pending">#[pending]#</span>#(/crawler)##(errors)#::/<span class="error">#[count]#</span>#(/errors)# URLs</div>
<div style="width:120px; text-align:right; float: left; white-space:nowrap; overflow:hidden;"><span class="commit">#[count]#</span>#(crawler)#::/<span class="pending">#[pending]#</span>#(/crawler)##(errors)#::/<span class="error">#[exclcount]#/#[failcount]#</span>#(/errors)# URLs</div>
</div>
#{/list}#
<div style="clear:both; float:left; padding:10px 5px 1px 5px;">
<div style="float:left;clear:both;">Count Colors:</div>
<div class="commit" style="float:left;">&nbsp;&nbsp;&nbsp;Documents without Errors</div>
<div class="pending" style="float:left;">&nbsp;&nbsp;&nbsp;Pending in Crawler</div>
<div class="error" style="float:left;">&nbsp;&nbsp;&nbsp;Load Errors</div>
<div class="error" style="float:left;">&nbsp;&nbsp;&nbsp;Load Errors (exclusion/failure)</div>
</div>
</fieldset>
#(/hosts)#
#(files)#::
<fieldset><legend>Browser for #[path]#</legend>
<p>Documents on host: #[hostsize]#; Documents in subpath: #[subpathsize]# <!-- #(complete)#;<a href="/HostBrowser.html?complete=true&path=#[path]#">get complete list</a>::<a href="/HostBrowser.html?path=#[path]#">directory view</a>#(/complete)#-->
<p>documents stored for host: #[hostsize]#; documents stored for subpath: #[subpathloadsize]#; unloaded documents detected in subpath: #[subpathdetectedsize]# <!-- #(complete)#;<a href="/HostBrowser.html?complete=true&path=#[path]#">get complete list</a>::<a href="/HostBrowser.html?path=#[path]#">directory view</a>#(/complete)#-->
</p>
<table border="0" cellpadding="2" cellspacing="2" style="float:left">
<table class="sortable" border="0" cellpadding="2" cellspacing="2" style="float:left">
<tr>
<th align="center" width="32"></th>
<th align="left" width="600" class="listing">Path</th>
<th align="right" width="80" class="listing"></th>
<td align="right" class="listingem">stored</td>
<td align="right" class="listingem">linked</td>
<td align="right" class="listingem">pending</td>
<td align="right" class="listingem">excluded</td>
<td align="right" class="listingem">failed</td>
</tr>
#(root)#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="center"></td>
<td align="left" nowrap ><a href="/HostBrowser.html?path=#[path]#" class="listing">..</a></td>
<td align="right" nowrap></td>
<td align="right" colspan="5" nowrap></td>
</tr>::
#(/root)#
#{list}#
#(type)#
#(type)#<!--file-->
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="center"><div id="info"><a href="/ViewFile.html?url=#[url]#"><img src="/env/grafics/doc.gif"/></a><span>Show Metadata</span></div></td>
<td align="left" nowrap class=#(stored)#"listingem"::"listing"#(/stored)#>#[url]#&nbsp;<a href="#[url]#"><img src="/env/grafics/link.gif"/></a></td>
#(stored)#
#(load)#::<td align="right" nowrap class="listingnok"><a href="/HostBrowser.html?load=#[url]#&path=#[path]#">load &amp; index</a>#(/load)#</td>::
<td align="right" nowrap class="listingok">indexed</td>::
<td align="right" nowrap class="pending">loading</td>::
<td align="right" nowrap class="listingnok">load fail: #[error]#</td>
#(load)#<td align="left" colspan="5" nowrap class="listingem">link, detected from context</td>::<td align="left" colspan="5" nowrap class="listingnok"><a href="/HostBrowser.html?load=#[url]#&path=#[path]#">load &amp; index</a>#(/load)#</td>::
<td align="left" colspan="5" nowrap class="listingok">indexed</td>::
<td align="left" colspan="5" nowrap class="pending">loading</td>::
<td align="left" colspan="5" nowrap class="listingnok">load fail: #[error]#</td>
#(/stored)#
</tr>::
</tr>::<!--folder-->
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="center"><img src="/env/grafics/dir.gif"/></td>
<td align="left" nowrap class="listing"><a href="/HostBrowser.html?path=#[url]#" class="listing">#[url]#</a></td>
<td align="right" nowrap class="listing">#[count]# files</td>
<td align="right" class="commit">#[stored]#</td>
<td align="right" class="listing">#[linked]#</td>
<td align="right" #(pendingVisible)#class="listingem"::class="pending"#(/pendingVisible)#>#[pending]#</td>
<td align="right" #(excludedVisible)#class="listingem"::class="error"#(/excludedVisible)#>#[excluded]#</td>
<td align="right" #(failedVisible)#class="listingem"::class="error"#(/failedVisible)#>#[failed]#</td>
</tr>
#(/type)#
#{/list}#

@ -36,6 +36,7 @@ import org.apache.solr.common.SolrDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.FailType;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.RequestHeader;
@ -57,7 +58,7 @@ public class HostBrowser {
final static long TIMEOUT = 10000L;
public static enum StoreType {
LINK, INDEX, ERROR;
LINK, INDEX, EXCLUDED, FAILED;
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
@ -153,8 +154,10 @@ public class HostBrowser {
}
// collect the errorurls
ReversibleScoreMap<String> errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]", maxcount, YaCySchema.host_s.getSolrFieldName()).get(YaCySchema.host_s.getSolrFieldName()) : null;
if (errorscore == null) errorscore = new ClusteredScoreMap<String>();
Map<String, ReversibleScoreMap<String>> exclfacets = admin ? fulltext.getSolr().getFacets(YaCySchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, YaCySchema.host_s.getSolrFieldName()) : null;
ReversibleScoreMap<String> exclscore = exclfacets == null ? new ClusteredScoreMap<String>() : exclfacets.get(YaCySchema.host_s.getSolrFieldName());
Map<String, ReversibleScoreMap<String>> failfacets = admin ? fulltext.getSolr().getFacets(YaCySchema.failtype_s.getSolrFieldName() + ":" + FailType.fail.name(), maxcount, YaCySchema.host_s.getSolrFieldName()) : null;
ReversibleScoreMap<String> failscore = failfacets == null ? new ClusteredScoreMap<String>() : failfacets.get(YaCySchema.host_s.getSolrFieldName());
int c = 0;
Iterator<String> i = hostscore.keys(false);
@ -163,12 +166,17 @@ public class HostBrowser {
host = i.next();
prop.put("hosts_list_" + c + "_host", host);
boolean inCrawler = crawler.containsKey(host);
int errors = errorscore.get(host);
int exclcount = exclscore.get(host);
int failcount = failscore.get(host);
int errors = exclcount + failcount;
prop.put("hosts_list_" + c + "_count", hostscore.get(host) - errors);
prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0);
if (errors > 0) prop.put("hosts_list_" + c + "_errors_count", errors);
if (errors > 0) {
prop.put("hosts_list_" + c + "_errors_exclcount", exclcount);
prop.put("hosts_list_" + c + "_errors_failcount", failcount);
}
prop.put("hosts_list_" + c + "_type", inCrawler ? 2 : errors > 0 ? 1 : 0);
if (onlyCrawling) {
if (inCrawler) c++;
@ -231,6 +239,7 @@ public class HostBrowser {
YaCySchema.id.getSolrFieldName(),
YaCySchema.sku.getSolrFieldName(),
YaCySchema.failreason_t.getSolrFieldName(),
YaCySchema.failtype_s.getSolrFieldName(),
YaCySchema.inboundlinks_protocol_sxt.getSolrFieldName(),
YaCySchema.inboundlinks_urlstub_txt.getSolrFieldName(),
YaCySchema.outboundlinks_protocol_sxt.getSolrFieldName(),
@ -238,7 +247,7 @@ public class HostBrowser {
);
SolrDocument doc;
Set<String> storedDocs = new HashSet<String>();
Map<String, String> errorDocs = new HashMap<String, String>();
Map<String, FailType> errorDocs = new HashMap<String, FailType>();
Set<String> inboundLinks = new HashSet<String>();
Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
int hostsize = 0;
@ -246,7 +255,8 @@ public class HostBrowser {
long timeout = System.currentTimeMillis() + TIMEOUT;
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
String error = (String) doc.getFieldValue(YaCySchema.failreason_t.getSolrFieldName());
String errortype = (String) doc.getFieldValue(YaCySchema.failtype_s.getSolrFieldName());
FailType error = errortype == null ? null : FailType.valueOf(errortype);
if (u.startsWith(path)) {
if (delete) {
deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName())));
@ -298,7 +308,7 @@ public class HostBrowser {
// now combine all lists into one
Map<String, StoreType> files = new HashMap<String, StoreType>();
for (String u: storedDocs) files.put(u, StoreType.INDEX);
for (String u: errorDocs.keySet()) files.put(u, StoreType.ERROR);
for (Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED);
for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, StoreType.LINK);
for (String u: loadingLinks) if (u.startsWith(path) && !storedDocs.contains(u)) files.put(u, StoreType.LINK);
Log.logInfo("HostBrowser", "collected " + files.size() + " urls for path " + path);
@ -343,6 +353,31 @@ public class HostBrowser {
int maxcount = 1000;
int c = 0;
// first list only folders
int filecounter = 0;
for (Map.Entry<String, Object> entry: list.entrySet()) {
if ((entry.getValue() instanceof StoreType)) {
filecounter++;
} else {
// this is a folder
prop.put("files_list_" + c + "_type", 1);
prop.put("files_list_" + c + "_type_url", entry.getKey());
int linked = ((int[]) entry.getValue())[0];
int stored = ((int[]) entry.getValue())[1];
int crawler = ((int[]) entry.getValue())[2];
int error = ((int[]) entry.getValue())[3];
prop.put("files_list_" + c + "_type_stored", stored);
prop.put("files_list_" + c + "_type_linked", linked);
prop.put("files_list_" + c + "_type_pendingVisible", crawler > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_pending", crawler);
prop.put("files_list_" + c + "_type_excludedVisible", 0);
prop.put("files_list_" + c + "_type_excluded", 0);
prop.put("files_list_" + c + "_type_failedVisible", error > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_failed", error);
if (++c >= maxcount) break;
}
}
// then list files
for (Map.Entry<String, Object> entry: list.entrySet()) {
if (entry.getValue() instanceof StoreType) {
// this is a file
@ -351,30 +386,21 @@ public class HostBrowser {
StoreType type = (StoreType) entry.getValue();
try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
//String failr = fulltext.failReason(ASCII.String(uri.hash()));
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : loading ? 2 : type == StoreType.ERROR ? 3 : 0 /*linked*/);
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : (type == StoreType.EXCLUDED || type == StoreType.FAILED) ? 3 : loading ? 2 : 0 /*linked*/);
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (type == StoreType.ERROR) prop.put("files_list_" + c + "_type_stored_error", errorDocs.get(entry.getKey()));
if (type == StoreType.EXCLUDED || type == StoreType.FAILED) prop.put("files_list_" + c + "_type_stored_error", errorDocs.get(entry.getKey()).name());
if (loadRight) {
prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
prop.put("files_list_" + c + "_type_stored_load_path", path);
}
} else {
// this is a folder
prop.put("files_list_" + c + "_type", 1);
prop.put("files_list_" + c + "_type_url", entry.getKey());
int linked = ((int[]) entry.getValue())[0];
int stored = ((int[]) entry.getValue())[1];
int crawler = ((int[]) entry.getValue())[2];
int error = ((int[]) entry.getValue())[3];
prop.put("files_list_" + c + "_type_count", stored + " stored / " + linked + " linked" + (crawler > 0 ? (" / " + crawler + " pending") : "") + (error > 0 ? (" / " + error + " errors") : ""));
if (++c >= maxcount) break;
}
if (++c >= maxcount) break;
}
prop.put("files_list", c);
prop.putHTML("files_path", path);
prop.put("files_hostsize", hostsize);
prop.put("files_subpathsize", storedDocs.size());
prop.put("files_subpathloadsize", storedDocs.size());
prop.put("files_subpathdetectedsize", filecounter - storedDocs.size());
prop.put("files", 1);
// generate inbound-links table

Loading…
Cancel
Save