diff --git a/htroot/CrawlResults.html b/htroot/CrawlResults.html
index ffb2d1ab8..e4081076c 100644
--- a/htroot/CrawlResults.html
+++ b/htroot/CrawlResults.html
@@ -76,6 +76,29 @@ #(table)#

 The stack is empty.
 
 ::
+<p>Statistics about #[domains]# domains in this stack:</p>
+<table>
+  <tr>
+    <th>Domain</th>
+    <th>URLs</th>
+  </tr>
+  #{domains}#
+  <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
+    <td>#[domain]#</td>
+    <td>#[count]#</td>
+  </tr>
+  #{/domains}#
+</table>
 #(size)#
 Showing all #[all]# entries in this stack.
@@ -85,7 +108,6 @@
-  #(showControl)#::
-  #(/showControl)#
   #(showInit)#::#(/showInit)#
   #(showExec)#::#(/showExec)#
   #(showDate)#::#(/showDate)#
@@ -94,7 +116,6 @@
   <th>Initiator</th>
   <th>Executor</th>
   <th>Modified</th>
diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java
index 7c863be40..e7be9c9ae 100644
--- a/htroot/CrawlResults.java
+++ b/htroot/CrawlResults.java
@@ -26,6 +26,7 @@
 
 import java.text.SimpleDateFormat;
 import java.util.Date;
+import java.util.Iterator;
 import java.util.Locale;
 
 import de.anomic.http.httpRequestHeader;
@@ -47,7 +48,6 @@ public class CrawlResults {
         final serverObjects prop = new serverObjects();
         int lines = 500;
-        boolean showControl = env.getConfigBool("IndexMonitorControl", true);
         boolean showInit = env.getConfigBool("IndexMonitorInit", false);
         boolean showExec = env.getConfigBool("IndexMonitorExec", false);
         boolean showDate = env.getConfigBool("IndexMonitorDate", true);
@@ -109,8 +109,6 @@ public class CrawlResults {
             if (post.containsKey("moreIndexed")) {
                 lines = Integer.parseInt(post.get("showIndexed", "500"));
             }
-            if (post.get("sc") != null)
-                if (post.get("sc").equals("0")) showControl = false; else showControl = true;
             if (post.get("si") != null)
                 if (post.get("si").equals("0")) showInit = false; else showInit = true;
             if (post.get("se") != null)
@@ -141,12 +139,8 @@ public class CrawlResults {
         }
         prop.put("table_size_all", sb.crawlResults.getStackSize(tabletype));
-        if (showControl) {
-            prop.put("table_showControl", "1");
-            prop.putHTML("table_showControl_feedbackpage", "CrawlResults.html");
-            prop.put("table_showControl_tabletype", tabletype);
-        } else
-            prop.put("table_showControl", "0");
+        prop.putHTML("table_feedbackpage", "CrawlResults.html");
+        prop.put("table_tabletype", tabletype);
         prop.put("table_showInit", (showInit) ? "1" : "0");
         prop.put("table_showExec", (showExec) ? "1" : "0");
         prop.put("table_showDate", (showDate) ? "1" : "0");
@@ -165,9 +159,7 @@ public class CrawlResults {
         for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) {
             initiatorHash = sb.crawlResults.getInitiatorHash(tabletype, i);
             executorHash = sb.crawlResults.getExecutorHash(tabletype, i);
-//          serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
             urlHash = sb.crawlResults.getUrlHash(tabletype, i);
-//          serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
             try {
                 urle = sb.webIndex.getURL(urlHash, null, 0);
                 if(urle == null) {
@@ -182,18 +174,13 @@ public class CrawlResults {
                     urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
                     cachepath = plasmaHTCache.getCachePath(new yacyURL(urlstr, null)).toString().replace('\\', '/').substring(plasmaHTCache.cachePath.toString().length() + 1);
                 }
-//              serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
                 initiatorSeed = sb.webIndex.seedDB.getConnected(initiatorHash);
                 executorSeed = sb.webIndex.seedDB.getConnected(executorHash);
                 prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0");
-                if (showControl) {
-                    prop.put("table_indexed_" + cnt + "_showControl", "1");
-                    prop.put("table_indexed_" + cnt + "_showControl_feedbackpage", "CrawlResults.html");
-                    prop.put("table_indexed_" + cnt + "_showControl_tabletype", tabletype);
-                    prop.put("table_indexed_" + cnt + "_showControl_urlhash", urlHash);
-                } else
-                    prop.put("table_indexed_" + cnt + "_showControl", "0");
+                prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html");
+                prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
+                prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
 
                 if (showInit) {
                     prop.put("table_indexed_" + cnt + "_showInit", "1");
@@ -260,6 +247,21 @@ public class CrawlResults {
                 }
             }
             prop.put("table_indexed", cnt);
+
+            cnt = 0;
+            dark = true;
+            Iterator<String> j = sb.crawlResults.domains(tabletype);
+            String domain;
+            while (j.hasNext() && cnt < 100) {
+                domain = j.next();
+                if (domain == null) break;
+                prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0");
+                prop.put("table_domains_" + cnt + "_domain", domain);
+                prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain));
+                dark = !dark;
+                cnt++;
+            }
+            prop.put("table_domains", cnt);
         }
         prop.put("process", tabletype);
         // return rewrite properties
diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java
index f30332fd0..5c22ec737 100644
--- a/source/de/anomic/crawler/ResultURLs.java
+++ b/source/de/anomic/crawler/ResultURLs.java
@@ -35,11 +35,13 @@ package de.anomic.crawler;
 import java.net.MalformedURLException;
 import java.util.Date;
+import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 
 import de.anomic.index.indexURLReference;
 import de.anomic.kelondro.kelondroBitfield;
+import de.anomic.kelondro.kelondroMScoreCluster;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.yacyURL;
@@ -56,6 +58,13 @@ public final class ResultURLs {
     private final LinkedList<String> lcrawlResultStack; // 5 - local index: result of local crawling
     private final LinkedList<String> gcrawlResultStack; // 6 - local index: triggered external
 
+    private final kelondroMScoreCluster<String> externResultDomains;
+    private final kelondroMScoreCluster<String> searchResultDomains;
+    private final kelondroMScoreCluster<String> transfResultDomains;
+    private final kelondroMScoreCluster<String> proxyResultDomains;
+    private final kelondroMScoreCluster<String> lcrawlResultDomains;
+    private final kelondroMScoreCluster<String> gcrawlResultDomains;
+
     public ResultURLs() {
         // init result stacks
         externResultStack = new LinkedList<String>();
@@ -64,6 +73,13 @@ public final class ResultURLs {
         proxyResultStack = new LinkedList<String>();
         lcrawlResultStack = new LinkedList<String>();
         gcrawlResultStack = new LinkedList<String>();
+        // init result domain statistics
+        externResultDomains = new kelondroMScoreCluster<String>();
+        searchResultDomains = new kelondroMScoreCluster<String>();
+        transfResultDomains = new kelondroMScoreCluster<String>();
+        proxyResultDomains = new kelondroMScoreCluster<String>();
+        lcrawlResultDomains = new kelondroMScoreCluster<String>();
+        gcrawlResultDomains = new kelondroMScoreCluster<String>();
     }
 
     public synchronized void stack(final indexURLReference e, final String initiatorHash, final String executorHash, final int stackType) {
@@ -72,18 +88,22 @@
         if (e == null) { return; }
         try {
             final List<String> resultStack = getStack(stackType);
-            if(resultStack != null) {
+            if (resultStack != null) {
                 resultStack.add(e.hash() + initiatorHash + executorHash);
             }
-            return;
         } catch (final Exception ex) {
             System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
             return;
         }
-    }
-
-    public synchronized void notifyGCrawl(final String urlHash, final String initiatorHash, final String executorHash) {
-        gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
+        try {
+            final kelondroMScoreCluster<String> domains = getDomains(stackType);
+            if (domains != null) {
+                domains.incScore(e.comp().url().getHost());
+            }
+        } catch (final Exception ex) {
+            System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString());
+            return;
+        }
     }
 
     public synchronized int getStackSize(final int stack) {
@@ -166,6 +186,24 @@ public final class ResultURLs {
         return resultStack.get(pos);
     }
 
+    /**
+     * iterate all domains in the result domain statistic
+     * @return iterator of domains in reverse order (downwards)
+     */
+    public Iterator<String> domains(final int stack) {
+        return getDomains(stack).scores(false);
+    }
+
+    /**
+     * return the count of the domain
+     * @param stack type
+     * @param domain name
+     * @return the number of occurrences of the domain in the stack statistics
+     */
+    public int domainCount(final int stack, String domain) {
+        return getDomains(stack).getScore(domain);
+    }
+
     /**
      * returns the stack indentified by the id stack
      *
@@ -184,6 +222,18 @@
             return null;
         }
     }
+    private kelondroMScoreCluster<String> getDomains(final int stack) {
+        switch (stack) {
+            case 1: return externResultDomains;
+            case 2: return searchResultDomains;
+            case 3: return transfResultDomains;
+            case 4: return proxyResultDomains;
+            case 5: return lcrawlResultDomains;
+            case 6: return gcrawlResultDomains;
+            default:
+                return null;
+        }
+    }
 
     /**
      * tests if a stack with id stack exists
@@ -196,16 +246,6 @@
     }
 
     public synchronized boolean removeStack(final int stack, final int pos) {
-//      Object prevElement = null;
-//      switch (stack) {
-//          case 1: prevElement = externResultStack.remove(pos); break;
-//          case 2: prevElement = searchResultStack.remove(pos); break;
-//          case 3: prevElement = transfResultStack.remove(pos); break;
-//          case 4: prevElement = proxyResultStack.remove(pos); break;
-//          case 5: prevElement = lcrawlResultStack.remove(pos); break;
-//          case 6: prevElement = gcrawlResultStack.remove(pos); break;
-//      }
-//      return prevElement != null;
         final List<String> resultStack = getStack(stack);
         if(resultStack == null) {
             return false;
         }
@@ -215,17 +255,10 @@
 
     public synchronized void clearStack(final int stack) {
         final List<String> resultStack = getStack(stack);
-        if(resultStack != null) {
-            resultStack.clear();
-        }
-//      switch (stack) {
-//          case 1: externResultStack.clear(); break;
-//          case 2: searchResultStack.clear(); break;
-//          case 3: transfResultStack.clear(); break;
-//          case 4: proxyResultStack.clear(); break;
-//          case 5: lcrawlResultStack.clear(); break;
-//          case 6: gcrawlResultStack.clear(); break;
-//      }
+        if (resultStack != null) resultStack.clear();
+        final kelondroMScoreCluster<String> resultDomains = getDomains(stack);
+        if (resultDomains != null) resultDomains.clear();
+
     }
 
     public synchronized boolean remove(final String urlHash) {
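
Note (not part of the patch): the per-domain statistic above relies on de.anomic.kelondro.kelondroMScoreCluster, of which only incScore, getScore, scores(false) and clear are exercised here. As a rough, hypothetical sketch, assuming nothing about the real kelondro implementation (the class name DomainScoreSketch is invented for illustration), the same counting behaviour can be pictured with a plain HashMap:

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

// Hypothetical stand-in for the kelondroMScoreCluster operations used in this patch
// (incScore, getScore, scores(false), clear); for illustration only, not YaCy code.
public class DomainScoreSketch {

    private final Map<String, Integer> counts = new HashMap<String, Integer>();

    // called once per stacked result, compare stack() in ResultURLs.java
    public synchronized void incScore(final String host) {
        final Integer c = counts.get(host);
        counts.put(host, Integer.valueOf((c == null) ? 1 : c.intValue() + 1));
    }

    // number of results seen for this host, compare domainCount()
    public synchronized int getScore(final String host) {
        final Integer c = counts.get(host);
        return (c == null) ? 0 : c.intValue();
    }

    // hosts ordered by descending count, mirroring scores(false) in domains()
    public synchronized Iterator<String> scores() {
        final List<String> hosts = new ArrayList<String>(counts.keySet());
        Collections.sort(hosts, new Comparator<String>() {
            public int compare(final String a, final String b) {
                return counts.get(b).intValue() - counts.get(a).intValue();
            }
        });
        return hosts.iterator();
    }

    // forget all statistics, compare clearStack()
    public synchronized void clear() {
        counts.clear();
    }
}

With such a counter, the loop added to CrawlResults.java simply walks the descending host iterator and reads one count per host; kelondroMScoreCluster supplies the same operations backed by YaCy's own score-cluster data structure.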