- added statistical evaluation about domains that appear during crawling

- added tables that show these statistics in CrawlResults web pages

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5113 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 4a4f388ca5
commit 80a7bc93d6

@ -76,6 +76,29 @@
#(table)#
<p><em>The stack is empty.</em></p>
::
<p><em>Statistics about #[domains]# domains in this stack:</em>
<table cellpadding="2" cellspacing="1" >
<tr class="TableHeader">
<td align="center">
<form action="#[feedbackpage]#" method="post" enctype="multipart/form-data">
<div>
<input type="hidden" name="process" value="#[tabletype]#" />
<input type="submit" name="clearlist" value="clear list" />
</div>
</form>
</td>
<td><strong>Domain</strong></td>
<td><strong>URLs</strong></td>
</tr>
#{domains}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td></td>
<td><a href="http://#[domain]#/" target="_">#[domain]#</a></td>
<td>#[count]#</td>
</tr>
#{/domains}#
</table><br>
<p><em>
#(size)#
Showing all #[all]# entries in this stack.
@ -85,7 +108,6 @@
</em></p>
<table cellpadding="2" cellspacing="1" >
<tr class="TableHeader">
#(showControl)#::
<td align="center">
<form action="#[feedbackpage]#" method="post" enctype="multipart/form-data">
<div>
@ -94,7 +116,6 @@
</div>
</form>
</td>
#(/showControl)#
#(showInit)#::<td><strong>Initiator</strong></td>#(/showInit)#
#(showExec)#::<td><strong>Executor</strong></td>#(/showExec)#
#(showDate)#::<td><strong>Modified</strong></td>#(/showDate)#

@ -26,6 +26,7 @@
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import de.anomic.http.httpRequestHeader;
@ -47,7 +48,6 @@ public class CrawlResults {
final serverObjects prop = new serverObjects();
int lines = 500;
boolean showControl = env.getConfigBool("IndexMonitorControl", true);
boolean showInit = env.getConfigBool("IndexMonitorInit", false);
boolean showExec = env.getConfigBool("IndexMonitorExec", false);
boolean showDate = env.getConfigBool("IndexMonitorDate", true);
@ -109,8 +109,6 @@ public class CrawlResults {
if (post.containsKey("moreIndexed")) {
lines = Integer.parseInt(post.get("showIndexed", "500"));
}
if (post.get("sc") != null)
if (post.get("sc").equals("0")) showControl = false; else showControl = true;
if (post.get("si") != null)
if (post.get("si").equals("0")) showInit = false; else showInit = true;
if (post.get("se") != null)
@ -141,12 +139,8 @@ public class CrawlResults {
}
prop.put("table_size_all", sb.crawlResults.getStackSize(tabletype));
if (showControl) {
prop.put("table_showControl", "1");
prop.putHTML("table_showControl_feedbackpage", "CrawlResults.html");
prop.put("table_showControl_tabletype", tabletype);
} else
prop.put("table_showControl", "0");
prop.putHTML("table_feedbackpage", "CrawlResults.html");
prop.put("table_tabletype", tabletype);
prop.put("table_showInit", (showInit) ? "1" : "0");
prop.put("table_showExec", (showExec) ? "1" : "0");
prop.put("table_showDate", (showDate) ? "1" : "0");
@ -165,9 +159,7 @@ public class CrawlResults {
for (i = sb.crawlResults.getStackSize(tabletype) - 1; i >= (sb.crawlResults.getStackSize(tabletype) - lines); i--) {
initiatorHash = sb.crawlResults.getInitiatorHash(tabletype, i);
executorHash = sb.crawlResults.getExecutorHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = sb.crawlResults.getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = sb.webIndex.getURL(urlHash, null, 0);
if(urle == null) {
@ -182,18 +174,13 @@ public class CrawlResults {
urltxt = nxTools.shortenURLString(urlstr, 72); // shorten the string text like a URL
cachepath = plasmaHTCache.getCachePath(new yacyURL(urlstr, null)).toString().replace('\\', '/').substring(plasmaHTCache.cachePath.toString().length() + 1);
}
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = sb.webIndex.seedDB.getConnected(initiatorHash);
executorSeed = sb.webIndex.seedDB.getConnected(executorHash);
prop.put("table_indexed_" + cnt + "_dark", (dark) ? "1" : "0");
if (showControl) {
prop.put("table_indexed_" + cnt + "_showControl", "1");
prop.put("table_indexed_" + cnt + "_showControl_feedbackpage", "CrawlResults.html");
prop.put("table_indexed_" + cnt + "_showControl_tabletype", tabletype);
prop.put("table_indexed_" + cnt + "_showControl_urlhash", urlHash);
} else
prop.put("table_indexed_" + cnt + "_showControl", "0");
prop.put("table_indexed_" + cnt + "_feedbackpage", "CrawlResults.html");
prop.put("table_indexed_" + cnt + "_tabletype", tabletype);
prop.put("table_indexed_" + cnt + "_urlhash", urlHash);
if (showInit) {
prop.put("table_indexed_" + cnt + "_showInit", "1");
@ -260,6 +247,21 @@ public class CrawlResults {
}
}
prop.put("table_indexed", cnt);
cnt = 0;
dark = true;
Iterator<String> j = sb.crawlResults.domains(tabletype);
String domain;
while (j.hasNext() && cnt < 100) {
domain = j.next();
if (domain == null) break;
prop.put("table_domains_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_domains_" + cnt + "_domain", domain);
prop.put("table_domains_" + cnt + "_count", sb.crawlResults.domainCount(tabletype, domain));
dark = !dark;
cnt++;
}
prop.put("table_domains", cnt);
}
prop.put("process", tabletype);
// return rewrite properties

@ -35,11 +35,13 @@ package de.anomic.crawler;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import de.anomic.index.indexURLReference;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
@ -56,6 +58,13 @@ public final class ResultURLs {
private final LinkedList<String> lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList<String> gcrawlResultStack; // 6 - local index: triggered external
private final kelondroMScoreCluster<String> externResultDomains;
private final kelondroMScoreCluster<String> searchResultDomains;
private final kelondroMScoreCluster<String> transfResultDomains;
private final kelondroMScoreCluster<String> proxyResultDomains;
private final kelondroMScoreCluster<String> lcrawlResultDomains;
private final kelondroMScoreCluster<String> gcrawlResultDomains;
public ResultURLs() {
// init result stacks
externResultStack = new LinkedList<String>();
@ -64,6 +73,13 @@ public final class ResultURLs {
proxyResultStack = new LinkedList<String>();
lcrawlResultStack = new LinkedList<String>();
gcrawlResultStack = new LinkedList<String>();
// init result domain statistics
externResultDomains = new kelondroMScoreCluster<String>();
searchResultDomains = new kelondroMScoreCluster<String>();
transfResultDomains = new kelondroMScoreCluster<String>();
proxyResultDomains = new kelondroMScoreCluster<String>();
lcrawlResultDomains = new kelondroMScoreCluster<String>();
gcrawlResultDomains = new kelondroMScoreCluster<String>();
}
public synchronized void stack(final indexURLReference e, final String initiatorHash, final String executorHash, final int stackType) {
@ -72,18 +88,22 @@ public final class ResultURLs {
if (e == null) { return; }
try {
final List<String> resultStack = getStack(stackType);
if(resultStack != null) {
if (resultStack != null) {
resultStack.add(e.hash() + initiatorHash + executorHash);
}
return;
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/2: " + ex.toString());
return;
}
}
public synchronized void notifyGCrawl(final String urlHash, final String initiatorHash, final String executorHash) {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
try {
final kelondroMScoreCluster<String> domains = getDomains(stackType);
if (domains != null) {
domains.incScore(e.comp().url().getHost());
}
} catch (final Exception ex) {
System.out.println("INTERNAL ERROR in newEntry/3: " + ex.toString());
return;
}
}
public synchronized int getStackSize(final int stack) {
@ -166,6 +186,24 @@ public final class ResultURLs {
return resultStack.get(pos);
}
/**
 * Iterates over the domains recorded in the domain statistics of one result stack.
 *
 * @param stack id of the result stack whose domain statistics are iterated
 * @return iterator of domains in reverse order (downwards), as delivered by
 *         <code>scores(false)</code> of the stack's score cluster; an empty
 *         iterator if no domain statistics exist for the given id
 */
public Iterator<String> domains(final int stack) {
    final kelondroMScoreCluster<String> resultDomains = getDomains(stack);
    if (resultDomains == null) {
        // getDomains answers null for ids it does not know; hand back an
        // empty iterator so callers can loop without a null check
        return new LinkedList<String>().iterator();
    }
    return resultDomains.scores(false);
}
/**
 * Returns the count stored for a domain in the domain statistics of one result stack.
 *
 * @param stack id of the result stack
 * @param domain the domain name to look up
 * @return the number of occurrences of the domain in the stack statistics;
 *         0 if no domain statistics exist for the given id
 */
public int domainCount(final int stack, String domain) {
    final kelondroMScoreCluster<String> resultDomains = getDomains(stack);
    if (resultDomains == null) {
        // unknown stack id: nothing was ever counted for it
        return 0;
    }
    return resultDomains.getScore(domain);
}
/**
* returns the stack identified by the id <em>stack</em>
*
@ -184,6 +222,18 @@ public final class ResultURLs {
return null;
}
}
/**
 * Selects the domain score cluster that belongs to a result stack id.
 *
 * @param stack id of the result stack (1 to 6)
 * @return the score cluster for that id, or <code>null</code> for any other id
 */
private kelondroMScoreCluster<String> getDomains(final int stack) {
    if (stack == 1) {
        return externResultDomains;
    }
    if (stack == 2) {
        return searchResultDomains;
    }
    if (stack == 3) {
        return transfResultDomains;
    }
    if (stack == 4) {
        return proxyResultDomains;
    }
    if (stack == 5) {
        return lcrawlResultDomains;
    }
    if (stack == 6) {
        return gcrawlResultDomains;
    }
    // no statistics are kept for ids outside 1..6
    return null;
}
/**
* tests if a stack with id <em>stack</em> exists
@ -196,16 +246,6 @@ public final class ResultURLs {
}
public synchronized boolean removeStack(final int stack, final int pos) {
// Object prevElement = null;
// switch (stack) {
// case 1: prevElement = externResultStack.remove(pos); break;
// case 2: prevElement = searchResultStack.remove(pos); break;
// case 3: prevElement = transfResultStack.remove(pos); break;
// case 4: prevElement = proxyResultStack.remove(pos); break;
// case 5: prevElement = lcrawlResultStack.remove(pos); break;
// case 6: prevElement = gcrawlResultStack.remove(pos); break;
// }
// return prevElement != null;
final List<String> resultStack = getStack(stack);
if(resultStack == null) {
return false;
@ -215,17 +255,10 @@ public final class ResultURLs {
/**
 * Empties the result stack with the given id and the domain statistics kept for it.
 * An id for which <code>getStack</code> or <code>getDomains</code> returns
 * <code>null</code> leaves the corresponding part untouched.
 *
 * @param stack id of the result stack to clear
 */
public synchronized void clearStack(final int stack) {
    final List<String> resultStack = getStack(stack);
    if (resultStack != null) {
        resultStack.clear();
    }
    // the domain statistics describe the stack entries, so they are reset together
    final kelondroMScoreCluster<String> resultDomains = getDomains(stack);
    if (resultDomains != null) {
        resultDomains.clear();
    }
}
public synchronized boolean remove(final String urlHash) {

Loading…
Cancel
Save