complete redesign of crawl queue monitoring: do not look at a
ready-prepared crawl list but at the stacks of the domains that are
stored for balanced crawling. This also affects the balancer, since it
no longer needs to prepare a pre-selected crawl list for monitoring.
As an effect:
- it is no longer possible to see the exact order of the next
to-be-crawled links, since that depends on the actual state of the
balancer stack at the moment the next URL is requested for loading
- the balancer works better, since the next URL can be selected
according to the current situation instead of a pre-selected order.
pull/1/head
Michael Peter Christen 13 years ago
parent 7e4e3fe5b6
commit 9ad1d8dde2
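The access pattern this commit switches to can be sketched independently of YaCy's classes: the monitoring servlet below asks the balancer for its live per-host stacks via getDomainStackHosts and getDomainStackReferences instead of iterating a flat, pre-ordered list. A minimal standalone Java sketch of that idea, using plain collections and invented names rather than YaCy's actual API:

// Minimal standalone sketch of the idea behind this commit (not YaCy's
// actual classes): monitoring reads the live per-host stacks instead of
// a pre-computed snapshot list.
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class DomainStackMonitor {
    // host name -> stack of URLs queued for that host
    private final Map<String, Deque<String>> domainStacks = new ConcurrentHashMap<>();

    public void push(String host, String url) {
        domainStacks.computeIfAbsent(host, h -> new ArrayDeque<>()).addLast(url);
    }

    // analogous to getDomainStackHosts: one summary row per host
    public Map<String, Integer> hostSummary() {
        Map<String, Integer> summary = new LinkedHashMap<>();
        domainStacks.forEach((host, stack) -> summary.put(host, stack.size()));
        return summary;
    }

    // analogous to getDomainStackReferences: peek at up to n URLs of one host
    public List<String> peek(String host, int n) {
        List<String> out = new ArrayList<>();
        Deque<String> stack = domainStacks.getOrDefault(host, new ArrayDeque<>());
        for (String url : stack) {
            if (out.size() >= n) break;
            out.add(url);
        }
        return out;
    }

    public static void main(String[] args) {
        DomainStackMonitor m = new DomainStackMonitor();
        m.push("example.org", "http://example.org/a");
        m.push("example.org", "http://example.org/b");
        m.push("example.net", "http://example.net/");
        System.out.println(m.hostSummary());          // {example.org=2, example.net=1}
        System.out.println(m.peek("example.org", 1)); // [http://example.org/a]
    }
}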

@@ -6,14 +6,22 @@
 <script type="text/javascript" src="/js/ajax.js"></script>
 <script type="text/javascript" src="/js/xml.js"></script>
 <script type="text/javascript" src="/js/html.js"></script>
-<script type="text/javascript" src="/js/Crawler.js"></script></head>
-<body id="Crawler" onload="initCrawler();">
+<script type="text/javascript" src="/js/Crawler.js"></script>
+<script type="text/javascript">
+function refreshiframe()
+{
+var f = document.getElementById('QueuesTable');
+f.contentWindow.location.reload(true);
+setTimeout("refreshiframe()", 1000);
+}
+</script>
+</head>
+<body id="Crawler" onload="initCrawler();refreshiframe();">
 #%env/templates/header.template%#
 #%env/templates/submenuCrawlMonitor.template%#
 <h2>Crawler Queues</h2>
 <noscript><p>(Please enable JavaScript to automatically update this page!)</p></noscript>
 <p> Next update in <input type="text" id="nextUpdate" onfocus="changeInterval()" onblur="newInterval()" size="2" /> seconds. <img src="/env/grafics/empty.gif" id="ajax" alt="empty"/>
-&nbsp;See a access timing <a href="/api/latency_p.xml">here</a></p>
 <table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
 <tbody>
 <tr class="TableHeader">
@@ -71,20 +79,6 @@
 </tbody>
 </table>
-<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
-<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
-<tbody>
-<tr class="TableHeader">
-<th colspan="3">Speed</th>
-</tr>
-<tr class="TableCellLight">
-<td align="left" #(crawlingSpeedMinChecked)#::class="TableCellDark"#(/crawlingSpeedMinChecked)#><input type="submit" name="crawlingPerformance" value="minimum" /></td>
-<td align="left" #(crawlingSpeedCustChecked)#::class="TableCellDark"#(/crawlingSpeedCustChecked)#><input name="customPPM" type="text" size="5" maxlength="5" value="#[customPPMdefault]#" />PPM <input type="submit" name="crawlingPerformance" value="custom" /></td>
-<td align="left" #(crawlingSpeedMaxChecked)#::class="TableCellDark"#(/crawlingSpeedMaxChecked)#><input type="submit" name="crawlingPerformance" value="maximum" /></td>
-</tr>
-</tbody>
-</table>
-</form>
 <table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
 <tbody>
@@ -103,15 +97,24 @@
 </tbody>
 </table>
+<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
 <table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
 <tbody>
 <tr class="TableHeader">
 <th>Indicator</th>
 <th colspan="2">Level</th>
 </tr>
+<tr class="TableCellLight">
+<td align="left">Speed</td>
+<td align="left" colspan="2">
+<input #(crawlingSpeedMinChecked)#::class="TableCellDark"#(/crawlingSpeedMinChecked)# type="submit" name="crawlingPerformance" value="minimum" />
+<input #(crawlingSpeedCustChecked)#::class="TableCellDark"#(/crawlingSpeedCustChecked)# name="customPPM" type="text" size="5" maxlength="5" value="#[customPPMdefault]#" />PPM <input type="submit" name="crawlingPerformance" value="custom" />
+<input #(crawlingSpeedMaxChecked)#::class="TableCellDark"#(/crawlingSpeedMaxChecked)# type="submit" name="crawlingPerformance" value="maximum" />
+</td>
+</tr>
 <tr class="TableCellLight">
 <td align="left">PPM (Pages Per Minute)</td>
-<td align="left"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
+<td align="left" width="20"><span id="ppmNum">&nbsp;&nbsp;&nbsp;</span></td>
 <td align="left"><span id="ppmSpan">&nbsp;&nbsp;&nbsp;</span></td>
 </tr>
 <tr class="TableCellLight">
@@ -126,6 +129,7 @@
 </tr>
 </tbody>
 </table>
+</form>
 <p class="watchCrawler"> #(info)#
 <!-- 0 -->
@@ -157,22 +161,9 @@
 <!-- crawl queues -->
-<p id="crawlingQueues"><strong>Crawl Queue:</strong></p>
-<table border="0" cellpadding="2" cellspacing="1" id="queueTable">
-<tbody>
-<tr class="TableHeader">
-<th>Queue</th>
-<th>Profile</th>
-<th>Initiator</th>
-<th>Depth</th>
-<th>Modified Date</th>
-<th>Anchor Name</th>
-<th>URL</th>
-<th>Size</th>
-<th>Delete</th>
-</tr>
-</tbody>
-</table>
+<p>See an <a href="/api/latency_p.xml">access timing</a></p>
+<iframe id="QueuesTable" src="IndexCreateQueues_p.html?embed=&urlsPerHost=1" width="100%" height="0" align="left" scrolling="no" marginheight="0" marginwidth="0" frameborder="0" ></iframe>
 #%env/templates/footer.template%#
 </body>

@ -0,0 +1,95 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': '#[queuename]#' Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateQueues">
<div id="fullcontent">
#(embed)#
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>'#[queuename]#' Crawl Queue</h2>
::#(/embed)#
#(crawler)#
<p>This crawler queue is empty</p>
::
#(embed)#
<form action="IndexCreateQueues_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
Delete Entries:
<input type="text" name="pattern" value="#[deletepattern]#" size="40" maxlength="200" />
<select name="option" size="1">
<option value="5">Initiator</option>
<option value="3">Profile</option>
<option value="4">Depth</option>
<option value="6">Modified Date</option>
<option value="2">Anchor Name</option>
<option value="1" selected="selected">URL</option>
</select>
<input type="hidden" name="stack" value="#[queuename]#" />
<input type="submit" name="delete" value="Delete" />
</fieldset>
</form>
::#(/embed)#
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="5" />
<col width="10" />
<col width="30" />
<col width="10" />
<col width="10" />
<col width="10" />
<col width="10" />
<col width="10" />
<col width="10" />
<col />
</colgroup>
<tr class="TableHeader">
<th>Count</th>
<th>Delta/ms</th>
<th>Host</th>
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>Delta/ms</th>
<th>URL</th>
</tr>
#{host}#
<tr class="TableCellDark">
<td>#[hostcount]#</td>
<td>#[hostdelta]#</td>
<td><a href="IndexCreateQueues_p.html?#(embed)#::embed=&#(/embed)#delete=&stack=#[queuename]#&option=1&pattern=.*#[hostname]#.*&urlsPerHost=#[urlsPerHost]#"><img src="env/grafics/trash.gif"></a>&nbsp;#[hostname]#</td>
<td colspan="7"></td>
</tr>
#{list}#
<tr class="TableCellLight">
<td colspan="3"></td>
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td>#[delta]#</td>
<td><a href="#[url]#">#[url]#</a></td>
</tr>
#{/list}#
</td>
</tr>
#{/host}#
#(/crawler)#
#(embed)#
#%env/templates/footer.template%#
::#(/embed)#
</div>
<script type="text/javascript">
<!--
parentPage = parent.document.getElementById('QueuesTable');
if (parentPage != null) parentPage.height = document.getElementById('fullcontent').offsetHeight + 30;
-->
</script>
</body>
</html>
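A reading aid for the servlet diff that follows: YaCy templates flatten nested loop markers such as #{host}# and #{list}# into indexed property keys of the form crawler_host_<i>_list_<j>_<field>, which is exactly the naming the new IndexCreateQueues_p servlet produces. A hypothetical sketch of that key scheme, with a plain HashMap standing in for YaCy's serverObjects class:

// Hypothetical sketch of the key flattening used by the template engine;
// a plain HashMap stands in for YaCy's serverObjects.
import java.util.HashMap;
import java.util.Map;

public class TemplateKeysSketch {
    public static void main(String[] args) {
        Map<String, String> prop = new HashMap<>();
        prop.put("crawler", "1");               // selects the non-empty #(crawler)# branch
        prop.put("crawler_host", "1");          // one #{host}# iteration
        prop.put("crawler_host_0_hostname", "example.org");
        prop.put("crawler_host_0_hostcount", "2");
        prop.put("crawler_host_0_list", "1");   // one nested #{list}# row for this host
        prop.put("crawler_host_0_list_0_url", "http://example.org/page.html");
        // the engine would substitute #[hostname]# inside the first #{host}#
        // iteration with prop.get("crawler_host_0_hostname"), and so on
        System.out.println(prop.get("crawler_host_0_list_0_url"));
    }
}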

@@ -1,31 +1,3 @@
-// IndexCreateWWWLocalQueue_p.java
-// -------------------------------
-// part of the AnomicHTTPD caching proxy
-// (C) by Michael Peter Christen; mc@yacy.net
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2004, 2005
-//
-//$LastChangedDate$
-//$LastChangedRevision$
-//$LastChangedBy$
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-// You must compile this file with
-// javac -classpath .:../classes IndexCreate_p.java
-// if the shell's current path is HTROOT
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
@@ -33,6 +5,7 @@ import java.util.Date;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
@@ -41,15 +14,14 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.peers.Seed;
 import net.yacy.search.Switchboard;
 import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.NoticedURL;
 import de.anomic.crawler.CrawlSwitchboard;
+import de.anomic.crawler.NoticedURL.StackType;
 import de.anomic.crawler.retrieval.Request;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
-public class IndexCreateWWWLocalQueue_p {
+public class IndexCreateQueues_p {
 private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
 private static String daydate(final Date date) {
@@ -69,24 +41,26 @@ public class IndexCreateWWWLocalQueue_p {
 // return variable that accumulates replacements
 final Switchboard sb = (Switchboard) env;
 final serverObjects prop = new serverObjects();
+StackType stackType = StackType.LOCAL;
+int urlsPerHost = 5;
+boolean embed = false;
+String deletepattern = ".*";
-int showLimit = 100;
 if (post != null) {
-showLimit = post.getInt("limit", 100);
+stackType = StackType.valueOf(post.get("stack", stackType.name()).toUpperCase());
+urlsPerHost = post.getInt("urlsPerHost", urlsPerHost);
+if (post.containsKey("embed")) embed = true;
-if (post.containsKey("deleteEntries")) {
-int c = 0;
-final String pattern = post.get("pattern", ".*").trim();
+if (post.containsKey("delete")) {
+deletepattern = post.get("pattern", deletepattern).trim();
 final int option = post.getInt("option", INVALID);
-if (".*".equals(pattern)) {
-c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
-sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.CORE);
+if (".*".equals(deletepattern)) {
+sb.crawlQueues.noticeURL.clear(stackType);
 try { sb.cleanProfiles(); } catch (final InterruptedException e) {/* ignore this */}
 } else if (option > INVALID) {
 try {
 // compiling the regular expression
-final Pattern compiledPattern = Pattern.compile(pattern);
+final Pattern compiledPattern = Pattern.compile(deletepattern);
 if (option == PROFILE) {
 // search and delete the crawl profile (_much_ faster, independant of queue size)
@@ -107,7 +81,7 @@ public class IndexCreateWWWLocalQueue_p {
 }
 } else {
 // iterating through the list of URLs
-final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(NoticedURL.StackType.CORE);
+final Iterator<Request> iter = sb.crawlQueues.noticeURL.iterator(stackType);
 Request entry;
 final List<byte[]> removehashes = new ArrayList<byte[]>();
 while (iter.hasNext()) {
@@ -127,7 +101,7 @@ public class IndexCreateWWWLocalQueue_p {
 if (value != null && compiledPattern.matcher(value).matches()) removehashes.add(entry.url().hash());
 }
-Log.logInfo("IndexCreateWWWLocalQueue", "created a remove list with " + removehashes.size() + " entries for pattern '" + pattern + "'");
+Log.logInfo("IndexCreateQueues_p", "created a remove list with " + removehashes.size() + " entries for pattern '" + deletepattern + "'");
 for (final byte[] b: removehashes) {
 sb.crawlQueues.noticeURL.removeByURLHash(b);
 }
@@ -136,56 +110,59 @@ public class IndexCreateWWWLocalQueue_p {
 Log.logException(e);
 }
 }
-prop.put("info", "3");//crawling queue cleared
-prop.putNum("info_numEntries", c);
-} else if (post.containsKey("deleteEntry")) {
-final String urlHash = post.get("deleteEntry");
-sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
-prop.put("LOCATION","");
-return prop;
 }
 }
-int showNum = 0, stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
+int stackSize = sb.crawlQueues.noticeURL.stackSize(stackType);
 if (stackSize == 0) {
-prop.put("crawler-queue", "0");
+prop.put("crawler", "0");
 } else {
-prop.put("crawler-queue", "1");
-final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, (int) (showLimit * 1.20));
+prop.put("crawler", "1");
+prop.put("crawler_embed", embed ? 1 : 0);
+prop.put("crawler_embed_deletepattern", deletepattern);
+prop.put("crawler_embed_queuename", stackType.name());
+final Map<String, Integer[]> hosts = sb.crawlQueues.noticeURL.getDomainStackHosts(stackType);
+int hc = 0;
+for (Map.Entry<String, Integer[]> host: hosts.entrySet()) {
+prop.putHTML("crawler_host_" + hc + "_hostname", host.getKey());
+prop.put("crawler_host_" + hc + "_embed", embed ? 1 : 0);
+prop.put("crawler_host_" + hc + "_urlsPerHost", urlsPerHost);
+prop.putHTML("crawler_host_" + hc + "_queuename", stackType.name());
+prop.put("crawler_host_" + hc + "_hostcount", host.getValue()[0]);
+prop.put("crawler_host_" + hc + "_hostdelta", host.getValue()[1]);
+List<Request> domainStackReferences = sb.crawlQueues.noticeURL.getDomainStackReferences(stackType, host.getKey(), urlsPerHost);
-Request urle;
-boolean dark = true;
 Seed initiator;
 String profileHandle;
 CrawlProfile profileEntry;
-int i;
-for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
-urle = crawlerList.get(i);
-if ((urle != null)&&(urle.url()!=null)) {
-initiator = sb.peers.getConnected(urle.initiator() == null ? "" : ASCII.String(urle.initiator()));
-profileHandle = urle.profileHandle();
+int count = 0;
+for (Request request: domainStackReferences) {
+if (request == null) continue;
+initiator = sb.peers.getConnected(request.initiator() == null ? "" : ASCII.String(request.initiator()));
+profileHandle = request.profileHandle();
 profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
-prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
-prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
-prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
-prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
-prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
-prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
-prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
-prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
-dark = !dark;
-showNum++;
-} else {
-stackSize--;
+prop.putHTML("crawler_host_" + hc + "_list_" + count + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
+prop.put("crawler_host_" + hc + "_list_" + count + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
+prop.put("crawler_host_" + hc + "_list_" + count + "_depth", request.depth());
+prop.put("crawler_host_" + hc + "_list_" + count + "_modified", daydate(request.appdate()) );
+prop.putHTML("crawler_host_" + hc + "_list_" + count + "_anchor", request.name());
+prop.put("crawler_host_" + hc + "_list_" + count + "_delta", sb.crawlQueues.noticeURL.getDomainSleepTime(stackType, sb.crawler, request));
+prop.putHTML("crawler_host_" + hc + "_list_" + count + "_url", request.url().toNormalform(false, true));
+prop.put("crawler_host_" + hc + "_list_" + count + "_hash", request.url().hash());
+count++;
 }
+prop.putNum("crawler_host_" + hc + "_list", count);
+hc++;
 }
-prop.putNum("crawler-queue_list", showNum);
-prop.putNum("crawler-queue_num", stackSize);//num Entries
-prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
+prop.put("crawler_host", hc);
 }
+prop.put("embed", embed ? 1 : 0);
+prop.put("queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase());
+prop.put("embed_queuename", stackType.name().charAt(0) + stackType.name().substring(1).toLowerCase());
 // return rewrite properties
 return prop;
 }

@ -1,58 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Global Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Global Crawl Queue</h2>
<p>
This queue stores the urls that shall be sent to other peers to perform a remote crawl.
If there is no peer for remote crawling available, the links are crawled locally.
</p>
#(crawler-queue)#
<p>The global crawler queue is empty</p>
::
<form action="IndexCreateWWWGlobalQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<input type="submit" name="clearcrawlqueue" value="clear global crawl queue" />
</fieldset>
</form>
<p>There are <strong>#[num]#</strong> entries in the global crawler queue. Showing <strong>#[show-num]#</strong> most recent entries.</p>
<p>Show last <a href="IndexCreateWWWGlobalQueue_p.html?limit=50">50</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=100">100</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=250">250</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=500">500</a> entries.</p>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="60" span="2" />
<col width="10" />
<col width="80" />
<col width="180" />
<col />
<col width="10" />
</colgroup>
<tr class="TableHeader">
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Delete</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td><a href="IndexCreateWWWGlobalQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
</tr>
#{/list}#
</table>
#(/crawler-queue)#
#%env/templates/footer.template%#
</body>
</html>

@ -1,125 +0,0 @@
// IndexCreateWWWGlobalQueue_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
//
//$LastChangedDate$
//$LastChangedRevision$
//$LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// You must compile this file with
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class IndexCreateWWWGlobalQueue_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
int showLimit = 100;
if (post != null) {
showLimit = post.getInt("limit", 100);
if (post.containsKey("clearcrawlqueue")) {
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.LIMIT);
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
/*
int c = 0;
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
*/
prop.put("info", "3");//crawling queue cleared
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}
}
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
Request urle;
boolean dark = true;
Seed initiator;
String profileHandle;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
dark = !dark;
showNum++;
} else {
stackSize--;
}
}
prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
prop.putNum("crawler-queue_num", stackSize);//num Entries
prop.putNum("crawler-queue_list", showNum);
}
// return rewrite properties
return prop;
}
}

@ -1,69 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Local Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWLocalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Local Crawl Queue</h2>
<p>
This queue stores the urls that shall be crawled localy by this peer.
It may also contain urls that are computed by the proxy-prefetch.
</p>
#(crawler-queue)#
<p>The local crawler queue is empty</p>
::
<form action="IndexCreateWWWLocalQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
Delete Entries:
<input type="text" name="pattern" value=".*" size="40" maxlength="200" />
<select name="option" size="1">
<option value="5">Initiator</option>
<option value="3">Profile</option>
<option value="4">Depth</option>
<option value="6">Modified Date</option>
<option value="2">Anchor Name</option>
<option value="1" selected="selected">URL</option>
</select>
<input type="submit" name="deleteEntries" value="Delete" /><em>This may take a quite long time.</em>
</fieldset>
</form>
<p>There are <strong>#[num]#</strong> entries in the local crawler queue. Showing <strong>#[show-num]#</strong> most recent entries.</p>
<p>Show last <a href="IndexCreateWWWLocalQueue_p.html?limit=50">50</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=100">100</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=250">250</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=500">500</a> entries.</p>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="60" span="2" />
<col width="10" />
<col width="80" />
<col width="180" />
<col />
<col width="10" />
</colgroup>
<tr class="TableHeader">
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Delete</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td><a href="IndexCreateWWWLocalQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
</tr>
#{/list}#
</table>
#(/crawler-queue)#
#%env/templates/footer.template%#
</body>
</html>

@ -1,65 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Remote Crawl Queue</title>
#%env/templates/metas.template%#
</head>
<body id="IndexCreateWWWGlobalQueue">
#%env/templates/header.template%#
#%env/templates/submenuCrawlMonitor.template%#
<h2>Remote Crawl Queue</h2>
<p>
This queue stores the urls that other peers sent to you in order to perform a remote crawl for them.
</p>
#(crawler-queue)#
<p>The remote crawler queue is empty</p>
::
<form action="IndexCreateWWWRemoteQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<input type="submit" name="clearcrawlqueue" value="clear remote crawl queue" />
</fieldset>
</form>
<p>
There are <strong>#[num]#</strong> entries in the remote crawler queue.
Showing <strong>#[show-num]#</strong> most recent entries.
</p>
<p>
Show last <a href="IndexCreateWWWRemoteQueue_p.html?limit=50">50</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=100">100</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=250">250</a> |
<a href="IndexCreateWWWRemoteQueue_p.html?limit=500">500</a> entries.
</p>
<table border="0" cellpadding="2" cellspacing="1">
<colgroup>
<col width="60" span="2" />
<col width="10" />
<col width="80" />
<col width="180" />
<col />
<col width="10" />
</colgroup>
<tr class="TableHeader">
<th>Initiator</th>
<th>Profile</th>
<th>Depth</th>
<th>Modified Date</th>
<th>Anchor Name</th>
<th>URL</th>
<th>Delete</th>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[initiator]#</td>
<td>#[profile]#</td>
<td>#[depth]#</td>
<td>#[modified]#</td>
<td>#[anchor]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td><a href="IndexCreateWWWRemoteQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
</tr>
#{/list}#
</table>
#(/crawler-queue)#
#%env/templates/footer.template%#
</body>
</html>

@ -1,120 +0,0 @@
// IndexCreateWWWRemoteQueue_p.java
// -------------------------------
// part of the AnomicHTTPD caching proxy
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004, 2005
// last major change: 04.07.2005
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// You must compile this file with
// javac -classpath .:../classes IndexCreateWWWRemoteQueue_p.java
// if the shell's current path is HTROOT
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
public class IndexCreateWWWRemoteQueue_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final servletProperties prop = new servletProperties();
final Switchboard sb = (Switchboard)env;
int showLimit = 100;
if (post != null) {
showLimit = post.getInt("limit", 100);
if (post.containsKey("clearcrawlqueue")) {
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.REMOTE);
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
/*
int c = 0;
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
}
*/
prop.put("info", "3"); // crawling queue cleared
prop.putNum("info_numEntries", c);
} else if (post.containsKey("deleteEntry")) {
final String urlHash = post.get("deleteEntry");
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
prop.put("LOCATION","");
return prop;
}
}
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
if (stackSize == 0) {
prop.put("crawler-queue", "0");
} else {
prop.put("crawler-queue", "1");
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
Request urle;
boolean dark = true;
Seed initiator;
String profileHandle;
CrawlProfile profileEntry;
int i, showNum = 0;
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
urle = crawlerList.get(i);
if (urle != null && urle.url() != null) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
profileHandle = urle.profileHandle();
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
dark = !dark;
showNum++;
} else {
stackSize--;
}
}
prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
prop.putNum("crawler-queue_num", stackSize);//num Entries
prop.putNum("crawler-queue_list", showNum);
}
return prop;
}
}

@ -1,124 +0,0 @@
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segments;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.retrieval.Request;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class queues_p {
public static final String STATE_RUNNING = "running";
public static final String STATE_PAUSED = "paused";
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(final Date date) {
if (date == null) return "";
return dayFormatter.format(date);
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
//wikiCode wikiTransformer = new wikiCode(switchboard);
final serverObjects prop = new serverObjects();
Segment segment = null;
final boolean html = post != null && post.containsKey("html");
prop.setLocalized(html);
if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) {
segment = sb.indexSegments.segment(post.get("segment"));
}
if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
prop.put("rejected", "0");
//int showRejectedCount = 10;
Seed initiator;
// index size
prop.putNum("urlpublictextSize", segment.urlMetadata().size());
prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
// loader queue
prop.putNum("loaderSize", sb.crawlQueues.workerSize());
prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10));
if (sb.crawlQueues.workerSize() == 0) {
prop.put("list-loader", "0");
} else {
final Request[] w = sb.crawlQueues.activeWorkerEntries();
int count = 0;
for (final Request r : w) {
if (r == null) continue;
prop.put("list-loader_"+count+"_profile", r.profileHandle());
initiator = sb.peers.getConnected((r.initiator() == null) ? "" : ASCII.String(r.initiator()));
prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put("list-loader_"+count+"_depth", r.depth());
prop.putXML("list-loader_"+count+"_url", r.url().toString());
count++;
}
prop.put("list-loader", count);
}
//local crawl queue
prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount());
prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, Math.min(10, stackSize)));
//global crawl queue
prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
prop.put("limitCrawlState", STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
//remote crawl queue
prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
if (stackSize == 0) {
prop.put("list-remote", "0");
} else {
addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, Math.min(10, stackSize)));
}
//noload crawl queue
prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize());
prop.put("noloadCrawlState", STATE_RUNNING);
//stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
// return rewrite properties
return prop;
}
public static final void addNTable(final Switchboard sb, final serverObjects prop, final String tableName, final List<Request> crawlerList) {
int showNum = 0;
Seed initiator;
for (final Request urle : crawlerList) {
if ((urle != null) && (urle.url() != null)) {
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : UTF8.String(urle.initiator()));
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());
prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
prop.put(tableName + "_" + showNum + "_depth", urle.depth());
prop.put(tableName + "_" + showNum + "_modified", daydate(urle.appdate()));
prop.putXML(tableName + "_" + showNum + "_anchor", urle.name());
prop.putXML(tableName + "_" + showNum + "_url", urle.url().toNormalform(false, true));
prop.put(tableName + "_" + showNum + "_hash", urle.url().hash());
showNum++;
}
}
prop.put(tableName, showNum);
}
}

@ -1,71 +0,0 @@
<?xml version="1.0"?>
<queues>
<dbsize>
<urlpublictext>#[urlpublictextSize]#</urlpublictext>
<rwipublictext>#[rwipublictextSize]#</rwipublictext>
</dbsize>
<loaderqueue>
<size>#[loaderSize]#</size>
<max>#[loaderMax]#</max>
#{list-loader}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<url>#[url]#</url>
</entry>
#{/list-loader}#
</loaderqueue>
<localcrawlerqueue>
<size>#[localCrawlSize]#</size>
<state>#[localCrawlState]#</state>
#{list-local}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
<anchor>#[anchor]#</anchor>
<url>#[url]#</url>
<hash>#[hash]#</hash>
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
</entry>
#{/list-local}#
</localcrawlerqueue>
<limitcrawlerqueue>
<size>#[limitCrawlSize]#</size>
<state>#[limitCrawlState]#</state>
#{list-limit}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
<anchor>#[anchor]#</anchor>
<url>#[url]#</url>
<hash>#[hash]#</hash>
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
</entry>
#{/list-limit}#
</limitcrawlerqueue>
<remotecrawlerqueue>
<size>#[remoteCrawlSize]#</size>
<state>#[remoteCrawlState]#</state>
#{list-remote}#
<entry>
<profile>#[profile]#</profile>
<initiator>#[initiator]#</initiator>
<depth>#[depth]#</depth>
<modified>#[modified]#</modified>
<anchor>#[anchor]#</anchor>
<url>#[url]#</url>
<hash>#[hash]#</hash>
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
</entry>
#{/list-remote}#
</remotecrawlerqueue>
<noloadcrawlerqueue>
<size>#[noloadCrawlSize]#</size>
<state>#[noloadCrawlState]#</state>
</noloadcrawlerqueue>
</queues>

@@ -1,4 +1,29 @@
+// status_p
+// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 18.12.2006 on http://www.anomic.de
+// this file was created using the an implementation from IndexCreate_p.java, published 02.12.2004
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.io.ByteCount;
@@ -13,6 +38,8 @@ import de.anomic.server.serverSwitch;
 public class status_p {
+public static final String STATE_RUNNING = "running";
+public static final String STATE_PAUSED = "paused";
 public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
 // return variable that accumulates replacements
@@ -53,6 +80,30 @@ public class status_p {
 prop.put("trafficProxy", ByteCount.getAccountCount(ByteCount.PROXY));
 prop.put("trafficCrawler", ByteCount.getAccountCount(ByteCount.CRAWLER));
+// index size
+prop.putNum("urlpublictextSize", segment.urlMetadata().size());
+prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
+// loader queue
+prop.putNum("loaderSize", sb.crawlQueues.workerSize());
+prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10));
+//local crawl queue
+prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount());
+prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
+//global crawl queue
+prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
+prop.put("limitCrawlState", STATE_RUNNING);
+//remote crawl queue
+prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
+prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
+//noload crawl queue
+prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize());
+prop.put("noloadCrawlState", STATE_RUNNING);
 // return rewrite properties
 return prop;
 }

@@ -1,35 +1,52 @@
 <?xml version="1.0"?>
 <status>
 <ppm>#[ppm]#</ppm>
 <wordCacheSize>#[wordCacheSize]#</wordCacheSize>
 <wordCacheMaxSize>#[wordCacheMaxSize]#</wordCacheMaxSize>
+<memory>
+<free>#[freeMemory]#</free>
+<total>#[totalMemory]#</total>
+<max>#[maxMemory]#</max>
+</memory>
+<processors>#[processors]#</processors>
+<traffic>
+<in>#[trafficIn]#</in>
+<proxy>#[trafficProxy]#</proxy>
+<crawler>#[trafficCrawler]#</crawler>
+</traffic>
+<dbsize>
+<urlpublictext>#[urlpublictextSize]#</urlpublictext>
+<rwipublictext>#[rwipublictextSize]#</rwipublictext>
+</dbsize>
 <loaderqueue>
 <size>#[loaderSize]#</size>
 <max>#[loaderMax]#</max>
 </loaderqueue>
 <localcrawlerqueue>
 <size>#[localCrawlSize]#</size>
+<state>#[localCrawlState]#</state>
 </localcrawlerqueue>
 <limitcrawlerqueue>
 <size>#[limitCrawlSize]#</size>
+<state>#[limitCrawlState]#</state>
 </limitcrawlerqueue>
 <remotecrawlerqueue>
 <size>#[remoteCrawlSize]#</size>
+<state>#[remoteCrawlState]#</state>
 </remotecrawlerqueue>
 <noloadcrawlerqueue>
 <size>#[noloadCrawlSize]#</size>
+<state>#[noloadCrawlState]#</state>
 </noloadcrawlerqueue>
-<memory>
-<free>#[freeMemory]#</free>
-<total>#[totalMemory]#</total>
-<max>#[maxMemory]#</max>
-</memory>
-<processors>#[processors]#</processors>
-<traffic>
-<in>#[trafficIn]#</in>
-<proxy>#[trafficProxy]#</proxy>
-<crawler>#[trafficCrawler]#</crawler>
-</traffic>
 </status>

Binary file not shown (image, 932 B).

@@ -14,9 +14,10 @@
 <div class="SubMenugroup">
 <h3>Queues</h3>
 <ul class="SubMenu">
-<li><a href="/IndexCreateWWWLocalQueue_p.html" class="MenuItemLink lock">Local</a></li>
-<li><a href="/IndexCreateWWWGlobalQueue_p.html" class="MenuItemLink lock">Global</a></li>
-<li><a href="/IndexCreateWWWRemoteQueue_p.html" class="MenuItemLink lock">Remote</a></li>
+<li><a href="/IndexCreateQueues_p.html?stack=LOCAL" class="MenuItemLink lock">Local</a></li>
+<li><a href="/IndexCreateQueues_p.html?stack=GLOBAL" class="MenuItemLink lock">Global</a></li>
+<li><a href="/IndexCreateQueues_p.html?stack=REMOTE" class="MenuItemLink lock">Remote</a></li>
+<li><a href="/IndexCreateQueues_p.html?stack=NOLOAD" class="MenuItemLink lock">No-Load</a></li>
 </ul>
 </div>

@@ -5,12 +5,10 @@ WORDCACHEBAR_LENGTH=1/4;
 var statusRPC;
-var queuesRPC;
-var refreshInterval=5;
+var refreshInterval=3;
 var wait=0;
 var changing=false; //change the interval
 var statusLoaded=true;
-var queueLoaded=true;
 function initCrawler(){
 refresh();
@@ -38,21 +36,20 @@ function newInterval(){
 countInterval=window.setInterval("countdown()", 1000);
 changing=false;
 }
 function countdown(){
-if(statusLoaded && queueLoaded){
+if(statusLoaded){
+document.getElementById("nextUpdate").value=wait;
 wait--;
 if (wait == 0) {
 refresh();
 }
 }
 }
 function refresh(){
 wait=refreshInterval;
 statusLoaded=false;
-queueLoaded=false;
 requestStatus();
-requestQueues();
 }
 function requestStatus(){
@@ -61,13 +58,6 @@ function requestStatus(){
 statusRPC.onreadystatechange = handleStatus;
 statusRPC.send(null);
 }
-function requestQueues(){
-queuesRPC=createRequestObject();
-queuesRPC.open('get', '/api/queues_p.xml?html=');
-queuesRPC.onreadystatechange = handleQueues;
-queuesRPC.send(null);
-}
 function handleStatus(){
 if(statusRPC.readyState != 4){
@@ -118,65 +108,44 @@ function handleStatus(){
 img.setAttribute("src", BAR_IMG1);
 wordCacheSpan.appendChild(img);
 }
-statusLoaded=true;
-}
-function handleQueues(){
-if(queuesRPC.readyState != 4){
-return;
-}
-var queuesResponse = queuesRPC.responseXML;
-//xml=getFirstChild(queuesResponse);
-xml=getFirstChild(queuesResponse, "queues");
-if(queuesResponse != null){
-clearTable(document.getElementById("queueTable"), 1);
-dbsize=getFirstChild(xml, "dbsize");
+dbsize=getFirstChild(statusTag, "dbsize");
 urlpublictextSize=getValue(getFirstChild(dbsize, "urlpublictext"));
 rwipublictextSize=getValue(getFirstChild(dbsize, "rwipublictext"));
 document.getElementById("urldbsize").firstChild.nodeValue=urlpublictextSize;
 document.getElementById("rwidbsize").firstChild.nodeValue=rwipublictextSize;
-loaderqueue=getFirstChild(xml, "loaderqueue");
-updateTable(loaderqueue, "loader");
+loaderqueue=getFirstChild(statusTag, "loaderqueue");
 loaderqueue_size=getValue(getFirstChild(loaderqueue, "size"));
 loaderqueue_max=getValue(getFirstChild(loaderqueue, "max"));
 document.getElementById("loaderqueuesize").firstChild.nodeValue=loaderqueue_size;
 document.getElementById("loaderqueuemax").firstChild.nodeValue=loaderqueue_max;
-localcrawlerqueue=getFirstChild(xml, "localcrawlerqueue");
+localcrawlerqueue=getFirstChild(statusTag, "localcrawlerqueue");
 localcrawlerqueue_size=getValue(getFirstChild(localcrawlerqueue, "size"));
 localcrawlerqueue_state=getValue(getFirstChild(localcrawlerqueue, "state"));
 document.getElementById("localcrawlerqueuesize").firstChild.nodeValue=localcrawlerqueue_size;
 putQueueState("localcrawler", localcrawlerqueue_state);
-updateTable(localcrawlerqueue, "local crawler");
-limitcrawlerqueue=getFirstChild(xml, "limitcrawlerqueue");
-updateTable(limitcrawlerqueue, "limitCrawlerTable");
+limitcrawlerqueue=getFirstChild(statusTag, "limitcrawlerqueue");
 limitcrawlerqueue_size=getValue(getFirstChild(limitcrawlerqueue, "size"));
 limitcrawlerqueue_state=getValue(getFirstChild(limitcrawlerqueue, "state"));
 document.getElementById("limitcrawlerqueuesize").firstChild.nodeValue=limitcrawlerqueue_size;
 putQueueState("limitcrawler", limitcrawlerqueue_state);
-updateTable(limitcrawlerqueue, "limit crawler");
-remotecrawlerqueue=getFirstChild(xml, "remotecrawlerqueue");
-updateTable(remotecrawlerqueue, "remoteCrawlerTable");
+remotecrawlerqueue=getFirstChild(statusTag, "remotecrawlerqueue");
 remotecrawlerqueue_size=getValue(getFirstChild(remotecrawlerqueue, "size"));
 remotecrawlerqueue_state=getValue(getFirstChild(remotecrawlerqueue, "state"));
 document.getElementById("remotecrawlerqueuesize").firstChild.nodeValue=remotecrawlerqueue_size;
 putQueueState("remotecrawler", remotecrawlerqueue_state);
-updateTable(remotecrawlerqueue, "remote crawler");
-noloadcrawlerqueue=getFirstChild(xml, "noloadcrawlerqueue");
+noloadcrawlerqueue=getFirstChild(statusTag, "noloadcrawlerqueue");
 noloadcrawlerqueue_size=getValue(getFirstChild(noloadcrawlerqueue, "size"));
 noloadcrawlerqueue_state=getValue(getFirstChild(noloadcrawlerqueue, "state"));
 document.getElementById("noloadcrawlerqueuesize").firstChild.nodeValue=noloadcrawlerqueue_size;
 putQueueState("noloadcrawler", noloadcrawlerqueue_state);
-}
-queueLoaded=true;
+statusLoaded=true;
 }
 function putQueueState(queue, state) {
@@ -184,53 +153,17 @@ function putQueueState(queue, state) {
 img = document.getElementById(queue + "stateIMG");
 if (state == "paused") {
 a.href = "Crawler_p.html?continue=" + queue;
-a.title = "Continue this queue";
+a.title = "Continue this queue (" + state + ")";
 img.src = "/env/grafics/start.gif";
 img.alt = "Continue this queue";
 } else {
 a.href = "Crawler_p.html?pause=" + queue;
-a.title = "Pause this queue";
+a.title = "Pause this queue (" + state + ")";
 img.src = "/env/grafics/stop.gif";
 img.alt = "Pause this queue";
 }
 }
-function updateTable(indexingqueue, tablename){
-indexingTable=document.getElementById("queueTable");
-entries=indexingqueue.getElementsByTagName("entry");
-dark=false;
-for(i=0;i<entries.length;i++){
-profile=getValue(getFirstChild(entries[i], "profile"));
-initiator=getValue(getFirstChild(entries[i], "initiator"));
-depth=getValue(getFirstChild(entries[i], "depth"));
-modified=getValue(getFirstChild(entries[i], "modified"));
-anchor=getValue(getFirstChild(entries[i], "anchor"));
-url=getValue(getFirstChild(entries[i], "url"));
-size=getValue(getFirstChild(entries[i], "size"));
-hash=getValue(getFirstChild(entries[i], "hash"));
-inProcess=false;
-if(getValue(getFirstChild(entries[i], "inProcess"))=="true"){
-inProcess=true;
-}
-if (tablename=="indexingTable")
-deletebutton=createLinkCol("IndexCreateIndexingQueue_p.html?deleteEntry="+hash, DELETE_STRING);
-else
-deletebutton=createCol("");
-row=createIndexingRow(tablename, profile, initiator, depth, modified, anchor, url, size, deletebutton);
-//create row
-if(inProcess){
-row.setAttribute("class", "TableCellActive");
-}else if(dark){
-row.setAttribute("class", "TableCellDark");
-}else{
-row.setAttribute("class", "TableCellLight");
-}
-getFirstChild(indexingTable, "tbody").appendChild(row);
-dark=!dark;
-}
-}
 function shortenURL(url) {
 if (url.length > 80) {

@@ -60,7 +60,7 @@ public class urls {
         if (post.get("call", "").equals("remotecrawl")) {
             // perform a remote crawl url handover
-            final NoticedURL.StackType stackType = NoticedURL.StackType.LIMIT;
+            final NoticedURL.StackType stackType = NoticedURL.StackType.GLOBAL;
             int maxCount = Math.min(100, post.getInt("count", 10));
             final long maxTime = Math.min(20000, Math.max(1000, post.getInt("time", 10000)));
             final long timeout = System.currentTimeMillis() + maxTime;
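For orientation, this is the effect of the parameter clamping the hunk leaves in place: the requesting peer may ask for a URL count and a time budget, but the servlet caps them at 100 URLs and a 1-20 second window. A minimal, self-contained sketch (the class and method names are made up for illustration; only the min/max expressions come from the code above):

    // Sketch of the remote-crawl handover bounds. clampCount/clampTime are
    // hypothetical helpers; the expressions mirror the servlet code above.
    public class HandoverBounds {
        static int clampCount(int requested) {
            return Math.min(100, requested);                          // at most 100 URLs per handover
        }
        static long clampTime(int requestedMillis) {
            return Math.min(20000, Math.max(1000, requestedMillis));  // 1s <= budget <= 20s
        }
        public static void main(String[] args) {
            System.out.println(clampCount(500));   // -> 100
            System.out.println(clampTime(50));     // -> 1000
            System.out.println(clampTime(60000));  // -> 20000
        }
    }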

@@ -29,22 +29,20 @@ package de.anomic.crawler;
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.ConcurrentMap;
 
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.order.CloneableIterator;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
+import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.index.BufferedObjectIndex;
 import net.yacy.kelondro.index.HandleSet;
@@ -53,7 +51,6 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.table.Table;
-import net.yacy.kelondro.util.ByteBuffer;
 import net.yacy.kelondro.util.MemoryControl;
 import de.anomic.crawler.retrieval.Request;
 import de.anomic.http.client.Cache;
@@ -74,9 +71,6 @@ public class Balancer {
     // class variables computed during operation
     private final ConcurrentMap<String, HandleSet> domainStacks; // a map from host name to lists with url hashs
-    private final ConcurrentLinkedQueue<byte[]> top; // a list of url-hashes that shall be taken next
-    private final SortedMap<Long, byte[]> delayed;
-    private final HandleSet ddc;
     private final HandleSet double_push_check; // for debugging
     private long lastDomainStackFill;
     private int domStackInitSize;
@@ -91,13 +85,10 @@ public class Balancer {
             final boolean exceed134217727) {
         this.cacheStacksPath = cachePath;
         this.domainStacks = new ConcurrentHashMap<String, HandleSet>();
-        this.top = new ConcurrentLinkedQueue<byte[]>();
-        this.delayed = new TreeMap<Long, byte[]>();
         this.minimumLocalDelta = minimumLocalDelta;
         this.minimumGlobalDelta = minimumGlobalDelta;
         this.myAgentIDs = myAgentIDs;
         this.domStackInitSize = Integer.MAX_VALUE;
-        this.ddc = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
         this.double_push_check = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
 
         // create a stack for newly entered entries
@@ -145,12 +136,7 @@ public class Balancer {
             Log.logException(e);
         }
         this.domainStacks.clear();
-        this.top.clear();
-        synchronized (this.delayed) {
-            this.delayed.clear();
-        }
         this.double_push_check.clear();
-        this.ddc.clear();
     }
 
     public Request get(final byte[] urlhash) throws IOException {
@@ -202,28 +188,11 @@ public class Balancer {
             if (entry != null) removedCounter++;
 
             // remove from double-check caches
-            this.ddc.remove(urlhash);
             this.double_push_check.remove(urlhash);
         }
         if (removedCounter == 0) return 0;
         assert this.urlFileIndex.size() + removedCounter == s : "urlFileIndex.size() = " + this.urlFileIndex.size() + ", s = " + s;
 
-        // iterate through the top list
-        final Iterator<byte[]> j = this.top.iterator();
-        byte[] urlhash;
-        while (j.hasNext()) {
-            urlhash = j.next();
-            if (urlHashes.has(urlhash)) j.remove();
-        }
-
-        // remove from delayed
-        synchronized (this.delayed) {
-            final Iterator<Map.Entry<Long, byte[]>> k = this.delayed.entrySet().iterator();
-            while (k.hasNext()) {
-                if (urlHashes.has(k.next().getValue())) k.remove();
-            }
-        }
-
         // iterate through the domain stacks
         final Iterator<Map.Entry<String, HandleSet>> q = this.domainStacks.entrySet().iterator();
         HandleSet stack;
@@ -237,7 +206,7 @@ public class Balancer {
     }
 
     public boolean has(final byte[] urlhashb) {
-        return this.urlFileIndex.has(urlhashb) || this.ddc.has(urlhashb);
+        return this.urlFileIndex.has(urlhashb) || this.double_push_check.has(urlhashb);
     }
 
     public boolean notEmpty() {
@@ -277,7 +246,6 @@ public class Balancer {
         synchronized (this) {
             // double-check
             if (this.double_push_check.has(hash)) return "double occurrence in double_push_check";
-            if (this.ddc.has(hash)) return "double occurrence in ddc";
             if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex";
 
             if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear();
@@ -297,12 +265,12 @@ public class Balancer {
     /**
      * get a list of domains that are currently maintained as domain stacks
-     * @return a map of clear text strings of host names to the size of the domain stack
+     * @return a map of clear text strings of host names to an integer array: {the size of the domain stack, guessed delta waiting time}
      */
-    public Map<String, Integer> getDomainStackHosts() {
-        Map<String, Integer> map = new HashMap<String, Integer>();
+    public Map<String, Integer[]> getDomainStackHosts() {
+        Map<String, Integer[]> map = new TreeMap<String, Integer[]>(); // we use a tree map to get a stable ordering
         for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
-            map.put(entry.getKey(), entry.getValue().size());
+            map.put(entry.getKey(), new Integer[]{entry.getValue().size(), (int) Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta)});
         }
         return map;
     }
@@ -315,17 +283,17 @@ public class Balancer {
      */
     public long getDomainSleepTime(final CrawlSwitchboard cs, Request crawlEntry) {
         final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
-        return getDomainSleepTime(cs, profileEntry, crawlEntry);
+        return getDomainSleepTime(cs, profileEntry, crawlEntry.url());
     }
 
-    private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, Request crawlEntry) {
+    private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, final DigestURI crawlURL) {
         if (profileEntry == null) {
             return 0;
         }
         long sleeptime = (
             profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
-            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
-            ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
+            (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlURL.hash()))
+            ) ? 0 : Latency.waitingRemaining(crawlURL, this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
         return sleeptime;
     }
@@ -388,21 +356,6 @@ public class Balancer {
         if (domainList.isEmpty()) this.domainStacks.remove(host);
     }
 
-    private byte[] nextFromDelayed() {
-        if (this.delayed.isEmpty()) return null;
-        final Long first = this.delayed.firstKey();
-        if (first.longValue() < System.currentTimeMillis()) {
-            return this.delayed.remove(first);
-        }
-        return null;
-    }
-
-    private byte[] anyFromDelayed() {
-        if (this.delayed.isEmpty()) return null;
-        final Long first = this.delayed.firstKey();
-        return this.delayed.remove(first);
-    }
-
     /**
      * get the next entry in this crawl queue in such a way that the domain access time delta is maximized
      * and always above the given minimum delay time. An additional delay time is computed using the robots.txt
@@ -418,41 +371,13 @@ public class Balancer {
     public Request pop(final boolean delay, final CrawlSwitchboard cs) throws IOException {
         // returns a crawl entry from the stack and ensures minimum delta times
 
-        try {
-            filltop(delay, -600000, false);
-            filltop(delay, -60000, false);
-            filltop(delay, -10000, false);
-            filltop(delay, -6000, false);
-            filltop(delay, -4000, false);
-            filltop(delay, -3000, false);
-            filltop(delay, -2000, false);
-            filltop(delay, -1000, false);
-            filltop(delay, -500, false);
-            filltop(delay, 0, true);
-            filltop(delay, 500, true);
-            filltop(delay, 1000, true);
-            filltop(delay, 2000, true);
-            filltop(delay, 3000, true);
-            filltop(delay, 4000, true);
-            filltop(delay, 6000, true);
-            filltop(delay, Long.MAX_VALUE, true);
-        } catch (final RowSpaceExceededException e) {}
-
         long sleeptime = 0;
         Request crawlEntry = null;
         synchronized (this) {
             byte[] failhash = null;
             while (!this.urlFileIndex.isEmpty()) {
-                // first simply take one of the entries in the top list, that should be one without any delay
-                byte[] nexthash = nextFromDelayed();
-                //System.out.println("*** nextFromDelayed=" + nexthash);
-                if (nexthash == null && !this.top.isEmpty()) {
-                    nexthash = this.top.remove();
-                    //System.out.println("*** top.remove()=" + nexthash);
-                }
-                if (nexthash == null) {
-                    nexthash = anyFromDelayed();
-                }
+                byte[] nexthash = getbest();
+                if (nexthash == null) return null;
 
                 // check minimumDelta and if necessary force a sleep
                 //final int s = urlFileIndex.size();
@@ -485,37 +410,14 @@ public class Balancer {
                     return null;
                 }
 
                 // depending on the caching policy we need sleep time to avoid DoS-like situations
-                sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry);
+                sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry.url());
 
                 assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
                 assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
 
                 if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops
-
-                if (delay && sleeptime > 0 && this.domStackInitSize > 1) {
-                    //System.out.println("*** putback: nexthash=" + nexthash + ", failhash="+failhash);
-                    // put that thing back to omit a delay here
-                    if (!ByteBuffer.contains(this.delayed.values(), nexthash)) {
-                        //System.out.println("*** delayed +=" + nexthash);
-                        this.delayed.put(Long.valueOf(System.currentTimeMillis() + sleeptime + 1), nexthash);
-                    }
-                    try {
-                        this.urlFileIndex.put(rowEntry);
-                        String host = crawlEntry.url().getHost();
-                        if (host == null) host = localhost;
-                        this.domainStacks.remove(host);
-                        failhash = nexthash;
-                    } catch (final RowSpaceExceededException e) {
-                        Log.logException(e);
-                    }
-                    continue;
-                }
                 break;
             }
-            if (crawlEntry != null) {
-                if (this.ddc.size() > 10000 || MemoryControl.shortStatus()) this.ddc.clear();
-                try { this.ddc.put(crawlEntry.url().hash()); } catch (final RowSpaceExceededException e) {}
-            }
         }
         if (crawlEntry == null) return null;
@@ -524,7 +426,7 @@ public class Balancer {
             // in best case, this should never happen if the balancer works propertly
             // this is only to protection against the worst case, where the crawler could
             // behave in a DoS-manner
-            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", top.size() = " + this.top.size() + ", delayed.size() = " + this.delayed.size() + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
+            Log.logInfo("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);
             long loops = sleeptime / 1000;
             long rest = sleeptime % 1000;
             if (loops < 3) {
@@ -537,15 +439,11 @@ public class Balancer {
                 try {this.wait(1000); } catch (final InterruptedException e) {}
             }
         }
-        this.ddc.remove(crawlEntry.url().hash());
         Latency.update(crawlEntry.url());
         return crawlEntry;
     }
-    private void filltop(final boolean delay, final long maximumwaiting, final boolean acceptonebest) throws RowSpaceExceededException {
-        if (!this.top.isEmpty()) return;
-
-        //System.out.println("*** DEBUG started filltop delay=" + ((delay) ? "true":"false") + ", maximumwaiting=" + maximumwaiting + ", acceptonebest=" + ((acceptonebest) ? "true":"false"));
+    private byte[] getbest() {
 
         // check if we need to get entries from the file index
         try {
@@ -560,6 +458,7 @@ public class Balancer {
         long smallestWaiting = Long.MAX_VALUE;
         byte[] besturlhash = null;
         String besthost = null;
+        Map<String, byte[]> zeroWaitingCandidates = new HashMap<String, byte[]>();
         while (i.hasNext()) {
             entry = i.next();
@@ -571,34 +470,52 @@ public class Balancer {
             final byte[] n = entry.getValue().removeOne();
             if (n == null) continue;
-            if (delay) {
-                final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
-                if (w > maximumwaiting) {
-                    if (w < smallestWaiting) {
-                        smallestWaiting = w;
-                        besturlhash = n;
-                        besthost = entry.getKey();
-                    }
-                    entry.getValue().put(n); // put entry back
-                    continue;
-                }
-            }
-            this.top.add(n);
-            if (entry.getValue().isEmpty()) i.remove();
+            final long w = Latency.waitingRemainingGuessed(entry.getKey(), this.minimumLocalDelta, this.minimumGlobalDelta);
+            if (w < smallestWaiting) {
+                smallestWaiting = w;
+                besturlhash = n;
+                besthost = entry.getKey();
+                if (w <= 0) {
+                    zeroWaitingCandidates.put(besthost, besturlhash);
+                }
+            }
+            try {
+                entry.getValue().put(n); // put entry back, we are checking only
+            } catch (RowSpaceExceededException e) {
+                e.printStackTrace();
+            }
         }
+        if (besturlhash == null) return null; // worst case
+
+        // best case would be, if we have some zeroWaitingCandidates,
+        // then we select that one with the largest stack
+        if (zeroWaitingCandidates.size() > 0) {
+            int largestStack = -1;
+            String largestStackHost = null;
+            byte[] largestStackHash = null;
+            for (Map.Entry<String, byte[]> z: zeroWaitingCandidates.entrySet()) {
+                HandleSet hs = this.domainStacks.get(z.getKey());
+                if (hs == null || hs.size() <= largestStack) continue;
+                largestStack = hs.size();
+                largestStackHost = z.getKey();
+                largestStackHash = z.getValue();
+            }
+            if (largestStackHost != null && largestStackHash != null) {
+                removeHashFromDomainStacks(largestStackHost, largestStackHash);
+                //Log.logInfo("Balancer", "*** picked one from largest stack");
+                return largestStackHash;
+            }
+        }
 
-        // if we could not find any entry, then take the best we have seen so far
-        if (acceptonebest && !this.top.isEmpty() && besturlhash != null) {
-            removeHashFromDomainStacks(besthost, besturlhash);
-            this.top.add(besturlhash);
-        }
+        // default case: just take that one with least waiting
+        removeHashFromDomainStacks(besthost, besturlhash);
+        return besturlhash;
     }
     private void fillDomainStacks() throws IOException {
-        if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 120000L) return;
+        if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return;
         this.domainStacks.clear();
-        this.top.clear();
         this.lastDomainStackFill = System.currentTimeMillis();
         final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2);
         final CloneableIterator<byte[]> i = handles.keys(true, null);
@@ -621,51 +538,6 @@ public class Balancer {
         this.domStackInitSize = this.domainStacks.size();
     }
-    public List<Request> top(int count) {
-        final List<Request> cel = new ArrayList<Request>();
-        if (count == 0) return cel;
-        byte[][] ta = new byte[Math.min(count, this.top.size())][];
-        ta = this.top.toArray(ta);
-        for (final byte[] n: ta) {
-            if (n == null) break;
-            try {
-                final Row.Entry rowEntry = this.urlFileIndex.get(n, false);
-                if (rowEntry == null) continue;
-                final Request crawlEntry = new Request(rowEntry);
-                cel.add(crawlEntry);
-                count--;
-                if (count <= 0) break;
-            } catch (final IOException e) {}
-        }
-
-        int depth = 0;
-        loop: while (count > 0) {
-            // iterate over the domain stacks
-            final int celsize = cel.size();
-            ll: for (final HandleSet list: this.domainStacks.values()) {
-                if (list.size() <= depth) continue ll;
-                final byte[] n = list.getOne(depth);
-                if (n == null) continue ll;
-                try {
-                    final Row.Entry rowEntry = this.urlFileIndex.get(n, false);
-                    if (rowEntry == null) continue;
-                    final Request crawlEntry = new Request(rowEntry);
-                    cel.add(crawlEntry);
-                    count--;
-                    if (count <= 0) break loop;
-                } catch (final IOException e) {}
-            }
-            if (cel.size() == celsize) break loop;
-            depth++;
-        }
-
-        if (cel.size() < count) try {
-            final List<Row.Entry> list = this.urlFileIndex.top(count - cel.size());
-            for (final Row.Entry entry: list) cel.add(new Request(entry));
-        } catch (final IOException e) { }
-        return cel;
-    }
     public Iterator<Request> iterator() throws IOException {
         return new EntryIterator();
     }
@@ -678,10 +550,12 @@ public class Balancer {
             this.rowIterator = Balancer.this.urlFileIndex.rows();
         }
 
+        @Override
         public boolean hasNext() {
             return (this.rowIterator == null) ? false : this.rowIterator.hasNext();
         }
 
+        @Override
         public Request next() {
             final Row.Entry entry = this.rowIterator.next();
             try {
@@ -693,6 +567,7 @@ public class Balancer {
             }
         }
 
+        @Override
         public void remove() {
             if (this.rowIterator != null) this.rowIterator.remove();
         }
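The replacement of filltop() with getbest() is the heart of this commit: instead of pre-filling a shared top list in many waiting-time passes, the balancer scans the domain stacks once, remembers the host with the smallest guessed waiting time, collects every host with zero remaining waiting as a candidate, and among those drains the largest stack first. A self-contained sketch of that selection policy, using plain collections in place of YaCy's HandleSet and Latency types (an assumption made for readability, not the Balancer's actual API):

    import java.util.ArrayDeque;
    import java.util.Arrays;
    import java.util.Deque;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.TreeMap;

    public class BestHostSketch {

        // Stand-in for Latency.waitingRemainingGuessed(host, ...): remaining
        // politeness delay per host in milliseconds (0 or less = may load now).
        static long guessedWaiting(Map<String, Long> waiting, String host) {
            Long w = waiting.get(host);
            return w == null ? 0L : w.longValue();
        }

        static String pickBestHost(Map<String, Deque<String>> domainStacks, Map<String, Long> waiting) {
            long smallestWaiting = Long.MAX_VALUE;
            String besthost = null;
            Map<String, Integer> zeroWaitingCandidates = new HashMap<String, Integer>();
            for (Map.Entry<String, Deque<String>> e : domainStacks.entrySet()) {
                long w = guessedWaiting(waiting, e.getKey());
                if (w < smallestWaiting) {
                    smallestWaiting = w;
                    besthost = e.getKey();
                }
                if (w <= 0) zeroWaitingCandidates.put(e.getKey(), e.getValue().size());
            }
            if (besthost == null) return null; // worst case: nothing stacked at all
            // best case: among hosts without remaining delay, drain the largest stack first
            String largestHost = null;
            int largestStack = -1;
            for (Map.Entry<String, Integer> z : zeroWaitingCandidates.entrySet()) {
                if (z.getValue() > largestStack) {
                    largestStack = z.getValue();
                    largestHost = z.getKey();
                }
            }
            if (largestHost != null) return largestHost;
            // default case: the host with the least remaining waiting time
            return besthost;
        }

        public static void main(String[] args) {
            Map<String, Deque<String>> stacks = new TreeMap<String, Deque<String>>();
            stacks.put("a.example", new ArrayDeque<String>(Arrays.asList("u1", "u2")));
            stacks.put("b.example", new ArrayDeque<String>(Arrays.asList("u3", "u4", "u5")));
            Map<String, Long> waiting = new HashMap<String, Long>();
            waiting.put("a.example", 0L);  // no delay pending
            waiting.put("b.example", 0L);  // no delay pending, but bigger stack
            System.out.println(pickBestHost(stacks, waiting)); // -> b.example
        }
    }

Draining the largest zero-waiting stack first is what lets the balancer react to the current situation rather than to a pre-selected order, as the commit message describes.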

@@ -215,7 +215,7 @@ public class CrawlQueues {
     }
 
     public int coreCrawlJobSize() {
-        return this.noticeURL.stackSize(NoticedURL.StackType.CORE) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
+        return this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
     }
 
     public boolean coreCrawlJob() {
@@ -226,14 +226,14 @@ public class CrawlQueues {
             // move some tasks to the core crawl job so we have something to do
             final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance
             for (int i = 0; i < toshift; i++) {
-                this.noticeURL.shift(NoticedURL.StackType.LIMIT, NoticedURL.StackType.CORE, this.sb.crawler);
+                this.noticeURL.shift(NoticedURL.StackType.GLOBAL, NoticedURL.StackType.LOCAL, this.sb.crawler);
             }
             this.log.logInfo("shifted " + toshift + " jobs from global crawl to local crawl (coreCrawlJobSize()=" + coreCrawlJobSize() +
                 ", limitCrawlJobSize()=" + limitCrawlJobSize() + ", cluster.mode=" + this.sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "") +
                 ", robinsonMode=" + ((this.sb.isRobinsonMode()) ? "on" : "off"));
         }
 
-        final String queueCheckCore = loadIsPossible(NoticedURL.StackType.CORE);
+        final String queueCheckCore = loadIsPossible(NoticedURL.StackType.LOCAL);
         final String queueCheckNoload = loadIsPossible(NoticedURL.StackType.NOLOAD);
         if (queueCheckCore != null && queueCheckNoload != null) {
             if (this.log.isFine()) {
@@ -251,11 +251,11 @@ public class CrawlQueues {
         // do a local crawl
         Request urlEntry;
-        while (this.noticeURL.stackSize(NoticedURL.StackType.CORE) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
+        while (this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) > 0 || this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
             final String stats = "LOCALCRAWL[" +
                 this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " +
-                this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " +
-                this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " +
+                this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " +
+                this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " +
                 this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) +
                 ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
             try {
@@ -284,7 +284,7 @@ public class CrawlQueues {
                     return true;
                 }
 
-                urlEntry = this.noticeURL.pop(NoticedURL.StackType.CORE, true, this.sb.crawler);
+                urlEntry = this.noticeURL.pop(NoticedURL.StackType.LOCAL, true, this.sb.crawler);
                 if (urlEntry == null) {
                     continue;
                 }
@@ -300,7 +300,7 @@ public class CrawlQueues {
             } catch (final IOException e) {
                 this.log.logSevere(stats + ": CANNOT FETCH ENTRY: " + e.getMessage(), e);
                 if (e.getMessage().indexOf("hash is null",0) > 0) {
-                    this.noticeURL.clear(NoticedURL.StackType.CORE);
+                    this.noticeURL.clear(NoticedURL.StackType.LOCAL);
                 }
             }
         }
@@ -547,7 +547,7 @@ public class CrawlQueues {
     }
 
     public int limitCrawlJobSize() {
-        return this.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
+        return this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL);
     }
 
     public int noloadCrawlJobSize() {
@@ -579,7 +579,7 @@ public class CrawlQueues {
         }
 
         // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
-        final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.CORE) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LIMIT) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
+        final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
             + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
         try {
             final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler);
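CrawlQueues itself only renames the stack constants; the top-up logic stays as it was: when the local stack runs dry, at most 10 entries per round are shifted from the GLOBAL (remote-crawl candidate) stack to the LOCAL one. A toy illustration with plain deques standing in for the balancer stacks (not YaCy's API):

    import java.util.ArrayDeque;
    import java.util.Deque;

    // Sketch of the GLOBAL -> LOCAL top-up in coreCrawlJob().
    public class ShiftSketch {
        public static void main(String[] args) {
            Deque<String> global = new ArrayDeque<String>();
            Deque<String> local = new ArrayDeque<String>();
            for (int i = 0; i < 25; i++) global.add("url" + i);
            // shift at most 10 jobs per round, mirroring Math.min(10, limitCrawlJobSize())
            int toshift = Math.min(10, global.size());
            for (int i = 0; i < toshift; i++) local.add(global.poll());
            System.out.println("local=" + local.size() + ", global=" + global.size()); // local=10, global=15
        }
    }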

@@ -370,14 +370,14 @@ public final class CrawlStacker {
             // it may be possible that global == true and local == true, so do not check an error case against it
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry);
         } else if (local) {
             if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
         } else if (proxy) {
             if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
+            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry);
         } else if (remote) {
             warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
         }
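The stacker's routing precedence is unchanged by the rename: a global entry goes to the GLOBAL stack, local and proxy entries go to the LOCAL stack, and remote entries go to the REMOTE stack, checked in that order. A compact sketch of that decision (the enum and method are illustrative stand-ins, only the precedence mirrors the code above):

    // Sketch of CrawlStacker's push routing after the rename.
    public class RouteSketch {
        enum Stack { GLOBAL, LOCAL, REMOTE }
        static Stack route(boolean global, boolean local, boolean proxy, boolean remote) {
            if (global) return Stack.GLOBAL;  // remote-crawl candidates for other peers
            if (local)  return Stack.LOCAL;   // own crawl jobs
            if (proxy)  return Stack.LOCAL;   // proxy hits are crawled locally too
            if (remote) return Stack.REMOTE;  // jobs received from other peers
            return null;
        }
        public static void main(String[] args) {
            System.out.println(route(true, true, false, false)); // GLOBAL (global wins)
        }
    }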

@@ -146,7 +146,7 @@ public class Latency {
         // return time that is remaining
         //System.out.println("Latency: " + (waiting - timeSinceLastAccess));
-        return waiting - timeSinceLastAccess;
+        return Math.max(0, waiting - timeSinceLastAccess);
     }
 
     /**
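The Math.max(0, ...) clamp keeps waitingRemaining() from going negative when the last access lies further in the past than the required delta, which matters now that the balancer compares these guessed waiting times directly. A tiny worked example:

    // Sketch of the clamp added to Latency.waitingRemaining(): a host accessed
    // 1500 ms ago with a 500 ms politeness delta now yields 0 instead of -1000.
    public class WaitingClamp {
        static long waitingRemaining(long waiting, long timeSinceLastAccess) {
            return Math.max(0, waiting - timeSinceLastAccess); // never negative
        }
        public static void main(String[] args) {
            System.out.println(waitingRemaining(500, 1500)); // -> 0 (was -1000 before the fix)
            System.out.println(waitingRemaining(500, 100));  // -> 400
        }
    }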

@@ -44,7 +44,7 @@ import de.anomic.crawler.retrieval.Request;
 public class NoticedURL {
 
     public enum StackType {
-        NULL, CORE, LIMIT, OVERHANG, REMOTE, NOLOAD, IMAGE, MOVIE, MUSIC;
+        LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
     }
 
     public static final long minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
@@ -146,8 +146,8 @@ public class NoticedURL {
     public int stackSize(final StackType stackType) {
         switch (stackType) {
             case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size();
-            case CORE: return (this.coreStack == null) ? 0 : this.coreStack.size();
-            case LIMIT: return (this.limitStack == null) ? 0 : this.limitStack.size();
+            case LOCAL: return (this.coreStack == null) ? 0 : this.coreStack.size();
+            case GLOBAL: return (this.limitStack == null) ? 0 : this.limitStack.size();
             case OVERHANG: return 0;
             case REMOTE: return (this.remoteStack == null) ? 0 : this.remoteStack.size();
             default: return -1;
@@ -172,9 +172,9 @@ public class NoticedURL {
     public String push(final StackType stackType, final Request entry) {
         try {
             switch (stackType) {
-                case CORE:
+                case LOCAL:
                     return this.coreStack.push(entry);
-                case LIMIT:
+                case GLOBAL:
                     return this.limitStack.push(entry);
                 case REMOTE:
                     return this.remoteStack.push(entry);
@@ -233,10 +233,10 @@ public class NoticedURL {
      * get a list of domains that are currently maintained as domain stacks
      * @return a map of clear text strings of host names to the size of the domain stacks
      */
-    public Map<String, Integer> getDomainStackHosts(final StackType stackType) {
+    public Map<String, Integer[]> getDomainStackHosts(final StackType stackType) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainStackHosts();
-            case LIMIT: return this.limitStack.getDomainStackHosts();
+            case LOCAL: return this.coreStack.getDomainStackHosts();
+            case GLOBAL: return this.limitStack.getDomainStackHosts();
             case REMOTE: return this.remoteStack.getDomainStackHosts();
             case NOLOAD: return this.noloadStack.getDomainStackHosts();
             default: return null;
@@ -249,8 +249,8 @@ public class NoticedURL {
      */
     public long getDomainSleepTime(final StackType stackType, final CrawlSwitchboard cs, Request crawlEntry) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
-            case LIMIT: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
+            case LOCAL: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
+            case GLOBAL: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
             case REMOTE: return this.remoteStack.getDomainSleepTime(cs, crawlEntry);
             case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, crawlEntry);
             default: return 0;
@@ -265,28 +265,18 @@ public class NoticedURL {
      */
     public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
         switch (stackType) {
-            case CORE: return this.coreStack.getDomainStackReferences(host, maxcount);
-            case LIMIT: return this.limitStack.getDomainStackReferences(host, maxcount);
+            case LOCAL: return this.coreStack.getDomainStackReferences(host, maxcount);
+            case GLOBAL: return this.limitStack.getDomainStackReferences(host, maxcount);
             case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
             case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
             default: return null;
         }
     }
 
-    public List<Request> top(final StackType stackType, final int count) {
-        switch (stackType) {
-            case CORE: return top(this.coreStack, count);
-            case LIMIT: return top(this.limitStack, count);
-            case REMOTE: return top(this.remoteStack, count);
-            case NOLOAD: return top(this.noloadStack, count);
-            default: return null;
-        }
-    }
-
     public Request pop(final StackType stackType, final boolean delay, final CrawlSwitchboard cs) throws IOException {
         switch (stackType) {
-            case CORE: return pop(this.coreStack, delay, cs);
-            case LIMIT: return pop(this.limitStack, delay, cs);
+            case LOCAL: return pop(this.coreStack, delay, cs);
+            case GLOBAL: return pop(this.limitStack, delay, cs);
             case REMOTE: return pop(this.remoteStack, delay, cs);
             case NOLOAD: return pop(this.noloadStack, false, cs);
             default: return null;
@@ -310,8 +300,8 @@ public class NoticedURL {
     public void clear(final StackType stackType) {
         Log.logInfo("NoticedURL", "CLEARING STACK " + stackType);
         switch (stackType) {
-            case CORE: this.coreStack.clear(); break;
-            case LIMIT: this.limitStack.clear(); break;
+            case LOCAL: this.coreStack.clear(); break;
+            case GLOBAL: this.limitStack.clear(); break;
             case REMOTE: this.remoteStack.clear(); break;
             case NOLOAD: this.noloadStack.clear(); break;
             default: return;
@@ -340,17 +330,11 @@ public class NoticedURL {
         return null;
     }
 
-    private static List<Request> top(final Balancer balancer, int count) {
-        // this is a filo - top
-        if (count > balancer.size()) count = balancer.size();
-        return balancer.top(count);
-    }
-
     public Iterator<Request> iterator(final StackType stackType) {
         // returns an iterator of plasmaCrawlBalancerEntry Objects
         try {switch (stackType) {
-            case CORE: return this.coreStack.iterator();
-            case LIMIT: return this.limitStack.iterator();
+            case LOCAL: return this.coreStack.iterator();
+            case GLOBAL: return this.limitStack.iterator();
             case REMOTE: return this.remoteStack.iterator();
             case NOLOAD: return this.noloadStack.iterator();
             default: return null;
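The StackType enum drops the unused NULL, IMAGE, MOVIE and MUSIC constants, and CORE/LIMIT become LOCAL/GLOBAL while the backing fields keep their old coreStack/limitStack names. A minimal sketch of the resulting dispatch (the helper method is hypothetical; the constant-to-field mapping comes from the diff):

    // Sketch of the renamed dispatch: enum constants changed, field names did not.
    public class StackDispatch {
        enum StackType { LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD }
        static String backingStackName(StackType t) {
            switch (t) {
                case LOCAL:  return "coreStack";   // old constant: CORE
                case GLOBAL: return "limitStack";  // old constant: LIMIT
                case REMOTE: return "remoteStack";
                case NOLOAD: return "noloadStack";
                default:     return null;          // OVERHANG has no backing stack
            }
        }
        public static void main(String[] args) {
            System.out.println(backingStackName(StackType.GLOBAL)); // limitStack
        }
    }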

@@ -40,6 +40,7 @@ import java.io.UnsupportedEncodingException;
 import java.util.HashMap;
 import java.util.Map;
 
+import net.yacy.cora.document.ASCII;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.kelondro.blob.ArrayStack;
 import net.yacy.kelondro.blob.Compressor;
@@ -172,26 +173,30 @@ public final class Cache {
      * @return true if the content of the url is in the cache, false otherwise
      */
     public static boolean has(final DigestURI url) {
+        return has(url.hash());
+    }
+
+    public static boolean has(final byte[] urlhash) {
         boolean headerExists;
         boolean fileExists;
         //synchronized (responseHeaderDB) {
-            headerExists = responseHeaderDB.containsKey(url.hash());
-            fileExists = fileDB.containsKey(url.hash());
+            headerExists = responseHeaderDB.containsKey(urlhash);
+            fileExists = fileDB.containsKey(urlhash);
         //}
         if (headerExists && fileExists) return true;
         if (!headerExists && !fileExists) return false;
         // if not both is there then we do a clean-up
         if (headerExists) try {
-            log.logWarning("header but not content of url " + url.toString() + " in cache; cleaned up");
+            log.logWarning("header but not content of urlhash " + ASCII.String(urlhash) + " in cache; cleaned up");
             if (responseHeaderDB instanceof MapHeap) {
-                ((MapHeap) responseHeaderDB).delete(url.hash());
+                ((MapHeap) responseHeaderDB).delete(urlhash);
             } else {
-                responseHeaderDB.remove(url.hash());
+                responseHeaderDB.remove(urlhash);
            }
         } catch (final IOException e) {}
         if (fileExists) try {
             //log.logWarning("content but not header of url " + url.toString() + " in cache; cleaned up");
-            fileDB.delete(url.hash());
+            fileDB.delete(urlhash);
         } catch (final IOException e) {}
         return false;
     }
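Cache.has() is split so that callers holding only a URL hash, like the balancer's CacheStrategy.IFEXIST check above, avoid constructing a DigestURI first. A self-contained sketch of the delegation pattern; the mock store and hash function are assumptions, not YaCy's implementation:

    import java.nio.charset.StandardCharsets;
    import java.util.HashSet;
    import java.util.Set;

    // Sketch: the URL-based check delegates to a hash-based one, mirroring the
    // new has(byte[]) overload; store/hash are illustrative stand-ins.
    public class CacheHasSketch {
        static final Set<String> store = new HashSet<String>();

        static byte[] hash(String url) {  // stand-in for DigestURI.hash()
            return Integer.toHexString(url.hashCode()).getBytes(StandardCharsets.US_ASCII);
        }
        static boolean has(byte[] urlhash) {  // new primitive check
            return store.contains(new String(urlhash, StandardCharsets.US_ASCII));
        }
        static boolean hasUrl(String url) {   // old entry point, now delegates
            return has(hash(url));
        }

        public static void main(String[] args) {
            store.add(new String(hash("http://example.org/"), StandardCharsets.US_ASCII));
            System.out.println(hasUrl("http://example.org/"));    // true
            System.out.println(has(hash("http://example.net/"))); // false
        }
    }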

@@ -152,6 +152,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         // the root value here will not be used to load the resource.
         // it is only the reference for relative links
         super(linkTags0, linkTags1);
+        assert root != null;
         this.root = root;
         this.evaluationScores = new Evaluation();
         this.rss = new HashMap<MultiProtocolURI, String>();

@@ -3328,7 +3328,7 @@ public final class Switchboard extends serverSwitch
         this.peers.mySeed().put(Seed.NCOUNT, Integer.toString(this.crawlQueues.noticeURL.size())); // the number of links that the peer has noticed, but not loaded (NURL's)
         this.peers.mySeed().put(
             Seed.RCOUNT,
-            Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT))); // the number of links that the peer provides for remote crawling (ZURL's)
+            Integer.toString(this.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.GLOBAL))); // the number of links that the peer provides for remote crawling (ZURL's)
         this.peers.mySeed().put(Seed.ICOUNT, Long.toString(this.indexSegments.RWICount())); // the minimum number of words that the peer has indexed (as it says)
         this.peers.mySeed().put(Seed.SCOUNT, Integer.toString(this.peers.sizeConnected())); // the number of seeds that the peer has stored
         this.peers.mySeed().put(
