ready-prepared crawl list but at the stacks of the domains that are stored for balanced crawling. This also affects the balancer, since it no longer needs to prepare the pre-selected crawl list for monitoring. As an effect: - it is no longer possible to see the correct order of the next to-be-crawled links, since that depends on the actual state of the balancer stack the next time another url is requested for loading - the balancer works better, since the next url can be selected according to the current situation and not according to a pre-selected order. pull/1/head
parent
7e4e3fe5b6
commit
9ad1d8dde2
@ -0,0 +1,95 @@
|
|||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>YaCy '#[clientname]#': '#[queuename]#' Crawl Queue</title>
  #%env/templates/metas.template%#
</head>
<body id="IndexCreateQueues">
<div id="fullcontent">
  <!-- Page chrome: the first alternative of the embed switch renders header, submenu and heading; the second alternative is empty. -->
  #(embed)#
  #%env/templates/header.template%#
  #%env/templates/submenuCrawlMonitor.template%#
  <h2>'#[queuename]#' Crawl Queue</h2>
  ::#(/embed)#

  <!-- First alternative of the crawler switch: empty-queue notice. Second alternative: delete form and per-host listing. -->
  #(crawler)#
  <p>This crawler queue is empty</p>
  ::
  <!-- Pattern-based delete form; like the page chrome it is only part of the first embed alternative. -->
  #(embed)#
  <form action="IndexCreateQueues_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
    <fieldset>
      Delete Entries:
      <input type="text" name="pattern" value="#[deletepattern]#" size="40" maxlength="200" />
      <select name="option" size="1">
        <option value="5">Initiator</option>
        <option value="3">Profile</option>
        <option value="4">Depth</option>
        <option value="6">Modified Date</option>
        <option value="2">Anchor Name</option>
        <option value="1" selected="selected">URL</option>
      </select>
      <input type="hidden" name="stack" value="#[queuename]#" />
      <input type="submit" name="delete" value="Delete" />
    </fieldset>
  </form>
  ::#(/embed)#
  <!-- Ten columns: three per-host columns followed by seven per-URL columns. -->
  <table border="0" cellpadding="2" cellspacing="1">
    <colgroup>
      <col width="5" />
      <col width="10" />
      <col width="30" />
      <col width="10" />
      <col width="10" />
      <col width="10" />
      <col width="10" />
      <col width="10" />
      <col width="10" />
      <col />
    </colgroup>
    <tr class="TableHeader">
      <th>Count</th>
      <th>Delta/ms</th>
      <th>Host</th>
      <th>Initiator</th>
      <th>Profile</th>
      <th>Depth</th>
      <th>Modified Date</th>
      <th>Anchor Name</th>
      <th>Delta/ms</th>
      <th>URL</th>
    </tr>
    #{host}#
    <!-- Host summary row; the trash link requests deletion of every queued URL matching this host name. -->
    <tr class="TableCellDark">
      <td>#[hostcount]#</td>
      <td>#[hostdelta]#</td>
      <td><a href="IndexCreateQueues_p.html?#(embed)#::embed=&amp;#(/embed)#delete=&amp;stack=#[queuename]#&amp;option=1&amp;pattern=.*#[hostname]#.*&amp;urlsPerHost=#[urlsPerHost]#"><img src="env/grafics/trash.gif" alt="Delete" /></a> #[hostname]#</td>
      <td colspan="7"></td>
    </tr>
    #{list}#
    <!-- One row per queued URL of the host above. -->
    <tr class="TableCellLight">
      <td colspan="3"></td>
      <td>#[initiator]#</td>
      <td>#[profile]#</td>
      <td>#[depth]#</td>
      <td>#[modified]#</td>
      <td>#[anchor]#</td>
      <td>#[delta]#</td>
      <td><a href="#[url]#">#[url]#</a></td>
    </tr>
    #{/list}#
    #{/host}#
  </table>
  #(/crawler)#
  #(embed)#
  #%env/templates/footer.template%#
  ::#(/embed)#
</div>
<script type="text/javascript">
<!--
  // If a parent document contains an element with id 'QueuesTable', resize it to fit this page's content.
  parentPage = parent.document.getElementById('QueuesTable');
  if (parentPage != null) parentPage.height = document.getElementById('fullcontent').offsetHeight + 30;
-->
</script>
</body>
</html>
|
@ -1,58 +0,0 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>YaCy '#[clientname]#': Global Crawl Queue</title>
    #%env/templates/metas.template%#
  </head>
  <body id="IndexCreateWWWGlobalQueue">
    #%env/templates/header.template%#
    #%env/templates/submenuCrawlMonitor.template%#
    <h2>Global Crawl Queue</h2>
    <p>
      This queue stores the urls that shall be sent to other peers to perform a remote crawl.
      If there is no peer for remote crawling available, the links are crawled locally.
    </p>
    <!-- First alternative: empty-queue notice. Second alternative: clear button, counters and the entry table. -->
    #(crawler-queue)#
    <p>The global crawler queue is empty</p>
    ::
    <form action="IndexCreateWWWGlobalQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
      <fieldset>
        <input type="submit" name="clearcrawlqueue" value="clear global crawl queue" />
      </fieldset>
    </form>
    <p>There are <strong>#[num]#</strong> entries in the global crawler queue. Showing <strong>#[show-num]#</strong> most recent entries.</p>
    <p>Show last <a href="IndexCreateWWWGlobalQueue_p.html?limit=50">50</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=100">100</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=250">250</a> | <a href="IndexCreateWWWGlobalQueue_p.html?limit=500">500</a> entries.</p>
    <table border="0" cellpadding="2" cellspacing="1">
      <colgroup>
        <col width="60" span="2" />
        <col width="10" />
        <col width="80" />
        <col width="180" />
        <col />
        <col width="10" />
      </colgroup>
      <tr class="TableHeader">
        <th>Initiator</th>
        <th>Profile</th>
        <th>Depth</th>
        <th>Modified Date</th>
        <th>Anchor Name</th>
        <th>URL</th>
        <th>Delete</th>
      </tr>
      <!-- One row per listed entry; the dark switch selects the Light or Dark row class. -->
      #{list}#
      <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
        <td>#[initiator]#</td>
        <td>#[profile]#</td>
        <td>#[depth]#</td>
        <td>#[modified]#</td>
        <td>#[anchor]#</td>
        <td><a href="#[url]#">#[url]#</a></td>
        <td><a href="IndexCreateWWWGlobalQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
      </tr>
      #{/list}#
    </table>
    #(/crawler-queue)#
    #%env/templates/footer.template%#
  </body>
</html>
|
|
@ -1,125 +0,0 @@
|
|||||||
// IndexCreateWWWGlobalQueue_p.java
|
|
||||||
// -------------------------------
|
|
||||||
// part of the AnomicHTTPD caching proxy
|
|
||||||
// (C) by Michael Peter Christen; mc@yacy.net
|
|
||||||
// first published on http://www.anomic.de
|
|
||||||
// Frankfurt, Germany, 2004, 2005
|
|
||||||
//
|
|
||||||
//$LastChangedDate$
|
|
||||||
//$LastChangedRevision$
|
|
||||||
//$LastChangedBy$
|
|
||||||
//
|
|
||||||
// This program is free software; you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU General Public License as published by
|
|
||||||
// the Free Software Foundation; either version 2 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU General Public License
|
|
||||||
// along with this program; if not, write to the Free Software
|
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
|
|
||||||
// You must compile this file with
|
|
||||||
// javac -classpath .:../classes IndexCreateWWWGlobalQueue_p.java
|
|
||||||
// if the shell's current path is HTROOT
|
|
||||||
|
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
import net.yacy.cora.document.ASCII;
|
|
||||||
import net.yacy.cora.protocol.RequestHeader;
|
|
||||||
import net.yacy.peers.Seed;
|
|
||||||
import net.yacy.search.Switchboard;
|
|
||||||
|
|
||||||
import de.anomic.crawler.CrawlProfile;
|
|
||||||
import de.anomic.crawler.NoticedURL;
|
|
||||||
import de.anomic.crawler.retrieval.Request;
|
|
||||||
import de.anomic.server.serverObjects;
|
|
||||||
import de.anomic.server.serverSwitch;
|
|
||||||
|
|
||||||
public class IndexCreateWWWGlobalQueue_p {
|
|
||||||
|
|
||||||
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
|
|
||||||
private static String daydate(final Date date) {
|
|
||||||
if (date == null) return "";
|
|
||||||
return dayFormatter.format(date);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
|
||||||
// return variable that accumulates replacements
|
|
||||||
final Switchboard sb = (Switchboard) env;
|
|
||||||
final serverObjects prop = new serverObjects();
|
|
||||||
|
|
||||||
int showLimit = 100;
|
|
||||||
if (post != null) {
|
|
||||||
showLimit = post.getInt("limit", 100);
|
|
||||||
|
|
||||||
if (post.containsKey("clearcrawlqueue")) {
|
|
||||||
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
|
|
||||||
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.LIMIT);
|
|
||||||
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
|
|
||||||
/*
|
|
||||||
int c = 0;
|
|
||||||
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
|
|
||||||
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
|
|
||||||
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
prop.put("info", "3");//crawling queue cleared
|
|
||||||
prop.putNum("info_numEntries", c);
|
|
||||||
} else if (post.containsKey("deleteEntry")) {
|
|
||||||
final String urlHash = post.get("deleteEntry");
|
|
||||||
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
|
|
||||||
prop.put("LOCATION","");
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
|
|
||||||
if (stackSize == 0) {
|
|
||||||
prop.put("crawler-queue", "0");
|
|
||||||
} else {
|
|
||||||
prop.put("crawler-queue", "1");
|
|
||||||
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, showLimit);
|
|
||||||
|
|
||||||
Request urle;
|
|
||||||
boolean dark = true;
|
|
||||||
Seed initiator;
|
|
||||||
String profileHandle;
|
|
||||||
CrawlProfile profileEntry;
|
|
||||||
int i, showNum = 0;
|
|
||||||
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
|
|
||||||
urle = crawlerList.get(i);
|
|
||||||
if (urle != null && urle.url() != null) {
|
|
||||||
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
|
|
||||||
profileHandle = urle.profileHandle();
|
|
||||||
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
|
|
||||||
prop.put("crawler-queue_list_"+showNum+"_dark", dark ? "1" : "0");
|
|
||||||
prop.putHTML("crawler-queue_list_"+showNum+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()) );
|
|
||||||
prop.put("crawler-queue_list_"+showNum+"_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
|
|
||||||
prop.put("crawler-queue_list_"+showNum+"_depth", urle.depth());
|
|
||||||
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.appdate()) );
|
|
||||||
prop.putHTML("crawler-queue_list_"+showNum+"_anchor", urle.name());
|
|
||||||
prop.putHTML("crawler-queue_list_"+showNum+"_url", urle.url().toNormalform(false, true));
|
|
||||||
prop.put("crawler-queue_list_"+showNum+"_hash", urle.url().hash());
|
|
||||||
dark = !dark;
|
|
||||||
showNum++;
|
|
||||||
} else {
|
|
||||||
stackSize--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
|
|
||||||
prop.putNum("crawler-queue_num", stackSize);//num Entries
|
|
||||||
prop.putNum("crawler-queue_list", showNum);
|
|
||||||
}
|
|
||||||
|
|
||||||
// return rewrite properties
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,69 +0,0 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>YaCy '#[clientname]#': Local Crawl Queue</title>
    #%env/templates/metas.template%#
  </head>
  <body id="IndexCreateWWWLocalQueue">
    #%env/templates/header.template%#
    #%env/templates/submenuCrawlMonitor.template%#
    <h2>Local Crawl Queue</h2>
    <p>
      This queue stores the urls that shall be crawled localy by this peer.
      It may also contain urls that are computed by the proxy-prefetch.
    </p>

    <!-- First alternative: empty-queue notice. Second alternative: pattern delete form, counters and the entry table. -->
    #(crawler-queue)#
    <p>The local crawler queue is empty</p>
    ::
    <form action="IndexCreateWWWLocalQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
      <fieldset>
        Delete Entries:
        <input type="text" name="pattern" value=".*" size="40" maxlength="200" />
        <select name="option" size="1">
          <option value="5">Initiator</option>
          <option value="3">Profile</option>
          <option value="4">Depth</option>
          <option value="6">Modified Date</option>
          <option value="2">Anchor Name</option>
          <option value="1" selected="selected">URL</option>
        </select>
        <input type="submit" name="deleteEntries" value="Delete" /><em>This may take a quite long time.</em>
      </fieldset>
    </form>
    <p>There are <strong>#[num]#</strong> entries in the local crawler queue. Showing <strong>#[show-num]#</strong> most recent entries.</p>
    <p>Show last <a href="IndexCreateWWWLocalQueue_p.html?limit=50">50</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=100">100</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=250">250</a> | <a href="IndexCreateWWWLocalQueue_p.html?limit=500">500</a> entries.</p>
    <table border="0" cellpadding="2" cellspacing="1">
      <colgroup>
        <col width="60" span="2" />
        <col width="10" />
        <col width="80" />
        <col width="180" />
        <col />
        <col width="10" />
      </colgroup>
      <tr class="TableHeader">
        <th>Initiator</th>
        <th>Profile</th>
        <th>Depth</th>
        <th>Modified Date</th>
        <th>Anchor Name</th>
        <th>URL</th>
        <th>Delete</th>
      </tr>
      <!-- One row per listed entry; the dark switch selects the Light or Dark row class. -->
      #{list}#
      <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
        <td>#[initiator]#</td>
        <td>#[profile]#</td>
        <td>#[depth]#</td>
        <td>#[modified]#</td>
        <td>#[anchor]#</td>
        <td><a href="#[url]#">#[url]#</a></td>
        <td><a href="IndexCreateWWWLocalQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
      </tr>
      #{/list}#
    </table>
    #(/crawler-queue)#
    #%env/templates/footer.template%#
  </body>
</html>
|
|
@ -1,65 +0,0 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>YaCy '#[clientname]#': Remote Crawl Queue</title>
    #%env/templates/metas.template%#
  </head>
  <!-- NOTE(review): the body id is the same as on the global queue page; confirm whether that is intended before changing it. -->
  <body id="IndexCreateWWWGlobalQueue">
    #%env/templates/header.template%#
    #%env/templates/submenuCrawlMonitor.template%#
    <h2>Remote Crawl Queue</h2>
    <p>
      This queue stores the urls that other peers sent to you in order to perform a remote crawl for them.
    </p>
    <!-- First alternative: empty-queue notice. Second alternative: clear button, counters and the entry table. -->
    #(crawler-queue)#
    <p>The remote crawler queue is empty</p>
    ::
    <form action="IndexCreateWWWRemoteQueue_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
      <fieldset>
        <input type="submit" name="clearcrawlqueue" value="clear remote crawl queue" />
      </fieldset>
    </form>
    <p>
      There are <strong>#[num]#</strong> entries in the remote crawler queue.
      Showing <strong>#[show-num]#</strong> most recent entries.
    </p>
    <p>
      Show last <a href="IndexCreateWWWRemoteQueue_p.html?limit=50">50</a> |
      <a href="IndexCreateWWWRemoteQueue_p.html?limit=100">100</a> |
      <a href="IndexCreateWWWRemoteQueue_p.html?limit=250">250</a> |
      <a href="IndexCreateWWWRemoteQueue_p.html?limit=500">500</a> entries.
    </p>
    <table border="0" cellpadding="2" cellspacing="1">
      <colgroup>
        <col width="60" span="2" />
        <col width="10" />
        <col width="80" />
        <col width="180" />
        <col />
        <col width="10" />
      </colgroup>
      <tr class="TableHeader">
        <th>Initiator</th>
        <th>Profile</th>
        <th>Depth</th>
        <th>Modified Date</th>
        <th>Anchor Name</th>
        <th>URL</th>
        <th>Delete</th>
      </tr>
      <!-- One row per listed entry; the dark switch selects the Light or Dark row class. -->
      #{list}#
      <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
        <td>#[initiator]#</td>
        <td>#[profile]#</td>
        <td>#[depth]#</td>
        <td>#[modified]#</td>
        <td>#[anchor]#</td>
        <td><a href="#[url]#">#[url]#</a></td>
        <td><a href="IndexCreateWWWRemoteQueue_p.html?deleteEntry=#[hash]#">[Delete]</a></td>
      </tr>
      #{/list}#
    </table>
    #(/crawler-queue)#
    #%env/templates/footer.template%#
  </body>
</html>
|
|
@ -1,120 +0,0 @@
|
|||||||
// IndexCreateWWWRemoteQueue_p.java
|
|
||||||
// -------------------------------
|
|
||||||
// part of the AnomicHTTPD caching proxy
|
|
||||||
// (C) by Michael Peter Christen; mc@yacy.net
|
|
||||||
// first published on http://www.anomic.de
|
|
||||||
// Frankfurt, Germany, 2004, 2005
|
|
||||||
// last major change: 04.07.2005
|
|
||||||
//
|
|
||||||
// This program is free software; you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU General Public License as published by
|
|
||||||
// the Free Software Foundation; either version 2 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU General Public License
|
|
||||||
// along with this program; if not, write to the Free Software
|
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
|
|
||||||
// You must compile this file with
|
|
||||||
// javac -classpath .:../classes IndexCreateWWWRemoteQueue_p.java
|
|
||||||
// if the shell's current path is HTROOT
|
|
||||||
|
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
import net.yacy.cora.document.ASCII;
|
|
||||||
import net.yacy.cora.protocol.RequestHeader;
|
|
||||||
import net.yacy.peers.Seed;
|
|
||||||
import net.yacy.search.Switchboard;
|
|
||||||
|
|
||||||
import de.anomic.crawler.CrawlProfile;
|
|
||||||
import de.anomic.crawler.NoticedURL;
|
|
||||||
import de.anomic.crawler.retrieval.Request;
|
|
||||||
import de.anomic.server.serverObjects;
|
|
||||||
import de.anomic.server.serverSwitch;
|
|
||||||
import de.anomic.server.servletProperties;
|
|
||||||
|
|
||||||
public class IndexCreateWWWRemoteQueue_p {
|
|
||||||
|
|
||||||
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
|
|
||||||
private static String daydate(final Date date) {
|
|
||||||
if (date == null) return "";
|
|
||||||
return dayFormatter.format(date);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
|
||||||
final servletProperties prop = new servletProperties();
|
|
||||||
final Switchboard sb = (Switchboard)env;
|
|
||||||
|
|
||||||
int showLimit = 100;
|
|
||||||
if (post != null) {
|
|
||||||
showLimit = post.getInt("limit", 100);
|
|
||||||
|
|
||||||
if (post.containsKey("clearcrawlqueue")) {
|
|
||||||
final int c = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
|
|
||||||
sb.crawlQueues.noticeURL.clear(NoticedURL.StackType.REMOTE);
|
|
||||||
try { sb.cleanProfiles(); } catch (final InterruptedException e) { /* Ignore this */}
|
|
||||||
/*
|
|
||||||
int c = 0;
|
|
||||||
while (switchboard.urlPool.noticeURL.stackSize(plasmaCrawlNURL.StackType.LIMIT) > 0) {
|
|
||||||
urlHash = switchboard.urlPool.noticeURL.pop(plasmaCrawlNURL.StackType.LIMIT).hash();
|
|
||||||
if (urlHash != null) { switchboard.urlPool.noticeURL.remove(urlHash); c++; }
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
prop.put("info", "3"); // crawling queue cleared
|
|
||||||
prop.putNum("info_numEntries", c);
|
|
||||||
} else if (post.containsKey("deleteEntry")) {
|
|
||||||
final String urlHash = post.get("deleteEntry");
|
|
||||||
sb.crawlQueues.noticeURL.removeByURLHash(urlHash.getBytes());
|
|
||||||
prop.put("LOCATION","");
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.REMOTE);
|
|
||||||
if (stackSize == 0) {
|
|
||||||
prop.put("crawler-queue", "0");
|
|
||||||
} else {
|
|
||||||
prop.put("crawler-queue", "1");
|
|
||||||
final List<Request> crawlerList = sb.crawlQueues.noticeURL.top(NoticedURL.StackType.REMOTE, showLimit);
|
|
||||||
|
|
||||||
Request urle;
|
|
||||||
boolean dark = true;
|
|
||||||
Seed initiator;
|
|
||||||
String profileHandle;
|
|
||||||
CrawlProfile profileEntry;
|
|
||||||
int i, showNum = 0;
|
|
||||||
for (i = 0; (i < crawlerList.size()) && (showNum < showLimit); i++) {
|
|
||||||
urle = crawlerList.get(i);
|
|
||||||
if (urle != null && urle.url() != null) {
|
|
||||||
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : ASCII.String(urle.initiator()));
|
|
||||||
profileHandle = urle.profileHandle();
|
|
||||||
profileEntry = profileHandle == null ? null : sb.crawler.getActive(profileHandle.getBytes());
|
|
||||||
prop.put("crawler-queue_list_" + showNum + "_dark", dark ? "1" : "0");
|
|
||||||
prop.putHTML("crawler-queue_list_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
|
||||||
prop.put("crawler-queue_list_" + showNum + "_profile", ((profileEntry == null) ? "unknown" : profileEntry.name()));
|
|
||||||
prop.put("crawler-queue_list_" + showNum + "_depth", urle.depth());
|
|
||||||
prop.put("crawler-queue_list_" + showNum + "_modified", daydate(urle.appdate()) );
|
|
||||||
prop.putHTML("crawler-queue_list_" + showNum + "_anchor", urle.name());
|
|
||||||
prop.putHTML("crawler-queue_list_" + showNum + "_url", urle.url().toString());
|
|
||||||
prop.put("crawler-queue_list_" + showNum + "_hash", urle.url().hash());
|
|
||||||
dark = !dark;
|
|
||||||
showNum++;
|
|
||||||
} else {
|
|
||||||
stackSize--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prop.putNum("crawler-queue_show-num", showNum); //showin sjow-num most recent
|
|
||||||
prop.putNum("crawler-queue_num", stackSize);//num Entries
|
|
||||||
prop.putNum("crawler-queue_list", showNum);
|
|
||||||
}
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,124 +0,0 @@
|
|||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
import net.yacy.cora.document.ASCII;
|
|
||||||
import net.yacy.cora.document.UTF8;
|
|
||||||
import net.yacy.cora.protocol.RequestHeader;
|
|
||||||
import net.yacy.peers.Seed;
|
|
||||||
import net.yacy.search.Switchboard;
|
|
||||||
import net.yacy.search.SwitchboardConstants;
|
|
||||||
import net.yacy.search.index.Segment;
|
|
||||||
import net.yacy.search.index.Segments;
|
|
||||||
import de.anomic.crawler.NoticedURL;
|
|
||||||
import de.anomic.crawler.retrieval.Request;
|
|
||||||
import de.anomic.server.serverObjects;
|
|
||||||
import de.anomic.server.serverSwitch;
|
|
||||||
|
|
||||||
public class queues_p {
|
|
||||||
|
|
||||||
public static final String STATE_RUNNING = "running";
|
|
||||||
public static final String STATE_PAUSED = "paused";
|
|
||||||
|
|
||||||
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
|
|
||||||
private static String daydate(final Date date) {
|
|
||||||
if (date == null) return "";
|
|
||||||
return dayFormatter.format(date);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
|
||||||
// return variable that accumulates replacements
|
|
||||||
final Switchboard sb = (Switchboard) env;
|
|
||||||
//wikiCode wikiTransformer = new wikiCode(switchboard);
|
|
||||||
final serverObjects prop = new serverObjects();
|
|
||||||
Segment segment = null;
|
|
||||||
final boolean html = post != null && post.containsKey("html");
|
|
||||||
prop.setLocalized(html);
|
|
||||||
if (post != null && post.containsKey("segment") && sb.verifyAuthentication(header)) {
|
|
||||||
segment = sb.indexSegments.segment(post.get("segment"));
|
|
||||||
}
|
|
||||||
if (segment == null) segment = sb.indexSegments.segment(Segments.Process.PUBLIC);
|
|
||||||
prop.put("rejected", "0");
|
|
||||||
//int showRejectedCount = 10;
|
|
||||||
|
|
||||||
Seed initiator;
|
|
||||||
|
|
||||||
// index size
|
|
||||||
prop.putNum("urlpublictextSize", segment.urlMetadata().size());
|
|
||||||
prop.putNum("rwipublictextSize", segment.termIndex().sizesMax());
|
|
||||||
|
|
||||||
// loader queue
|
|
||||||
prop.putNum("loaderSize", sb.crawlQueues.workerSize());
|
|
||||||
prop.putNum("loaderMax", sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10));
|
|
||||||
if (sb.crawlQueues.workerSize() == 0) {
|
|
||||||
prop.put("list-loader", "0");
|
|
||||||
} else {
|
|
||||||
final Request[] w = sb.crawlQueues.activeWorkerEntries();
|
|
||||||
int count = 0;
|
|
||||||
for (final Request r : w) {
|
|
||||||
if (r == null) continue;
|
|
||||||
prop.put("list-loader_"+count+"_profile", r.profileHandle());
|
|
||||||
initiator = sb.peers.getConnected((r.initiator() == null) ? "" : ASCII.String(r.initiator()));
|
|
||||||
prop.putHTML("list-loader_"+count+"_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
|
||||||
prop.put("list-loader_"+count+"_depth", r.depth());
|
|
||||||
prop.putXML("list-loader_"+count+"_url", r.url().toString());
|
|
||||||
count++;
|
|
||||||
}
|
|
||||||
prop.put("list-loader", count);
|
|
||||||
}
|
|
||||||
|
|
||||||
//local crawl queue
|
|
||||||
prop.putNum("localCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL).getJobCount());
|
|
||||||
prop.put("localCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
|
|
||||||
int stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.CORE);
|
|
||||||
addNTable(sb, prop, "list-local", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.CORE, Math.min(10, stackSize)));
|
|
||||||
|
|
||||||
//global crawl queue
|
|
||||||
prop.putNum("limitCrawlSize", sb.crawlQueues.limitCrawlJobSize());
|
|
||||||
prop.put("limitCrawlState", STATE_RUNNING);
|
|
||||||
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
|
|
||||||
|
|
||||||
//remote crawl queue
|
|
||||||
prop.putNum("remoteCrawlSize", sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL).getJobCount());
|
|
||||||
prop.put("remoteCrawlState", sb.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL) ? STATE_PAUSED : STATE_RUNNING);
|
|
||||||
stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.LIMIT);
|
|
||||||
|
|
||||||
if (stackSize == 0) {
|
|
||||||
prop.put("list-remote", "0");
|
|
||||||
} else {
|
|
||||||
addNTable(sb, prop, "list-remote", sb.crawlQueues.noticeURL.top(NoticedURL.StackType.LIMIT, Math.min(10, stackSize)));
|
|
||||||
}
|
|
||||||
|
|
||||||
//noload crawl queue
|
|
||||||
prop.putNum("noloadCrawlSize", sb.crawlQueues.noloadCrawlJobSize());
|
|
||||||
prop.put("noloadCrawlState", STATE_RUNNING);
|
|
||||||
//stackSize = sb.crawlQueues.noticeURL.stackSize(NoticedURL.StackType.NOLOAD);
|
|
||||||
|
|
||||||
|
|
||||||
// return rewrite properties
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static final void addNTable(final Switchboard sb, final serverObjects prop, final String tableName, final List<Request> crawlerList) {
|
|
||||||
|
|
||||||
int showNum = 0;
|
|
||||||
Seed initiator;
|
|
||||||
for (final Request urle : crawlerList) {
|
|
||||||
if ((urle != null) && (urle.url() != null)) {
|
|
||||||
initiator = sb.peers.getConnected((urle.initiator() == null) ? "" : UTF8.String(urle.initiator()));
|
|
||||||
prop.put(tableName + "_" + showNum + "_profile", urle.profileHandle());
|
|
||||||
prop.put(tableName + "_" + showNum + "_initiator", ((initiator == null) ? "proxy" : initiator.getName()));
|
|
||||||
prop.put(tableName + "_" + showNum + "_depth", urle.depth());
|
|
||||||
prop.put(tableName + "_" + showNum + "_modified", daydate(urle.appdate()));
|
|
||||||
prop.putXML(tableName + "_" + showNum + "_anchor", urle.name());
|
|
||||||
prop.putXML(tableName + "_" + showNum + "_url", urle.url().toNormalform(false, true));
|
|
||||||
prop.put(tableName + "_" + showNum + "_hash", urle.url().hash());
|
|
||||||
showNum++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
prop.put(tableName, showNum);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,71 +0,0 @@
|
|||||||
<?xml version="1.0"?>
|
|
||||||
<queues>
|
|
||||||
<dbsize>
|
|
||||||
<urlpublictext>#[urlpublictextSize]#</urlpublictext>
|
|
||||||
<rwipublictext>#[rwipublictextSize]#</rwipublictext>
|
|
||||||
</dbsize>
|
|
||||||
<loaderqueue>
|
|
||||||
<size>#[loaderSize]#</size>
|
|
||||||
<max>#[loaderMax]#</max>
|
|
||||||
#{list-loader}#
|
|
||||||
<entry>
|
|
||||||
<profile>#[profile]#</profile>
|
|
||||||
<initiator>#[initiator]#</initiator>
|
|
||||||
<depth>#[depth]#</depth>
|
|
||||||
<url>#[url]#</url>
|
|
||||||
</entry>
|
|
||||||
#{/list-loader}#
|
|
||||||
</loaderqueue>
|
|
||||||
<localcrawlerqueue>
|
|
||||||
<size>#[localCrawlSize]#</size>
|
|
||||||
<state>#[localCrawlState]#</state>
|
|
||||||
#{list-local}#
|
|
||||||
<entry>
|
|
||||||
<profile>#[profile]#</profile>
|
|
||||||
<initiator>#[initiator]#</initiator>
|
|
||||||
<depth>#[depth]#</depth>
|
|
||||||
<modified>#[modified]#</modified>
|
|
||||||
<anchor>#[anchor]#</anchor>
|
|
||||||
<url>#[url]#</url>
|
|
||||||
<hash>#[hash]#</hash>
|
|
||||||
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
|
|
||||||
</entry>
|
|
||||||
#{/list-local}#
|
|
||||||
</localcrawlerqueue>
|
|
||||||
<limitcrawlerqueue>
|
|
||||||
<size>#[limitCrawlSize]#</size>
|
|
||||||
<state>#[limitCrawlState]#</state>
|
|
||||||
#{list-limit}#
|
|
||||||
<entry>
|
|
||||||
<profile>#[profile]#</profile>
|
|
||||||
<initiator>#[initiator]#</initiator>
|
|
||||||
<depth>#[depth]#</depth>
|
|
||||||
<modified>#[modified]#</modified>
|
|
||||||
<anchor>#[anchor]#</anchor>
|
|
||||||
<url>#[url]#</url>
|
|
||||||
<hash>#[hash]#</hash>
|
|
||||||
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
|
|
||||||
</entry>
|
|
||||||
#{/list-limit}#
|
|
||||||
</limitcrawlerqueue>
|
|
||||||
<remotecrawlerqueue>
|
|
||||||
<size>#[remoteCrawlSize]#</size>
|
|
||||||
<state>#[remoteCrawlState]#</state>
|
|
||||||
#{list-remote}#
|
|
||||||
<entry>
|
|
||||||
<profile>#[profile]#</profile>
|
|
||||||
<initiator>#[initiator]#</initiator>
|
|
||||||
<depth>#[depth]#</depth>
|
|
||||||
<modified>#[modified]#</modified>
|
|
||||||
<anchor>#[anchor]#</anchor>
|
|
||||||
<url>#[url]#</url>
|
|
||||||
<hash>#[hash]#</hash>
|
|
||||||
<inProcess>#(inProcess)#false::true#(/inProcess)#</inProcess>
|
|
||||||
</entry>
|
|
||||||
#{/list-remote}#
|
|
||||||
</remotecrawlerqueue>
|
|
||||||
<noloadcrawlerqueue>
|
|
||||||
<size>#[noloadCrawlSize]#</size>
|
|
||||||
<state>#[noloadCrawlState]#</state>
|
|
||||||
</noloadcrawlerqueue>
|
|
||||||
</queues>
|
|
@ -1,35 +1,52 @@
|
|||||||
<?xml version="1.0"?>
|
<?xml version="1.0"?>
|
||||||
<status>
|
<status>
|
||||||
<ppm>#[ppm]#</ppm>
|
<ppm>#[ppm]#</ppm>
|
||||||
|
|
||||||
<wordCacheSize>#[wordCacheSize]#</wordCacheSize>
|
<wordCacheSize>#[wordCacheSize]#</wordCacheSize>
|
||||||
<wordCacheMaxSize>#[wordCacheMaxSize]#</wordCacheMaxSize>
|
<wordCacheMaxSize>#[wordCacheMaxSize]#</wordCacheMaxSize>
|
||||||
|
|
||||||
|
<memory>
|
||||||
|
<free>#[freeMemory]#</free>
|
||||||
|
<total>#[totalMemory]#</total>
|
||||||
|
<max>#[maxMemory]#</max>
|
||||||
|
</memory>
|
||||||
|
|
||||||
|
<processors>#[processors]#</processors>
|
||||||
|
|
||||||
|
<traffic>
|
||||||
|
<in>#[trafficIn]#</in>
|
||||||
|
<proxy>#[trafficProxy]#</proxy>
|
||||||
|
<crawler>#[trafficCrawler]#</crawler>
|
||||||
|
</traffic>
|
||||||
|
|
||||||
|
<dbsize>
|
||||||
|
<urlpublictext>#[urlpublictextSize]#</urlpublictext>
|
||||||
|
<rwipublictext>#[rwipublictextSize]#</rwipublictext>
|
||||||
|
</dbsize>
|
||||||
|
|
||||||
<loaderqueue>
|
<loaderqueue>
|
||||||
<size>#[loaderSize]#</size>
|
<size>#[loaderSize]#</size>
|
||||||
<max>#[loaderMax]#</max>
|
<max>#[loaderMax]#</max>
|
||||||
</loaderqueue>
|
</loaderqueue>
|
||||||
|
|
||||||
<localcrawlerqueue>
|
<localcrawlerqueue>
|
||||||
<size>#[localCrawlSize]#</size>
|
<size>#[localCrawlSize]#</size>
|
||||||
|
<state>#[localCrawlState]#</state>
|
||||||
</localcrawlerqueue>
|
</localcrawlerqueue>
|
||||||
|
|
||||||
<limitcrawlerqueue>
|
<limitcrawlerqueue>
|
||||||
<size>#[limitCrawlSize]#</size>
|
<size>#[limitCrawlSize]#</size>
|
||||||
|
<state>#[limitCrawlState]#</state>
|
||||||
</limitcrawlerqueue>
|
</limitcrawlerqueue>
|
||||||
|
|
||||||
<remotecrawlerqueue>
|
<remotecrawlerqueue>
|
||||||
<size>#[remoteCrawlSize]#</size>
|
<size>#[remoteCrawlSize]#</size>
|
||||||
|
<state>#[remoteCrawlState]#</state>
|
||||||
</remotecrawlerqueue>
|
</remotecrawlerqueue>
|
||||||
|
|
||||||
<noloadcrawlerqueue>
|
<noloadcrawlerqueue>
|
||||||
<size>#[noloadCrawlSize]#</size>
|
<size>#[noloadCrawlSize]#</size>
|
||||||
|
<state>#[noloadCrawlState]#</state>
|
||||||
</noloadcrawlerqueue>
|
</noloadcrawlerqueue>
|
||||||
|
|
||||||
<memory>
|
|
||||||
<free>#[freeMemory]#</free>
|
|
||||||
<total>#[totalMemory]#</total>
|
|
||||||
<max>#[maxMemory]#</max>
|
|
||||||
</memory>
|
|
||||||
<processors>#[processors]#</processors>
|
|
||||||
<traffic>
|
|
||||||
<in>#[trafficIn]#</in>
|
|
||||||
<proxy>#[trafficProxy]#</proxy>
|
|
||||||
<crawler>#[trafficCrawler]#</crawler>
|
|
||||||
</traffic>
|
|
||||||
</status>
|
</status>
|
||||||
|
After Width: | Height: | Size: 932 B |
Loading…
Reference in new issue