- added a remote crawl menu item to the index create menu. This menu also shows a list of peers that provide remote crawl urls - set remote crawl option by default to off. This option may be important, but it also confuses first-time users git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7158 6c8d7289-2bf4-0310-a012-ef5d649a1542 (branch: pull/1/head)
parent
89c2d8b81e
commit
670ba4d52b
@ -0,0 +1,75 @@
|
||||
<!-- RemoteCrawl_p.html: configuration page for YaCy's remote crawler.
     Markers are expanded server-side by RemoteCrawl_p.java:
       #[name]#                  value substitution
       #(name)#a::b#(/name)#     alternative selected by an integer property
       #{name}# ... #{/name}#    repeated list section
       #%path%#                  template include -->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Remote Crawl Configuration</title>
#%env/templates/metas.template%#
</head>
<body id="RemoteCrawl">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Remote Crawler</h2>
The remote crawler is a process that requests urls from other peers.
Peers offer remote-crawl urls if the flag 'Do Remote Indexing'
is switched on when a crawl is started.
<fieldset>
<legend>
<label>Remote Crawler Configuration</label>
</legend>
<form id="ConfigForm" method="post" action="RemoteCrawl_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<dl>
<dt>
<label for="crawlResponse">Accept Remote Crawl Requests</label>
<!-- toggling the checkbox reloads the page with the opposite state passed as a GET parameter;
     the #(crawlResponse)# switch renders the box checked when the option is on -->
<input type="checkbox" id="crawlResponse" name="crawlResponse" onclick="window.location.href='RemoteCrawl_p.html?#(crawlResponse)#crawlResponse=on::crawlResponse=off#(/crawlResponse)#'" #(crawlResponse)#::checked="checked" #(/crawlResponse)#/>
</dt>
<dd>
Perform web indexing upon request of another peer.<br />
<label for="acceptCrawlLimit">Load with a maximum of</label>
<input id="acceptCrawlLimit" name="acceptCrawlLimit" type="text" size="4" maxlength="4" value="#[acceptCrawlLimit]#" /> pages per minute
<input type="submit" name="save" value="Save" />
<p>Crawl results will appear in the <a href="CrawlResults.html?process=6">Crawl Result Monitor</a></p>
</dd>
</dl>
</form>
</fieldset>

<fieldset>
<legend>
<label>Peers offering remote crawl URLs</label>
</legend>
If the remote crawl option is switched on, then this peer will load URLs from the following remote peers:
<form method="get" action="#">
<!-- peer table: one row per peer, emitted by the #{list}# loop in RemoteCrawl_p.java -->
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><strong>Name</strong><br/></td>
<td><strong>URLs for<br/>Remote<br/>Crawl</strong></td>
<td><strong>Release/<br/>SVN</strong></td>
<td><strong>PPM</strong></td>
<td><strong>QPH</strong></td>
<td><strong>Last<br/>Seen</strong></td>
<td><strong>UTC</strong><br/>Offset</td>
<td style="width:70px;"><strong>Uptime</strong></td>
<td><strong>Links</strong></td>
<td><strong>RWIs</strong></td>
<td><strong>Age</strong></td>
</tr>
#{list}#
<!-- #(dark)# alternates row shading; Summary is a third variant -->
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td><a href="http://www.#[fullname]#.yacy">#[shortname]#</a></td>
<td align="right">#[RCount]#</td>
<td align="right">#[version]#</td>
<td align="right">#[ppm]#</td>
<td align="right">#[qph]#</td>
<td align="right">#[lastSeen]#</td>
<td align="right">#[utc]#</td>
<td align="right">#[uptime]#</td>
<td align="right">#[LCount]#</td>
<td align="right">#[ICount]#</td>
<td align="right">#[age]#</td>
</tr>
#{/list}#
</table>
</form>
</fieldset>
#%env/templates/footer.template%#
</body>
</html>
|
@ -0,0 +1,155 @@
|
||||
// RemoteCrawl_p.java
|
||||
// --------------------
|
||||
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 20.04.2007 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2010-09-02 21:24:22 +0200 (Do, 02 Sep 2010) $
|
||||
// $LastChangedRevision: 7092 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.protocol.RequestHeader;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
import net.yacy.kelondro.workflow.BusyThread;
|
||||
|
||||
import de.anomic.data.WorkTables;
|
||||
import de.anomic.search.Switchboard;
|
||||
import de.anomic.search.SwitchboardConstants;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.yacy.yacySeed;
|
||||
import de.anomic.yacy.yacyVersion;
|
||||
|
||||
public class RemoteCrawl_p {
|
||||
|
||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
||||
|
||||
final Switchboard sb = (Switchboard) env;
|
||||
final serverObjects prop = new serverObjects();
|
||||
|
||||
if (post != null) {
|
||||
|
||||
// store this call as api call
|
||||
sb.tables.recordAPICall(post, "RemoteCrawl_p.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "remote crawler configuration");
|
||||
|
||||
if (post.containsKey("crawlResponse")) {
|
||||
boolean crawlResponse = post.get("crawlResponse", "off").equals("on");
|
||||
|
||||
// read remote crawl request settings
|
||||
sb.setConfig("crawlResponse", (crawlResponse) ? "true" : "false");
|
||||
}
|
||||
|
||||
if (post.containsKey("acceptCrawlLimit")) {
|
||||
// read remote crawl request settings
|
||||
int newppm = 1;
|
||||
try {
|
||||
newppm = Math.max(1, Integer.parseInt(post.get("acceptCrawlLimit", "1")));
|
||||
} catch (final NumberFormatException e) {}
|
||||
final long newBusySleep = Math.max(100, 60000 / newppm);
|
||||
|
||||
// propagate to crawler
|
||||
final BusyThread rct = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
|
||||
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
|
||||
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, newBusySleep * 3);
|
||||
rct.setBusySleep(newBusySleep);
|
||||
rct.setIdleSleep(newBusySleep * 3);
|
||||
|
||||
// propagate to loader
|
||||
final BusyThread rcl = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
|
||||
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 5);
|
||||
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, newBusySleep * 10);
|
||||
rcl.setBusySleep(newBusySleep * 5);
|
||||
rcl.setIdleSleep(newBusySleep * 10);
|
||||
|
||||
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, Long.toString(newBusySleep));
|
||||
}
|
||||
}
|
||||
|
||||
// write remote crawl request settings
|
||||
prop.put("crawlResponse", sb.getConfigBool("crawlResponse", false) ? "1" : "0");
|
||||
long RTCbusySleep = 100;
|
||||
try {
|
||||
RTCbusySleep = Math.max(1, Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, "100")));
|
||||
} catch (final NumberFormatException e) {}
|
||||
final int RTCppm = (int) (60000L / RTCbusySleep);
|
||||
prop.put("acceptCrawlLimit", RTCppm);
|
||||
|
||||
// set seed information directly
|
||||
sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool("crawlResponse", false));
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
// write network list
|
||||
final String STR_TABLE_LIST = "list_";
|
||||
int conCount = 0;
|
||||
|
||||
boolean dark = true;
|
||||
yacySeed seed;
|
||||
Iterator<yacySeed> e = null;
|
||||
e = sb.peers.seedsSortedConnected(false, yacySeed.RCOUNT);
|
||||
//e = sb.peers.seedsSortedConnected(false, yacySeed.LCOUNT);
|
||||
Pattern peerSearchPattern = null;
|
||||
while (e.hasNext() && conCount < 300) {
|
||||
seed = e.next();
|
||||
assert seed != null;
|
||||
if (seed != null) {
|
||||
final long lastseen = Math.abs((System.currentTimeMillis() - seed.getLastSeenUTC()) / 1000 / 60);
|
||||
if (lastseen > 720) continue;
|
||||
long rcount = seed.getLong(yacySeed.RCOUNT, 0);
|
||||
if (rcount == 0) continue;
|
||||
if ((post != null && post.containsKey("search")) && peerSearchPattern != null /*(wrongregex == null)*/) {
|
||||
boolean abort = true;
|
||||
Matcher m = peerSearchPattern.matcher (seed.getName());
|
||||
if (m.find ()) {
|
||||
abort = false;
|
||||
}
|
||||
m = peerSearchPattern.matcher (seed.hash);
|
||||
if (m.find ()) {
|
||||
abort = false;
|
||||
}
|
||||
if (abort) continue;
|
||||
}
|
||||
prop.put(STR_TABLE_LIST + conCount + "_dark", ((dark) ? 1 : 0) ); dark=!dark;
|
||||
String shortname = seed.get(yacySeed.NAME, "deadlink");
|
||||
if (shortname.length() > 20) shortname = shortname.substring(0, 20) + "...";
|
||||
prop.putHTML(STR_TABLE_LIST + conCount + "_shortname", shortname);
|
||||
prop.putHTML(STR_TABLE_LIST + conCount + "_fullname", seed.get(yacySeed.NAME, "deadlink"));
|
||||
prop.put(STR_TABLE_LIST + conCount + "_age", seed.getAge());
|
||||
prop.putHTML(STR_TABLE_LIST + conCount + "_version", yacyVersion.combined2prettyVersion(seed.get(yacySeed.VERSION, "0.1"), shortname));
|
||||
prop.putNum(STR_TABLE_LIST + conCount + "_lastSeen", /*seed.getLastSeenString() + " " +*/ lastseen);
|
||||
prop.put(STR_TABLE_LIST + conCount + "_utc", seed.get(yacySeed.UTC, "-"));
|
||||
prop.putHTML(STR_TABLE_LIST + conCount + "_uptime", DateFormatter.formatInterval(60000 * Long.parseLong(seed.get(yacySeed.UPTIME, "0"))));
|
||||
prop.putNum(STR_TABLE_LIST + conCount + "_LCount", seed.getLinkCount());
|
||||
prop.putNum(STR_TABLE_LIST + conCount + "_ICount", seed.getWordCount());
|
||||
prop.putNum(STR_TABLE_LIST + conCount + "_RCount", rcount);
|
||||
prop.putNum(STR_TABLE_LIST + conCount + "_ppm", seed.getPPM());
|
||||
prop.putNum(STR_TABLE_LIST + conCount + "_qph", Math.round(6000d * seed.getQPM()) / 100d);
|
||||
conCount++;
|
||||
} // seed != null
|
||||
} // while
|
||||
prop.putNum("list", conCount);
|
||||
|
||||
return prop;
|
||||
}
|
||||
}
|
Loading…
Reference in new issue