- removed the remote crawl option from the network configuration submenu

- added a remote crawl menu item to the index create menu; the new page also lists peers that provide remote crawl URLs
- set the remote crawl option to off by default. The option may be important, but it also confuses first-time users


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7158 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 89c2d8b81e
commit 670ba4d52b

@@ -522,10 +522,12 @@ storeTXCache=true
# order=parameters for requester; response=parameters for responder
# these values apply only for senior-senior - communication
# The delay value is the number of seconds between two separate orders
# crawlOrder: default value for remote crawl starts
# crawlResponse: set to true if a peer should retrieve remote crawl urls from other peers
crawlOrder=true
crawlOrderDepth=0
crawlOrderDelay=8
crawlResponse=true
crawlResponse=false
crawlResponseDepth=0
# indexing-exclusion - rules
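Since yacy.init uses Java's key=value properties format, the settings in this hunk can be read with a plain java.util.Properties loader. A minimal sketch under that assumption (the stand-alone reader is illustrative; YaCy itself accesses these values through its serverSwitch getConfig methods):

```java
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

// Illustrative stand-alone reader for the remote crawl settings above.
// YaCy's own code reads them via sb.getConfig()/getConfigBool() instead.
public class RemoteCrawlSettings {
    public static void main(String[] args) throws IOException {
        final Properties cfg = new Properties();
        try (FileInputStream in = new FileInputStream("yacy.init")) {
            cfg.load(in); // '#' lines are treated as comments
        }
        // crawlOrder: default value for remote crawl starts
        final boolean crawlOrder = Boolean.parseBoolean(cfg.getProperty("crawlOrder", "true"));
        // crawlResponse: now off by default, as changed in this commit
        final boolean crawlResponse = Boolean.parseBoolean(cfg.getProperty("crawlResponse", "false"));
        // number of seconds between two separate orders
        final int orderDelay = Integer.parseInt(cfg.getProperty("crawlOrderDelay", "8"));
        System.out.printf("order=%b response=%b delay=%ds%n", crawlOrder, crawlResponse, orderDelay);
    }
}
```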

@@ -44,12 +44,9 @@
::<div class="error">Inapplicable Setting Combination:</div>
::<div class="error">No changes were made!</div>
#(/commit)#
#(commitCrawlPlea)#::<div class="error">P2P operation can run without remote indexing, but runs better with remote indexing switched on. Please switch 'Accept Remote Crawl Requests' on.</div>#(/commitCrawlPlea)#
#(commitDHTIsRobinson)#::<div class="error">For P2P operation, at least DHT distribution or DHT receive (or both) must be set. You have thus defined a Robinson configuration.</div>#(/commitDHTIsRobinson)#
#(commitDHTNoGlobalSearch)#::<div class="error">Global Search in P2P configuration is only allowed if index receive is switched on. You have a P2P configuration, but are not allowed to search other peers.</div>#(/commitDHTNoGlobalSearch)#
#(commitRobinson)#::<div class="commit">For Robinson Mode, index distribution and receive is switched off.</div>#(/commitRobinson)#
#(commitRobinsonWithRemoteIndexing)#::<div class="commit">This Robinson Mode switches remote indexing on, but limits targets to peers within the same cluster. Remote indexing requests from peers within the same cluster are accepted.</div>#(/commitRobinsonWithRemoteIndexing)#
#(commitRobinsonWithoutRemoteIndexing)#::<div class="commit">This Robinson Mode does not allow any remote indexing (neither requests remote indexing, nor accepts it).</div>#(/commitRobinsonWithoutRemoteIndexing)#
#(commitPasswordWarning)#::<div class="error">With this configuration, automatic authentication from localhost is not allowed! Please open the <a href="ConfigAccounts_p.html">Account Configuration</a> and set a new password.</div>#(/commitPasswordWarning)#
<form id="NetworkForm" method="post" action="ConfigNetwork_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
@@ -142,17 +139,8 @@
#(indexReceiveBlockBlacklistChecked.off)#::checked="checked" #(/indexReceiveBlockBlacklistChecked.off)#/>
<label for="indexReceiveBlockBlacklistOff">accept transmitted URLs that match your blacklist</label>.
</dd>
<dt>
<label for="crawlResponse">Accept Remote Crawl Requests</label>
<input type="checkbox" id="crawlResponse" name="crawlResponse" #(crawlResponse)#::checked="checked" #(/crawlResponse)#/>
</dt>
<dd>
Perform web indexing upon request of another peer.<br />
This works only if you are a senior peer.<br />
<label for="acceptCrawlLimit">Load with a maximum of</label>
<input id="acceptCrawlLimit" name="acceptCrawlLimit" type="text" size="4" maxlength="4" value="#[acceptCrawlLimit]#" /> pages per minute
</dd>
</dl>
<input type="submit" name="save" value="Save" />
</fieldset>
<fieldset>
@@ -216,7 +204,7 @@
If you leave the field empty, no peer asks your peer. If you fill in a '*', your peer is always asked.
<input type="text" id="peertags" name="peertags" value="#[peertags]#" size="40" maxlength="80" />
</dd>
</dl>
</dl>
<input type="submit" name="save" value="Save" />
</fieldset>
</form>

@@ -31,7 +31,6 @@ import java.util.HashSet;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MapTools;
import net.yacy.kelondro.workflow.BusyThread;
import de.anomic.data.WorkTables;
import de.anomic.http.server.HTTPDemon;
@@ -56,7 +55,7 @@ public class ConfigNetwork_p {
if (post != null) {
// store this call as api call
sb.tables.recordAPICall(post, "ConfigNetwork.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "network settings");
sb.tables.recordAPICall(post, "ConfigNetwork_p.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "network settings");
if (post.containsKey("changeNetwork")) {
final String networkDefinition = post.get("networkDefinition", "defaults/yacy.network.freeworld.unit");
@@ -75,24 +74,14 @@ public class ConfigNetwork_p {
}
if (post.containsKey("save")) {
boolean crawlResponse = post.get("crawlResponse", "off").equals("on");
// DHT control
boolean indexDistribute = post.get("indexDistribute", "").equals("on");
boolean indexReceive = post.get("indexReceive", "").equals("on");
final boolean robinsonmode = post.get("network", "").equals("robinson");
final String clustermode = post.get("cluster.mode", "publicpeer");
if (robinsonmode) {
indexDistribute = false;
indexReceive = false;
if ((clustermode.equals("privatepeer")) || (clustermode.equals("publicpeer"))) {
prop.put("commitRobinsonWithoutRemoteIndexing", "1");
crawlResponse = false;
}
if ((clustermode.equals("privatecluster")) || (clustermode.equals("publiccluster"))) {
prop.put("commitRobinsonWithRemoteIndexing", "1");
crawlResponse = true;
}
commit = 1;
} else {
if (!indexDistribute && !indexReceive) {
@@ -104,9 +93,6 @@ public class ConfigNetwork_p {
if (!indexReceive) prop.put("commitDHTNoGlobalSearch", "1");
commit = 1;
}
if (!crawlResponse) {
prop.put("commitCrawlPlea", "1");
}
}
if (indexDistribute) {
@@ -147,31 +133,6 @@ public class ConfigNetwork_p {
}
sb.setConfig("cluster.mode", post.get("cluster.mode", "publicpeer"));
// read remote crawl request settings
sb.setConfig("crawlResponse", (crawlResponse) ? "true" : "false");
int newppm = 1;
try {
newppm = Math.max(1, Integer.parseInt(post.get("acceptCrawlLimit", "1")));
} catch (final NumberFormatException e) {}
final long newBusySleep = Math.max(100, 60000 / newppm);
// propagate to crawler
final BusyThread rct = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, newBusySleep * 3);
rct.setBusySleep(newBusySleep);
rct.setIdleSleep(newBusySleep * 3);
// propagate to loader
final BusyThread rcl = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 5);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, newBusySleep * 10);
rcl.setBusySleep(newBusySleep * 5);
rcl.setIdleSleep(newBusySleep * 10);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, Long.toString(newBusySleep));
sb.setConfig("cluster.peers.ipport", checkIPPortList(post.get("cluster.peers.ipport", "")));
sb.setConfig("cluster.peers.yacydomain", checkYaCyDomainList(post.get("cluster.peers.yacydomain", "")));
@@ -238,7 +199,7 @@ public class ConfigNetwork_p {
return prop;
}
public static String normalizedList(String input) {
private static String normalizedList(String input) {
input = input.replace(' ', ',');
input = input.replace(';', ',');
input = input.replaceAll(",,", ",");
@@ -247,7 +208,7 @@ public class ConfigNetwork_p {
return input;
}
public static String checkYaCyDomainList(String input) {
private static String checkYaCyDomainList(String input) {
input = normalizedList(input);
final String[] s = input.split(",");
input = "";
@@ -259,7 +220,7 @@ public class ConfigNetwork_p {
return input.substring(1);
}
public static String checkIPPortList(String input) {
private static String checkIPPortList(String input) {
input = normalizedList(input);
final String[] s = input.split(",");
input = "";

@@ -0,0 +1,75 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Remote Crawl Configuration</title>
#%env/templates/metas.template%#
</head>
<body id="RemoteCrawl">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Remote Crawler</h2>
The remote crawler is a process that requests URLs from other peers.
Peers offer remote-crawl URLs if the flag 'Do Remote Indexing'
is switched on when a crawl is started.
<fieldset>
<legend>
<label>Remote Crawler Configuration</label>
</legend>
<form id="ConfigForm" method="post" action="RemoteCrawl_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<dl>
<dt>
<label for="crawlResponse">Accept Remote Crawl Requests</label>
<input type="checkbox" id="crawlResponse" name="crawlResponse" onclick="window.location.href='RemoteCrawl_p.html?#(crawlResponse)#crawlResponse=on::crawlResponse=off#(/crawlResponse)#'" #(crawlResponse)#::checked="checked" #(/crawlResponse)#/>
</dt>
<dd>
Perform web indexing upon request of another peer.<br />
<label for="acceptCrawlLimit">Load with a maximum of</label>
<input id="acceptCrawlLimit" name="acceptCrawlLimit" type="text" size="4" maxlength="4" value="#[acceptCrawlLimit]#" /> pages per minute
<input type="submit" name="save" value="Save" />
<p>Crawl results will appear in the <a href="CrawlResults.html?process=6">Crawl Result Monitor</a></p>
</dd>
</dl>
</form>
</fieldset>
<fieldset>
<legend>
<label>Peers offering remote crawl URLs</label>
</legend>
If the remote crawl option is switched on, then this peer will load URLs from the following remote peers:
<form method="get" action="#">
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><strong>Name</strong><br/></td>
<td><strong>URLs for<br/>Remote<br/>Crawl</strong></td>
<td><strong>Release/<br/>SVN</strong></td>
<td><strong>PPM</strong></td>
<td><strong>QPH</strong></td>
<td><strong>Last<br/>Seen</strong></td>
<td><strong>UTC<br/>Offset</strong></td>
<td style="width:70px;"><strong>Uptime</strong></td>
<td><strong>Links</strong></td>
<td><strong>RWIs</strong></td>
<td><strong>Age</strong></td>
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td><a href="http://www.#[fullname]#.yacy">#[shortname]#</a></td>
<td align="right">#[RCount]#</td>
<td align="right">#[version]#</td>
<td align="right">#[ppm]#</td>
<td align="right">#[qph]#</td>
<td align="right">#[lastSeen]#</td>
<td align="right">#[utc]#</td>
<td align="right">#[uptime]#</td>
<td align="right">#[LCount]#</td>
<td align="right">#[ICount]#</td>
<td align="right">#[age]#</td>
</tr>
#{/list}#
</table>
</form>
</fieldset>
#%env/templates/footer.template%#
</body>
</html>
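The #[key]#, #(key)#…::…#(/key)# and #{list}#…#{/list}# markers in this template are placeholders for YaCy's template engine; the servlet below fills them through a serverObjects map using numbered keys such as list_0_shortname, plus the row count in "list". A rough sketch of how such numbered keys expand into table rows (a simplified stand-in, not the real template engine):

```java
import java.util.HashMap;
import java.util.Map;

// Simplified stand-in for the #{list}# expansion: the servlet stores one
// key per row and field ("list_<i>_<field>") plus the row count in "list".
public class ListTemplateSketch {
    public static void main(String[] args) {
        final Map<String, String> prop = new HashMap<String, String>();
        prop.put("list", "2"); // row count, as set by prop.putNum("list", conCount)
        prop.put("list_0_shortname", "peerA");
        prop.put("list_0_RCount", "1234");
        prop.put("list_1_shortname", "peerB");
        prop.put("list_1_RCount", "99");
        final int rows = Integer.parseInt(prop.get("list"));
        final StringBuilder html = new StringBuilder();
        for (int i = 0; i < rows; i++) {
            html.append("<tr><td>").append(prop.get("list_" + i + "_shortname"))
                .append("</td><td align=\"right\">").append(prop.get("list_" + i + "_RCount"))
                .append("</td></tr>\n");
        }
        System.out.print(html); // two <tr> rows, analogous to the table above
    }
}
```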

@@ -0,0 +1,155 @@
// RemoteCrawl_p.java
// --------------------
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 20.04.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2010-09-02 21:24:22 +0200 (Do, 02 Sep 2010) $
// $LastChangedRevision: 7092 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.workflow.BusyThread;
import de.anomic.data.WorkTables;
import de.anomic.search.Switchboard;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyVersion;
public class RemoteCrawl_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
if (post != null) {
// store this call as api call
sb.tables.recordAPICall(post, "RemoteCrawl_p.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "remote crawler configuration");
if (post.containsKey("crawlResponse")) {
boolean crawlResponse = post.get("crawlResponse", "off").equals("on");
// read remote crawl request settings
sb.setConfig("crawlResponse", (crawlResponse) ? "true" : "false");
}
if (post.containsKey("acceptCrawlLimit")) {
// read remote crawl request settings
int newppm = 1;
try {
newppm = Math.max(1, Integer.parseInt(post.get("acceptCrawlLimit", "1")));
} catch (final NumberFormatException e) {}
final long newBusySleep = Math.max(100, 60000 / newppm);
// propagate to crawler
final BusyThread rct = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, newBusySleep * 3);
rct.setBusySleep(newBusySleep);
rct.setIdleSleep(newBusySleep * 3);
// propagate to loader
final BusyThread rcl = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 5);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, newBusySleep * 10);
rcl.setBusySleep(newBusySleep * 5);
rcl.setIdleSleep(newBusySleep * 10);
sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, Long.toString(newBusySleep));
}
}
// write remote crawl request settings
prop.put("crawlResponse", sb.getConfigBool("crawlResponse", false) ? "1" : "0");
long RTCbusySleep = 100;
try {
RTCbusySleep = Math.max(1, Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, "100")));
} catch (final NumberFormatException e) {}
final int RTCppm = (int) (60000L / RTCbusySleep);
prop.put("acceptCrawlLimit", RTCppm);
// set seed information directly
sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool("crawlResponse", false));
// -------------------------------------------------------------------------------------
// write network list
final String STR_TABLE_LIST = "list_";
int conCount = 0;
boolean dark = true;
yacySeed seed;
Iterator<yacySeed> e = null;
e = sb.peers.seedsSortedConnected(false, yacySeed.RCOUNT);
//e = sb.peers.seedsSortedConnected(false, yacySeed.LCOUNT);
Pattern peerSearchPattern = null;
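// note: peerSearchPattern is never assigned in this servlet, so the
// search filter inside the loop below never runs and is effectively inert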
while (e.hasNext() && conCount < 300) {
seed = e.next();
assert seed != null;
if (seed != null) {
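// list only peers seen within the last 12 hours (720 minutes)
// that actually offer remote crawl URLs (RCOUNT > 0)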
final long lastseen = Math.abs((System.currentTimeMillis() - seed.getLastSeenUTC()) / 1000 / 60);
if (lastseen > 720) continue;
long rcount = seed.getLong(yacySeed.RCOUNT, 0);
if (rcount == 0) continue;
if ((post != null && post.containsKey("search")) && peerSearchPattern != null /*(wrongregex == null)*/) {
boolean abort = true;
Matcher m = peerSearchPattern.matcher (seed.getName());
if (m.find ()) {
abort = false;
}
m = peerSearchPattern.matcher (seed.hash);
if (m.find ()) {
abort = false;
}
if (abort) continue;
}
prop.put(STR_TABLE_LIST + conCount + "_dark", ((dark) ? 1 : 0) ); dark=!dark;
String shortname = seed.get(yacySeed.NAME, "deadlink");
if (shortname.length() > 20) shortname = shortname.substring(0, 20) + "...";
prop.putHTML(STR_TABLE_LIST + conCount + "_shortname", shortname);
prop.putHTML(STR_TABLE_LIST + conCount + "_fullname", seed.get(yacySeed.NAME, "deadlink"));
prop.put(STR_TABLE_LIST + conCount + "_age", seed.getAge());
prop.putHTML(STR_TABLE_LIST + conCount + "_version", yacyVersion.combined2prettyVersion(seed.get(yacySeed.VERSION, "0.1"), shortname));
prop.putNum(STR_TABLE_LIST + conCount + "_lastSeen", /*seed.getLastSeenString() + " " +*/ lastseen);
prop.put(STR_TABLE_LIST + conCount + "_utc", seed.get(yacySeed.UTC, "-"));
prop.putHTML(STR_TABLE_LIST + conCount + "_uptime", DateFormatter.formatInterval(60000 * Long.parseLong(seed.get(yacySeed.UPTIME, "0"))));
prop.putNum(STR_TABLE_LIST + conCount + "_LCount", seed.getLinkCount());
prop.putNum(STR_TABLE_LIST + conCount + "_ICount", seed.getWordCount());
prop.putNum(STR_TABLE_LIST + conCount + "_RCount", rcount);
prop.putNum(STR_TABLE_LIST + conCount + "_ppm", seed.getPPM());
prop.putNum(STR_TABLE_LIST + conCount + "_qph", Math.round(6000d * seed.getQPM()) / 100d);
conCount++;
} // seed != null
} // while
prop.putNum("list", conCount);
return prop;
}
}
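The pages-per-minute setting above is stored as a busy-sleep interval, newBusySleep = max(100, 60000 / ppm), and converted back with ppm = 60000 / busySleep when the form is rendered. Because of the 100 ms clamp, the effective rate is capped at 600 PPM. A minimal sketch of that round trip (helper names are illustrative, not part of the YaCy codebase):

```java
// Minimal sketch of the PPM <-> busy-sleep conversion used above;
// helper names are illustrative, not part of the YaCy codebase.
public class PpmSleepSketch {
    // pages per minute -> milliseconds between triggered crawl jobs
    static long busySleepFor(final int ppm) {
        return Math.max(100, 60000L / Math.max(1, ppm));
    }
    // inverse mapping, as used when filling the acceptCrawlLimit field
    static int ppmFor(final long busySleep) {
        return (int) (60000L / Math.max(1, busySleep));
    }
    public static void main(String[] args) {
        System.out.println(busySleepFor(60));   // 1000 ms; 60 PPM round-trips exactly
        System.out.println(ppmFor(1000));       // 60
        System.out.println(busySleepFor(1200)); // 100 ms: the clamp kicks in
        System.out.println(ppmFor(100));        // 600 PPM effective maximum
    }
}
```

The triggered-crawl thread then idles at three times that interval, and the loader thread runs at five/ten times it, so remote URL lists are fetched less often than the queued pages are processed.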

@@ -5,32 +5,33 @@
<div class="SubMenugroup">
<h3>Crawler/Spider</h3>
<ul class="SubMenu">
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Web/FTP Crawl Start<br/>(Advanced)</a></li>
<li><a href="/CrawlStart_p.html" class="MenuItemLink lock">Crawl Start<br/>(Advanced)</a></li>
<li><a href="/Load_MediawikiWiki.html" class="MenuItemLink">Crawling of<br/>Media Wikis</a></li>
<li><a href="/Load_PHPBB3.html" class="MenuItemLink">Crawling of<br/>phpBB3 Forums</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Database Reader</h3>
<h3>Content Import</h3>
<ul class="SubMenu">
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">Database Reader for<br/>phpBB3 Forums</a></li>
<li><a href="/IndexImportWikimedia_p.html" class="MenuItemLink lock">Dump Reader for <br/>Wikimedia dumps</a></li>
<li><a href="/Load_RSS_p.html" class="MenuItemLink lock">RSS Feed<br/>Importer</a></li>
<li><a href="/IndexImportOAIPMH_p.html" class="MenuItemLink lock">OAI-PMH<br/>Importer</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Content Import</h3>
<h3>Network Harvesting Methods</h3>
<ul class="SubMenu">
<li><a href="/Load_RSS_p.html" class="MenuItemLink lock">RSS Feed<br/>Importer</a></li>
<li><a href="/IndexImportOAIPMH_p.html" class="MenuItemLink lock">OAI-PMH Server<br/>Importer</a></li>
<li><a href="/RemoteCrawl_p.html" class="MenuItemLink lock">Remote Crawling: loading<br/>of URLs for other peers</a></li>
<li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Document Harvesting<br/>with Scraping Proxy</a></li>
</ul>
</div>
<div class="SubMenugroup">
<h3>Other Harvesting Methods</h3>
<h3>Database Reader</h3>
<ul class="SubMenu">
<li><a href="/ProxyIndexingMonitor_p.html" class="MenuItemLink lock">Document Harvesting<br/>with Scraping Proxy</a></li>
<li><a href="/ContentIntegrationPHPBB3_p.html" class="MenuItemLink lock">Database Reader<br/>for phpBB3 Forums</a></li>
<li><a href="/IndexImportWikimedia_p.html" class="MenuItemLink lock">Dump Reader for <br/>Wikimedia dumps</a></li>
</ul>
</div>
</div>

@@ -75,7 +75,7 @@ public final class yacySeedDB implements AlternativeDomainNames {
*/
public static final String DBFILE_OWN_SEED = "mySeed.txt";
public static final String[] sortFields = new String[] {yacySeed.LCOUNT, yacySeed.ICOUNT, yacySeed.UPTIME, yacySeed.VERSION, yacySeed.LASTSEEN};
public static final String[] sortFields = new String[] {yacySeed.LCOUNT, yacySeed.RCOUNT, yacySeed.ICOUNT, yacySeed.UPTIME, yacySeed.VERSION, yacySeed.LASTSEEN};
public static final String[] longaccFields = new String[] {yacySeed.LCOUNT, yacySeed.ICOUNT, yacySeed.ISPEED};
public static final String[] doubleaccFields = new String[] {yacySeed.RSPEED};
