From 670ba4d52b29b4a132d05fb0da157e262b00eb45 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 16 Sep 2010 00:39:05 +0000 Subject: [PATCH] - removed the remote crawl option from the network configuration submenu and - added a remote crawl menu item to the index create menu. This menu also shows a list of peers that provide remote crawl urls - set remote crawl option by default to off. This option may be important but it also confuses first-time users git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7158 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 4 +- htroot/ConfigNetwork_p.html | 16 +- htroot/ConfigNetwork_p.java | 47 +----- htroot/RemoteCrawl_p.html | 75 +++++++++ htroot/RemoteCrawl_p.java | 155 ++++++++++++++++++ .../env/templates/submenuIndexCreate.template | 19 ++- source/de/anomic/yacy/yacySeedDB.java | 2 +- 7 files changed, 250 insertions(+), 68 deletions(-) create mode 100644 htroot/RemoteCrawl_p.html create mode 100644 htroot/RemoteCrawl_p.java diff --git a/defaults/yacy.init b/defaults/yacy.init index d252764d6..c2a3caf08 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -522,10 +522,12 @@ storeTXCache=true # order=parameters for requester; response=parameters for responder # these values apply only for senior-senior - communication # The delay value is number of seconds bewteen two separate orders +# crawlOrder: default value for remote crawl starts +# crawlResponse: set to true if a peer should retrieve remote crawl urls from other peers crawlOrder=true crawlOrderDepth=0 crawlOrderDelay=8 -crawlResponse=true +crawlResponse=false crawlResponseDepth=0 # indexing-exclusion - rules diff --git a/htroot/ConfigNetwork_p.html b/htroot/ConfigNetwork_p.html index 99bae70e1..4c49135c5 100644 --- a/htroot/ConfigNetwork_p.html +++ b/htroot/ConfigNetwork_p.html @@ -44,12 +44,9 @@ ::
Inapplicable Setting Combination:
::
No changes were made!
#(/commit)# - #(commitCrawlPlea)#::
P2P operation can run without remote indexing, but runs better with remote indexing switched on. Please switch 'Accept Remote Crawl Requests' on.
#(/commitCrawlPlea)# #(commitDHTIsRobinson)#::
For P2P operation, at least DHT distribution or DHT receive (or both) must be set. You have thus defined a Robinson configuration.
#(/commitDHTIsRobinson)# #(commitDHTNoGlobalSearch)#::
Global Search in P2P configuration is only allowed, if index receive is switched on. You have a P2P configuration, but are not allowed to search other peers.
#(/commitDHTNoGlobalSearch)# #(commitRobinson)#::
For Robinson Mode, index distribution and receive are switched off.
#(/commitRobinson)# - #(commitRobinsonWithRemoteIndexing)#::
This Robinson Mode switches remote indexing on, but limits targets to peers within the same cluster. Remote indexing requests from peers within the same cluster are accepted.
#(/commitRobinsonWithRemoteIndexing)# - #(commitRobinsonWithoutRemoteIndexing)#::
This Robinson Mode does not allow any remote indexing (neither requests remote indexing, nor accepts it).
#(/commitRobinsonWithoutRemoteIndexing)# #(commitPasswordWarning)#::
With this configuration it is not allowed to authenticate automatically from localhost! Please open the Account Configuration and set a new password.
#(/commitPasswordWarning)#
@@ -142,17 +139,8 @@ #(indexReceiveBlockBlacklistChecked.off)#::checked="checked" #(/indexReceiveBlockBlacklistChecked.off)#/> . -
- - -
-
- Perform web indexing upon request of another peer.
- This works only if you are a senior peer.
- - pages per minute -
+
@@ -216,7 +204,7 @@ If you leave the field empty, no peer asks your peer. If you fill in a '*', your peer is always asked. - +
diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java index 3a1757591..8eee35d27 100644 --- a/htroot/ConfigNetwork_p.java +++ b/htroot/ConfigNetwork_p.java @@ -31,7 +31,6 @@ import java.util.HashSet; import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MapTools; -import net.yacy.kelondro.workflow.BusyThread; import de.anomic.data.WorkTables; import de.anomic.http.server.HTTPDemon; @@ -56,7 +55,7 @@ public class ConfigNetwork_p { if (post != null) { // store this call as api call - sb.tables.recordAPICall(post, "ConfigNetwork.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "network settings"); + sb.tables.recordAPICall(post, "ConfigNetwork_p.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "network settings"); if (post.containsKey("changeNetwork")) { final String networkDefinition = post.get("networkDefinition", "defaults/yacy.network.freeworld.unit"); @@ -75,24 +74,14 @@ public class ConfigNetwork_p { } if (post.containsKey("save")) { - boolean crawlResponse = post.get("crawlResponse", "off").equals("on"); // DHT control boolean indexDistribute = post.get("indexDistribute", "").equals("on"); boolean indexReceive = post.get("indexReceive", "").equals("on"); final boolean robinsonmode = post.get("network", "").equals("robinson"); - final String clustermode = post.get("cluster.mode", "publicpeer"); if (robinsonmode) { indexDistribute = false; indexReceive = false; - if ((clustermode.equals("privatepeer")) || (clustermode.equals("publicpeer"))) { - prop.put("commitRobinsonWithoutRemoteIndexing", "1"); - crawlResponse = false; - } - if ((clustermode.equals("privatecluster")) || (clustermode.equals("publiccluster"))) { - prop.put("commitRobinsonWithRemoteIndexing", "1"); - crawlResponse = true; - } commit = 1; } else { if (!indexDistribute && !indexReceive) { @@ -104,9 +93,6 @@ public class ConfigNetwork_p { if (!indexReceive) prop.put("commitDHTNoGlobalSearch", "1"); commit = 1; } - if 
(!crawlResponse) { - prop.put("commitCrawlPlea", "1"); - } } if (indexDistribute) { @@ -147,31 +133,6 @@ public class ConfigNetwork_p { } sb.setConfig("cluster.mode", post.get("cluster.mode", "publicpeer")); - - // read remote crawl request settings - sb.setConfig("crawlResponse", (crawlResponse) ? "true" : "false"); - int newppm = 1; - try { - newppm = Math.max(1, Integer.parseInt(post.get("acceptCrawlLimit", "1"))); - } catch (final NumberFormatException e) {} - final long newBusySleep = Math.max(100, 60000 / newppm); - - // propagate to crawler - final BusyThread rct = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep); - sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, newBusySleep * 3); - rct.setBusySleep(newBusySleep); - rct.setIdleSleep(newBusySleep * 3); - - // propagate to loader - final BusyThread rcl = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER); - sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 5); - sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, newBusySleep * 10); - rcl.setBusySleep(newBusySleep * 5); - rcl.setIdleSleep(newBusySleep * 10); - - sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, Long.toString(newBusySleep)); - sb.setConfig("cluster.peers.ipport", checkIPPortList(post.get("cluster.peers.ipport", ""))); sb.setConfig("cluster.peers.yacydomain", checkYaCyDomainList(post.get("cluster.peers.yacydomain", ""))); @@ -238,7 +199,7 @@ public class ConfigNetwork_p { return prop; } - public static String normalizedList(String input) { + private static String normalizedList(String input) { input = input.replace(' ', ','); input = input.replace(' ', ';'); input = input.replaceAll(",,", ","); @@ -247,7 +208,7 @@ public class ConfigNetwork_p { return input; } - public static String 
checkYaCyDomainList(String input) { + private static String checkYaCyDomainList(String input) { input = normalizedList(input); final String[] s = input.split(","); input = ""; @@ -259,7 +220,7 @@ public class ConfigNetwork_p { return input.substring(1); } - public static String checkIPPortList(String input) { + private static String checkIPPortList(String input) { input = normalizedList(input); final String[] s = input.split(","); input = ""; diff --git a/htroot/RemoteCrawl_p.html b/htroot/RemoteCrawl_p.html new file mode 100644 index 000000000..29eb6d67e --- /dev/null +++ b/htroot/RemoteCrawl_p.html @@ -0,0 +1,75 @@ + + + + YaCy '#[clientname]#': Remote Crawl Configuration + #%env/templates/metas.template%# + + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%# +

Remote Crawler

+ The remote crawler is a process that requests URLs from other peers. + Peers offer remote-crawl URLs if the flag 'Do Remote Indexing' + is switched on when a crawl is started.
+ + + +
+
+
+ + +
+
+ Perform web indexing upon request of another peer.
+ + pages per minute + +

Crawl results will appear in the Crawl Result Monitor

+
+
+
+
+ +
+ + + + If the remote crawl option is switched on, then this peer will load URLs from the following remote peers: +
+ + + + + + + + + + + + + + + #{list}# + + + + + + + + + + + + + + #{/list}# +
Name
URLs for
Remote
Crawl
Release/
SVN
PPMQPHLast
Seen
UTC
Offset
UptimeLinksRWIsAge
#[shortname]##[RCount]##[version]##[ppm]##[qph]##[lastSeen]##[utc]##[uptime]##[LCount]##[ICount]##[age]#
+
+
+ #%env/templates/footer.template%# + + \ No newline at end of file diff --git a/htroot/RemoteCrawl_p.java b/htroot/RemoteCrawl_p.java new file mode 100644 index 000000000..dbc59fc48 --- /dev/null +++ b/htroot/RemoteCrawl_p.java @@ -0,0 +1,155 @@ +// RemoteCrawl_p.java +// -------------------- +// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 20.04.2007 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2010-09-02 21:24:22 +0200 (Do, 02 Sep 2010) $ +// $LastChangedRevision: 7092 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +import java.util.Iterator; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.util.DateFormatter; +import net.yacy.kelondro.workflow.BusyThread; + +import de.anomic.data.WorkTables; +import de.anomic.search.Switchboard; +import de.anomic.search.SwitchboardConstants; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.yacy.yacySeed; +import de.anomic.yacy.yacyVersion; + +public class RemoteCrawl_p { + + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + + if (post != null) { + + // store this call as api call + sb.tables.recordAPICall(post, "RemoteCrawl_p.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "remote crawler configuration"); + + if (post.containsKey("crawlResponse")) { + boolean crawlResponse = post.get("crawlResponse", "off").equals("on"); + + // read remote crawl request settings + sb.setConfig("crawlResponse", (crawlResponse) ? 
"true" : "false"); + } + + if (post.containsKey("acceptCrawlLimit")) { + // read remote crawl request settings + int newppm = 1; + try { + newppm = Math.max(1, Integer.parseInt(post.get("acceptCrawlLimit", "1"))); + } catch (final NumberFormatException e) {} + final long newBusySleep = Math.max(100, 60000 / newppm); + + // propagate to crawler + final BusyThread rct = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); + sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep); + sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, newBusySleep * 3); + rct.setBusySleep(newBusySleep); + rct.setIdleSleep(newBusySleep * 3); + + // propagate to loader + final BusyThread rcl = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER); + sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 5); + sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, newBusySleep * 10); + rcl.setBusySleep(newBusySleep * 5); + rcl.setIdleSleep(newBusySleep * 10); + + sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, Long.toString(newBusySleep)); + } + } + + // write remote crawl request settings + prop.put("crawlResponse", sb.getConfigBool("crawlResponse", false) ? 
"1" : "0"); + long RTCbusySleep = 100; + try { + RTCbusySleep = Math.max(1, Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, "100"))); + } catch (final NumberFormatException e) {} + final int RTCppm = (int) (60000L / RTCbusySleep); + prop.put("acceptCrawlLimit", RTCppm); + + // set seed information directly + sb.peers.mySeed().setFlagAcceptRemoteCrawl(sb.getConfigBool("crawlResponse", false)); + + // ------------------------------------------------------------------------------------- + // write network list + final String STR_TABLE_LIST = "list_"; + int conCount = 0; + + boolean dark = true; + yacySeed seed; + Iterator e = null; + e = sb.peers.seedsSortedConnected(false, yacySeed.RCOUNT); + //e = sb.peers.seedsSortedConnected(false, yacySeed.LCOUNT); + Pattern peerSearchPattern = null; + while (e.hasNext() && conCount < 300) { + seed = e.next(); + assert seed != null; + if (seed != null) { + final long lastseen = Math.abs((System.currentTimeMillis() - seed.getLastSeenUTC()) / 1000 / 60); + if (lastseen > 720) continue; + long rcount = seed.getLong(yacySeed.RCOUNT, 0); + if (rcount == 0) continue; + if ((post != null && post.containsKey("search")) && peerSearchPattern != null /*(wrongregex == null)*/) { + boolean abort = true; + Matcher m = peerSearchPattern.matcher (seed.getName()); + if (m.find ()) { + abort = false; + } + m = peerSearchPattern.matcher (seed.hash); + if (m.find ()) { + abort = false; + } + if (abort) continue; + } + prop.put(STR_TABLE_LIST + conCount + "_dark", ((dark) ? 
1 : 0) ); dark=!dark; + String shortname = seed.get(yacySeed.NAME, "deadlink"); + if (shortname.length() > 20) shortname = shortname.substring(0, 20) + "..."; + prop.putHTML(STR_TABLE_LIST + conCount + "_shortname", shortname); + prop.putHTML(STR_TABLE_LIST + conCount + "_fullname", seed.get(yacySeed.NAME, "deadlink")); + prop.put(STR_TABLE_LIST + conCount + "_age", seed.getAge()); + prop.putHTML(STR_TABLE_LIST + conCount + "_version", yacyVersion.combined2prettyVersion(seed.get(yacySeed.VERSION, "0.1"), shortname)); + prop.putNum(STR_TABLE_LIST + conCount + "_lastSeen", /*seed.getLastSeenString() + " " +*/ lastseen); + prop.put(STR_TABLE_LIST + conCount + "_utc", seed.get(yacySeed.UTC, "-")); + prop.putHTML(STR_TABLE_LIST + conCount + "_uptime", DateFormatter.formatInterval(60000 * Long.parseLong(seed.get(yacySeed.UPTIME, "0")))); + prop.putNum(STR_TABLE_LIST + conCount + "_LCount", seed.getLinkCount()); + prop.putNum(STR_TABLE_LIST + conCount + "_ICount", seed.getWordCount()); + prop.putNum(STR_TABLE_LIST + conCount + "_RCount", rcount); + prop.putNum(STR_TABLE_LIST + conCount + "_ppm", seed.getPPM()); + prop.putNum(STR_TABLE_LIST + conCount + "_qph", Math.round(6000d * seed.getQPM()) / 100d); + conCount++; + } // seed != null + } // while + prop.putNum("list", conCount); + + return prop; + } +} diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index 27a77b2f1..896c31919 100644 --- a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -5,32 +5,33 @@ \ No newline at end of file diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index e5a18feb4..c121bf927 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -75,7 +75,7 @@ public final class yacySeedDB implements AlternativeDomainNames { */ public static final String DBFILE_OWN_SEED = "mySeed.txt"; - public static 
final String[] sortFields = new String[] {yacySeed.LCOUNT, yacySeed.ICOUNT, yacySeed.UPTIME, yacySeed.VERSION, yacySeed.LASTSEEN}; + public static final String[] sortFields = new String[] {yacySeed.LCOUNT, yacySeed.RCOUNT, yacySeed.ICOUNT, yacySeed.UPTIME, yacySeed.VERSION, yacySeed.LASTSEEN}; public static final String[] longaccFields = new String[] {yacySeed.LCOUNT, yacySeed.ICOUNT, yacySeed.ISPEED}; public static final String[] doubleaccFields = new String[] {yacySeed.RSPEED};