enhanced remote crawling:

- 300 PPM is the default remote crawl speed now (remote crawling itself stays switched off by default; peers that switch it on should expect more traffic; see the arithmetic sketch below)
- better timing for the busy queue
- more remote crawl URLs retrieved per request
- better time-out values
- better tracking of how many remote crawl URLs a peer still provides
- more logging for the result of receipt sending

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7159 6c8d7289-2bf4-0310-a012-ef5d649a1542
commit 461a2a6ec7 (parent 670ba4d52b)
Author: orbiter

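A note on the PPM arithmetic before the diff: the pages-per-minute target is converted to a busy-sleep interval with the 60000 / ppm formula visible in setRemotecrawlPPM below, clamped to a 100 ms floor. A minimal sketch of that mapping (PpmMath is a hypothetical class; the formula and defaults are taken from the diff):

    public final class PpmMath {
        // one minute in milliseconds divided by the PPM target,
        // clamped to a 100 ms floor as in Switchboard.setRemotecrawlPPM
        static long busySleepMillis(final int ppm) {
            return Math.max(100, 60000L / Math.max(1, ppm));
        }
        public static void main(final String[] args) {
            System.out.println(busySleepMillis(300)); // 200 ms, the new 62_remotetriggeredcrawl_busysleep default
            System.out.println(busySleepMillis(60));  // 1000 ms, the old default pace
        }
    }
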
@@ -526,7 +526,6 @@ storeTXCache=true
 # crawlResponse: set to true if a peer should retrieve remote crawl urls from other peers
 crawlOrder=true
 crawlOrderDepth=0
-crawlOrderDelay=8
 crawlResponse=false
 crawlResponseDepth=0
@@ -566,12 +565,12 @@ filterOutStopwordsFromTopwords=true
 50_localcrawl_busysleep=20
 50_localcrawl_memprereq=12582912
 50_localcrawl_isPaused=false
-60_remotecrawlloader_idlesleep=60000
-60_remotecrawlloader_busysleep=10000
+60_remotecrawlloader_idlesleep=4000
+60_remotecrawlloader_busysleep=800
 60_remotecrawlloader_memprereq=12582912
 60_remotecrawlloader_isPaused=false
-62_remotetriggeredcrawl_idlesleep=10000
-62_remotetriggeredcrawl_busysleep=1000
+62_remotetriggeredcrawl_idlesleep=2000
+62_remotetriggeredcrawl_busysleep=200
 62_remotetriggeredcrawl_memprereq=12582912
 62_remotetriggeredcrawl_isPaused=false
 70_surrogates_idlesleep=10000

@@ -15,7 +15,7 @@ network.unit.dht = true
 network.unit.dhtredundancy.junior = 1
 network.unit.dhtredundancy.senior = 3
 network.unit.dht.partitionExponent = 4
-network.unit.remotecrawl.speed = 6
+network.unit.remotecrawl.speed = 300
 network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt
 network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt
 network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt

@@ -211,11 +211,7 @@ public class PerformanceQueues_p {
                 threadName.equals(SwitchboardConstants.SEED_UPLOAD) ||
                 threadName.equals(SwitchboardConstants.CLEANUP)) {
                /* do not change any values */
-           } else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) ||
-                      threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) {
-               sb.setRemotecrawlPPM(Math.max(1, (int) (sb.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier)));
-           }
-           else {
+           } else {
                // load with new values
                idlesleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_idlesleep"), String.valueOf(idlesleep))) * multiplier);
                busysleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_busysleep"), String.valueOf(busysleep))) * multiplier);

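The branch removed from PerformanceQueues_p made performance profiles rescale the remote-crawl PPM (speed / multiplier), so a profile multiplier of 2.0 would have cut the new 300 PPM default to 150 PPM. After the change, profiles only scale the generic idle/busy sleep values and the remote-crawl pace is managed solely by setRemotecrawlPPM. A small sketch of the difference, with illustrative values (the formulas come from the diff, the class is hypothetical):

    public final class ProfileEffect {
        public static void main(final String[] args) {
            final double multiplier = 2.0; // e.g. a resource-saving profile
            // old behaviour: the profile also rescaled the remote-crawl PPM
            final int oldPpm = Math.max(1, (int) (300 / multiplier));   // 150 PPM
            // new behaviour: only generic thread sleeps are rescaled
            final long idlesleep = (long) (4000 * multiplier);          // 8000 ms for the loader
            System.out.println(oldPpm + " PPM, " + idlesleep + " ms");
        }
    }
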
@@ -32,7 +32,6 @@ import java.util.regex.Pattern;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.util.DateFormatter;
-import net.yacy.kelondro.workflow.BusyThread;
 import de.anomic.data.WorkTables;
 import de.anomic.search.Switchboard;
@@ -67,32 +66,13 @@ public class RemoteCrawl_p {
                try {
                    newppm = Math.max(1, Integer.parseInt(post.get("acceptCrawlLimit", "1")));
                } catch (final NumberFormatException e) {}
-               final long newBusySleep = Math.max(100, 60000 / newppm);
-               // propagate to crawler
-               final BusyThread rct = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
-               sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
-               sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, newBusySleep * 3);
-               rct.setBusySleep(newBusySleep);
-               rct.setIdleSleep(newBusySleep * 3);
-               // propagate to loader
-               final BusyThread rcl = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
-               sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 5);
-               sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, newBusySleep * 10);
-               rcl.setBusySleep(newBusySleep * 5);
-               rcl.setIdleSleep(newBusySleep * 10);
-               sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, Long.toString(newBusySleep));
+               sb.setRemotecrawlPPM(newppm);
            }
        }
        // write remote crawl request settings
        prop.put("crawlResponse", sb.getConfigBool("crawlResponse", false) ? "1" : "0");
-       long RTCbusySleep = 100;
-       try {
-           RTCbusySleep = Math.max(1, Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, "100")));
-       } catch (final NumberFormatException e) {}
+       long RTCbusySleep = Math.max(1, env.getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 100));
        final int RTCppm = (int) (60000L / RTCbusySleep);
        prop.put("acceptCrawlLimit", RTCppm);

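With the duplicated propagation gone, RemoteCrawl_p only stores the submitted PPM via sb.setRemotecrawlPPM(newppm) and derives the value it redisplays by inverting the stored busy-sleep. A minimal sketch of that round trip (hypothetical class; the formulas are from the diff):

    public final class PpmRoundTrip {
        public static void main(final String[] args) {
            final int newppm = 300;                                // submitted acceptCrawlLimit
            final long busySleep = Math.max(100, 60000L / newppm); // 200 ms, stored by setRemotecrawlPPM
            final int shownPpm = (int) (60000L / Math.max(1, busySleep)); // 300, redisplayed by the servlet
            System.out.println(shownPpm);
        }
    }
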
@@ -347,12 +347,12 @@ public class CrawlQueues {
            return false;
        }
-       if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
+       if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 20)) {
            // try a cleanup
            cleanup();
        }
        // check again
-       if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
+       if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 20)) {
            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.workers.size() + "), httpClients = " + ConnectionInfo.getCount());
            return false;
        }
@@ -363,12 +363,12 @@
            return false;
        }
-       if (remoteTriggeredCrawlJobSize() > 100) {
+       if (remoteTriggeredCrawlJobSize() > 200) {
            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: the remote-triggered crawl job queue is filled, omitting processing");
            return false;
        }
-       if (coreCrawlJobSize() > 0 && sb.indexingStorageProcessor.queueSize() > 0) {
+       if (coreCrawlJobSize() > 0 /*&& sb.indexingStorageProcessor.queueSize() > 0*/) {
            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: a local crawl is running, omitting processing");
            return false;
        }
@@ -380,9 +380,7 @@
                final Iterator<yacySeed> e = PeerSelection.getProvidesRemoteCrawlURLs(sb.peers);
                while (e.hasNext()) {
                    seed = e.next();
-                   if (seed != null) {
-                       remoteCrawlProviderHashes.add(seed.hash);
-                   }
+                   if (seed != null) remoteCrawlProviderHashes.add(seed.hash);
                }
            }
        }
@@ -391,7 +389,7 @@
        // take one entry from the provider list and load the entries from the remote peer
        seed = null;
        String hash = null;
-       while ((seed == null) && (!remoteCrawlProviderHashes.isEmpty())) {
+       while (seed == null && !remoteCrawlProviderHashes.isEmpty()) {
            hash = remoteCrawlProviderHashes.remove(remoteCrawlProviderHashes.size() - 1);
            if (hash == null) continue;
            seed = sb.peers.get(hash);
@@ -405,12 +403,12 @@
        if (seed == null) return false;
        // we know a peer which should provide remote crawl entries. load them now.
-       final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.peers, seed, 30, 60000);
+       final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.peers, seed, 60, 8000);
        if (feed == null || feed.isEmpty()) {
            // something is wrong with this provider. To prevent that we get not stuck with this peer
            // we remove it from the peer list
            sb.peers.peerActions.peerDeparture(seed, "no results from provided remote crawls");
-           // ask another peer
+           // try again and ask another peer
            return remoteCrawlLoaderJob();
        }
@@ -424,7 +422,7 @@
            try {
                url = new DigestURI(item.getLink(), null);
            } catch (final MalformedURLException e) {
-               url = null;
+               continue;
            }
            try {
                referrer = new DigestURI(item.getReferrer(), null);

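The url = null assignment in the catch block was a latent bug: control fell through to the code that uses url, risking a NullPointerException on a malformed link; continue skips the broken feed item instead. A sketch of the corrected loop shape, with java.net.URL standing in for YaCy's DigestURI:

    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.List;

    public final class SkipMalformed {
        static int countValid(final List<String> links) {
            int valid = 0;
            for (final String link : links) {
                final URL url;
                try {
                    url = new URL(link);
                } catch (final MalformedURLException e) {
                    continue; // previously: url = null, then fall-through into code using url
                }
                valid++;      // url is guaranteed non-null past the catch
            }
            return valid;
        }
        public static void main(final String[] args) {
            System.out.println(countValid(List.of("http://yacy.net/", "not a url"))); // prints 1
        }
    }
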
@@ -302,11 +302,6 @@ public final class Switchboard extends serverSwitch {
        // set a high maximum cache size to current size; this is adopted later automatically
        final int wordCacheMaxCount = (int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
        setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
-       // set network-specific performance attributes
-       if (this.firstInit) {
-           setRemotecrawlPPM(Math.max(1, (int) getConfigLong("network.unit.remotecrawl.speed", 60)));
-       }
        // load the network definition
        overwriteNetworkDefinition();
@@ -616,7 +611,7 @@ public final class Switchboard extends serverSwitch {
                SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT,
                SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM,
                0, Long.MAX_VALUE, 0, Long.MAX_VALUE),
-           30000);
+           10000);
        deployThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER, "Remote Crawl URL Loader", "thread that loads remote crawl lists from other peers", null,
            new InstantBusyThread(
                crawlQueues,
@@ -624,7 +619,7 @@ public final class Switchboard extends serverSwitch {
                SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT,
                SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM,
                10000, Long.MAX_VALUE, 10000, Long.MAX_VALUE),
-           30000); // error here?
+           10000); // error here?
        deployThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, "Local Crawl", "thread that performes a single crawl step from the local crawl queue", "/IndexCreateWWWLocalQueue_p.html",
            new InstantBusyThread(
                crawlQueues,
@@ -661,6 +656,11 @@ public final class Switchboard extends serverSwitch {
                Long.parseLong(getConfig(SwitchboardConstants.INDEX_DIST_BUSYSLEEP , "0")),
                Long.parseLong(getConfig(SwitchboardConstants.INDEX_DIST_MEMPREREQ , "1000000")));
+       // set network-specific performance attributes
+       if (this.firstInit) {
+           setRemotecrawlPPM(Math.max(1, (int) getConfigLong("network.unit.remotecrawl.speed", 60)));
+       }
        // test routine for snippet fetch
        //Set query = new HashSet();
        //query.add(CrawlSwitchboardEntry.word2hash("Weitergabe"));
@@ -928,10 +928,21 @@ public final class Switchboard extends serverSwitch {
    }
    public void setRemotecrawlPPM(final int ppm) {
-       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 60000 / ppm);
-       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, Math.max(10000, 180000 / ppm));
-       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, Math.max(15000, 1800000 / ppm));
-       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, Math.max(30000, 3600000 / ppm));
+       final long newBusySleep = Math.max(100, 60000 / ppm);
+       // propagate to crawler
+       final BusyThread rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
+       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
+       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, Math.min(10000, newBusySleep * 10));
+       rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
+       rct.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+       // propagate to loader
+       final BusyThread rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
+       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 4);
+       setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, Math.min(10000, newBusySleep * 20));
+       rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
+       rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
    }
    public void initMessages() throws IOException {
@@ -1952,7 +1963,6 @@ public final class Switchboard extends serverSwitch {
        if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {
            final yacySeed initiatorPeer = peers.get(new String(queueEntry.initiator()));
            if (initiatorPeer != null) {
-               log.logInfo("Sending crawl receipt for '" + queueEntry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
                if (clusterhashes != null) initiatorPeer.setAlternativeAddress(clusterhashes.get(queueEntry.initiator()));
                // start a thread for receipt sending to avoid a blocking here
                new Thread(new receiptSending(initiatorPeer, newEntry), "sending receipt to " + new String(queueEntry.initiator())).start();
@@ -2036,7 +2046,14 @@ public final class Switchboard extends serverSwitch {
            this.reference = reference;
        }
        public void run() {
-           yacyClient.crawlReceipt(peers.mySeed(), initiatorPeer, "crawl", "fill", "indexed", reference, "");
+           long t = System.currentTimeMillis();
+           Map<String, String> response = yacyClient.crawlReceipt(peers.mySeed(), initiatorPeer, "crawl", "fill", "indexed", reference, "");
+           if (response == null) {
+               log.logInfo("Sending crawl receipt for '" + reference.metadata().url().toNormalform(false, true) + "' to " + initiatorPeer.getName() + " FAILED, send time = " + (System.currentTimeMillis() - t));
+               return;
+           }
+           String delay = response.get("delay");
+           log.logInfo("Sending crawl receipt for '" + reference.metadata().url().toNormalform(false, true) + "' to " + initiatorPeer.getName() + " success, delay = " + delay + ", send time = " + (System.currentTimeMillis() - t));
        }
    }

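The relocation of the firstInit block in Switchboard (removed near old line 302, re-added near line 656) is an ordering fix rather than a cosmetic move: the new setRemotecrawlPPM looks the threads up via getThread(...) and adjusts them directly, so the remote crawl threads must already be deployed when it runs. A minimal, hypothetical sketch of that constraint (not YaCy code):

    import java.util.HashMap;
    import java.util.Map;

    public final class DeployOrder {
        private final Map<String, Thread> threads = new HashMap<>();

        void deployThread(final String name) { threads.put(name, new Thread(name)); }

        // mirrors the new setRemotecrawlPPM: it looks threads up to adjust them,
        // so it must run after deployThread(...), hence the move in this commit
        void setRemotecrawlPPM(final int ppm) {
            final Thread rct = threads.get("remoteTriggeredCrawl");
            if (rct == null) throw new IllegalStateException("threads not deployed yet");
        }

        public static void main(final String[] args) {
            final DeployOrder sb = new DeployOrder();
            sb.deployThread("remoteTriggeredCrawl"); // must come first
            sb.setRemotecrawlPPM(300);               // now safe
        }
    }
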
@@ -316,7 +316,11 @@ public final class yacyClient {
    public static RSSFeed queryRemoteCrawlURLs(final yacySeedDB seedDB, final yacySeed target, final int maxCount, final long maxTime) {
        // returns a list of
        if (target == null) { return null; }
+       int targetCount = Integer.parseInt(target.get(yacySeed.RCOUNT, "0"));
+       if (targetCount <= 0) {
+           yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs wrong peer '" + target.getName() + "' selected: not enough links available");
+           return null;
+       }
        // prepare request
        final String salt = crypt.randomSalt();
@@ -346,6 +350,9 @@ public final class yacyClient {
                //Log.logException(e);
                return null;
            }
+           // update number of remotely available links in seed
+           target.put(yacySeed.RCOUNT, Integer.toString(Math.max(0, targetCount - feed.size())));
+           seedDB.update(target.hash, target);
            return feed;
        } catch (final IOException e) {
            yacyCore.log.logSevere("yacyClient.queryRemoteCrawlURLs error asking peer '" + target.getName() + "':" + e.toString());

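Together the two yacyClient hunks add client-side bookkeeping of how many remote-crawl URLs a peer still offers: a peer whose advertised RCOUNT is zero is skipped without a network round trip, and after a successful query the local copy of the seed is reduced by the number of URLs actually received. A compact sketch of the budget logic, with a yacySeed reduced to a plain map for illustration:

    import java.util.HashMap;
    import java.util.Map;

    public final class RcountBudget {
        // returns null when the peer should be skipped, as in the diff
        static Integer remainingAfterQuery(final Map<String, String> seed, final int received) {
            final int rcount = Integer.parseInt(seed.getOrDefault("rcount", "0"));
            if (rcount <= 0) return null;                    // nothing advertised: skip the peer
            final int remaining = Math.max(0, rcount - received);
            seed.put("rcount", Integer.toString(remaining)); // remember the reduced budget locally
            return remaining;
        }
        public static void main(final String[] args) {
            final Map<String, String> seed = new HashMap<>();
            seed.put("rcount", "100");
            System.out.println(remainingAfterQuery(seed, 60)); // 40 left after one query of 60 URLs
        }
    }
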
@@ -135,7 +135,7 @@ public class HTTPClient {
        connPerRoute.setMaxForRoute(new HttpRoute(localhost), maxcon);
        ConnManagerParams.setMaxConnectionsPerRoute(httpParams, connPerRoute);
        // how long to wait for getting a connection from manager in milliseconds
-       ConnManagerParams.setTimeout(httpParams, 3000L);
+       ConnManagerParams.setTimeout(httpParams, 9000L);
        /**
         * HTTP protocol settings
         */
@@ -147,13 +147,13 @@ public class HTTPClient {
         * HTTP connection settings
         */
        // timeout in milliseconds until a connection is established in milliseconds
-       HttpConnectionParams.setConnectionTimeout(httpParams, 10000);
+       HttpConnectionParams.setConnectionTimeout(httpParams, 9500);
        // SO_LINGER affects the socket close operation in seconds
        // HttpConnectionParams.setLinger(httpParams, 6);
        // TODO: is default ok?
        // HttpConnectionParams.setSocketBufferSize(httpParams, 8192);
        // SO_TIMEOUT: maximum period inactivity between two consecutive data packets in milliseconds
-       HttpConnectionParams.setSoTimeout(httpParams, 5000);
+       HttpConnectionParams.setSoTimeout(httpParams, 9900);
        // getting an I/O error when executing a request over a connection that has been closed at the server side
        HttpConnectionParams.setStaleCheckingEnabled(httpParams, true);
        // conserve bandwidth by minimizing the number of segments that are sent

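The new HTTPClient timeouts are deliberately staggered just under ten seconds: 9000 ms to obtain a connection from the pool, 9500 ms to establish it, and 9900 ms of socket inactivity, so each earlier stage gives up slightly before the next one. A self-contained sketch using the same HttpClient 4.x params API as the diff (values from the commit, surrounding setup omitted):

    import org.apache.http.conn.params.ConnManagerParams;
    import org.apache.http.params.BasicHttpParams;
    import org.apache.http.params.HttpConnectionParams;
    import org.apache.http.params.HttpParams;

    public final class TimeoutSetup {
        public static HttpParams staggeredTimeouts() {
            final HttpParams httpParams = new BasicHttpParams();
            ConnManagerParams.setTimeout(httpParams, 9000L);             // wait for a pooled connection
            HttpConnectionParams.setConnectionTimeout(httpParams, 9500); // TCP connect
            HttpConnectionParams.setSoTimeout(httpParams, 9900);         // inactivity between packets
            return httpParams;
        }
    }
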
@@ -483,14 +483,7 @@ public class Table implements Index, Iterable<Row.Entry> {
    }
    public boolean has(final byte[] key) {
-       /*
-       try {
-           assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
-       } catch (final IOException e) {
-           Log.logSevere("Table", "", e);
-       }
-       assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
-       */
+       if (index == null) return false;
        return index.has(key);
    }

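The Table.has change removes a block of commented-out consistency assertions and adds a null guard, so a lookup on a table whose index is absent (presumably after close) reports "not found" instead of throwing. A tiny sketch of the guard pattern, with hypothetical types:

    import java.util.Set;

    public final class GuardedHas {
        private Set<String> index; // may become null over the table's lifetime
        boolean has(final String key) {
            if (index == null) return false; // the guard added in this commit
            return index.contains(key);
        }
    }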