diff --git a/defaults/yacy.init b/defaults/yacy.init
index c2a3caf08..63ee34f4d 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -526,7 +526,6 @@ storeTXCache=true
 # crawlResponse: set to true if a peer should retrieve remote crawl urls from other peers
 crawlOrder=true
 crawlOrderDepth=0
-crawlOrderDelay=8
 crawlResponse=false
 crawlResponseDepth=0
 
@@ -566,12 +565,12 @@ filterOutStopwordsFromTopwords=true
 50_localcrawl_busysleep=20
 50_localcrawl_memprereq=12582912
 50_localcrawl_isPaused=false
-60_remotecrawlloader_idlesleep=60000
-60_remotecrawlloader_busysleep=10000
+60_remotecrawlloader_idlesleep=4000
+60_remotecrawlloader_busysleep=800
 60_remotecrawlloader_memprereq=12582912
 60_remotecrawlloader_isPaused=false
-62_remotetriggeredcrawl_idlesleep=10000
-62_remotetriggeredcrawl_busysleep=1000
+62_remotetriggeredcrawl_idlesleep=2000
+62_remotetriggeredcrawl_busysleep=200
 62_remotetriggeredcrawl_memprereq=12582912
 62_remotetriggeredcrawl_isPaused=false
 70_surrogates_idlesleep=10000
diff --git a/defaults/yacy.network.freeworld.unit b/defaults/yacy.network.freeworld.unit
index ffe18f19a..6caa699a0 100644
--- a/defaults/yacy.network.freeworld.unit
+++ b/defaults/yacy.network.freeworld.unit
@@ -15,7 +15,7 @@ network.unit.dht = true
 network.unit.dhtredundancy.junior = 1
 network.unit.dhtredundancy.senior = 3
 network.unit.dht.partitionExponent = 4
-network.unit.remotecrawl.speed = 6
+network.unit.remotecrawl.speed = 300
 network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt
 network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt
 network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt
diff --git a/htroot/PerformanceQueues_p.java b/htroot/PerformanceQueues_p.java
index 1f4f9894e..6be5b8231 100644
--- a/htroot/PerformanceQueues_p.java
+++ b/htroot/PerformanceQueues_p.java
@@ -211,11 +211,7 @@ public class PerformanceQueues_p {
                     threadName.equals(SwitchboardConstants.SEED_UPLOAD) ||
                     threadName.equals(SwitchboardConstants.CLEANUP)) {
                     /* do not change any values */
-                } else if (threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER) ||
-                           threadName.equals(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL)) {
-                    sb.setRemotecrawlPPM(Math.max(1, (int) (sb.getConfigLong("network.unit.remotecrawl.speed", 60) / multiplier)));
-                }
-                else {
+                } else {
                     // load with new values
                     idlesleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_idlesleep"), String.valueOf(idlesleep))) * multiplier);
                     busysleep = (long) (Long.parseLong(d(defaultSettings.get(threadName + "_busysleep"), String.valueOf(busysleep))) * multiplier);
diff --git a/htroot/RemoteCrawl_p.java b/htroot/RemoteCrawl_p.java
index dbc59fc48..19433f02c 100644
--- a/htroot/RemoteCrawl_p.java
+++ b/htroot/RemoteCrawl_p.java
@@ -32,7 +32,6 @@ import java.util.regex.Pattern;
 
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.kelondro.util.DateFormatter;
-import net.yacy.kelondro.workflow.BusyThread;
 
 import de.anomic.data.WorkTables;
 import de.anomic.search.Switchboard;
@@ -67,32 +66,13 @@ public class RemoteCrawl_p {
                 try {
                     newppm = Math.max(1, Integer.parseInt(post.get("acceptCrawlLimit", "1")));
                 } catch (final NumberFormatException e) {}
-                final long newBusySleep = Math.max(100, 60000 / newppm);
-
-                // propagate to crawler
-                final BusyThread rct = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
-                sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
-                sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, newBusySleep * 3);
-                rct.setBusySleep(newBusySleep);
-                rct.setIdleSleep(newBusySleep * 3);
-
-                // propagate to loader
-                final BusyThread rcl = sb.getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
-                sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 5);
-                sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, newBusySleep * 10);
-                rcl.setBusySleep(newBusySleep * 5);
-                rcl.setIdleSleep(newBusySleep * 10);
-
-                sb.setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, Long.toString(newBusySleep));
+                sb.setRemotecrawlPPM(newppm);
             }
         }
 
        // write remote crawl request settings
        prop.put("crawlResponse", sb.getConfigBool("crawlResponse", false) ? "1" : "0");
-        long RTCbusySleep = 100;
-        try {
-            RTCbusySleep = Math.max(1, Integer.parseInt(env.getConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, "100")));
-        } catch (final NumberFormatException e) {}
+        long RTCbusySleep = Math.max(1, env.getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 100));
        final int RTCppm = (int) (60000L / RTCbusySleep);
        prop.put("acceptCrawlLimit", RTCppm);
 
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index a7c0b3829..d6c6a4b39 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -347,12 +347,12 @@ public class CrawlQueues {
             return false;
         }
 
-        if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
+        if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 20)) {
            // try a cleanup
            cleanup();
        }
        // check again
-        if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
+        if (this.workers.size() >= sb.getConfigLong(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 20)) {
            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.workers.size() + "), httpClients = " + ConnectionInfo.getCount());
            return false;
        }
@@ -363,12 +363,12 @@ public class CrawlQueues {
            return false;
        }
 
-        if (remoteTriggeredCrawlJobSize() > 100) {
+        if (remoteTriggeredCrawlJobSize() > 200) {
            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: the remote-triggered crawl job queue is filled, omitting processing");
            return false;
        }
 
-        if (coreCrawlJobSize() > 0 && sb.indexingStorageProcessor.queueSize() > 0) {
+        if (coreCrawlJobSize() > 0 /*&& sb.indexingStorageProcessor.queueSize() > 0*/) {
            if (this.log.isFine()) log.logFine("remoteCrawlLoaderJob: a local crawl is running, omitting processing");
            return false;
        }
@@ -380,9 +380,7 @@ public class CrawlQueues {
                final Iterator e = PeerSelection.getProvidesRemoteCrawlURLs(sb.peers);
                while (e.hasNext()) {
                    seed = e.next();
-                    if (seed != null) {
-                        remoteCrawlProviderHashes.add(seed.hash);
-                    }
+                    if (seed != null) remoteCrawlProviderHashes.add(seed.hash);
                }
            }
        }
@@ -391,7 +389,7 @@ public class CrawlQueues {
        // take one entry from the provider list and load the entries from the remote peer
        seed = null;
        String hash = null;
-        while ((seed == null) && (!remoteCrawlProviderHashes.isEmpty())) {
+        while (seed == null && !remoteCrawlProviderHashes.isEmpty()) {
            hash = remoteCrawlProviderHashes.remove(remoteCrawlProviderHashes.size() - 1);
            if (hash == null) continue;
            seed = sb.peers.get(hash);
@@ -405,12 +403,12 @@ public class CrawlQueues {
        if (seed == null) return false;
 
        // we know a peer which should provide remote crawl entries. load them now.
-        final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.peers, seed, 30, 60000);
+        final RSSFeed feed = yacyClient.queryRemoteCrawlURLs(sb.peers, seed, 60, 8000);
        if (feed == null || feed.isEmpty()) {
            // something is wrong with this provider. To prevent that we get not stuck with this peer
            // we remove it from the peer list
            sb.peers.peerActions.peerDeparture(seed, "no results from provided remote crawls");
-            // ask another peer
+            // try again and ask another peer
            return remoteCrawlLoaderJob();
        }
 
@@ -424,7 +422,7 @@ public class CrawlQueues {
            try {
                url = new DigestURI(item.getLink(), null);
            } catch (final MalformedURLException e) {
-                url = null;
+                continue;
            }
            try {
                referrer = new DigestURI(item.getReferrer(), null);
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 6a1dacea2..04367fc31 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -302,11 +302,6 @@ public final class Switchboard extends serverSwitch {
        // set a high maximum cache size to current size; this is adopted later automatically
        final int wordCacheMaxCount = (int) getConfigLong(SwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
        setConfig(SwitchboardConstants.WORDCACHE_MAX_COUNT, Integer.toString(wordCacheMaxCount));
-
-        // set network-specific performance attributes
-        if (this.firstInit) {
-            setRemotecrawlPPM(Math.max(1, (int) getConfigLong("network.unit.remotecrawl.speed", 60)));
-        }
 
        // load the network definition
        overwriteNetworkDefinition();
@@ -616,7 +611,7 @@ public final class Switchboard extends serverSwitch {
                         SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_JOBCOUNT,
                         SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_METHOD_FREEMEM,
                         0, Long.MAX_VALUE, 0, Long.MAX_VALUE),
-                     30000);
+                     10000);
        deployThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER, "Remote Crawl URL Loader", "thread that loads remote crawl lists from other peers", null,
                     new InstantBusyThread(
                         crawlQueues,
@@ -624,7 +619,7 @@
                         SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_JOBCOUNT,
                         SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_METHOD_FREEMEM,
                         10000, Long.MAX_VALUE, 10000, Long.MAX_VALUE),
-                     30000);
+                     10000); // error here?
        deployThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL, "Local Crawl", "thread that performes a single crawl step from the local crawl queue", "/IndexCreateWWWLocalQueue_p.html",
                     new InstantBusyThread(
                         crawlQueues,
@@ -661,6 +656,11 @@
                     Long.parseLong(getConfig(SwitchboardConstants.INDEX_DIST_BUSYSLEEP , "0")),
                     Long.parseLong(getConfig(SwitchboardConstants.INDEX_DIST_MEMPREREQ , "1000000")));
 
+        // set network-specific performance attributes
+        if (this.firstInit) {
+            setRemotecrawlPPM(Math.max(1, (int) getConfigLong("network.unit.remotecrawl.speed", 60)));
+        }
+
        // test routine for snippet fetch
        //Set query = new HashSet();
        //query.add(CrawlSwitchboardEntry.word2hash("Weitergabe"));
@@ -928,10 +928,21 @@ public final class Switchboard extends serverSwitch {
    }
 
    public void setRemotecrawlPPM(final int ppm) {
-        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 60000 / ppm);
-        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, Math.max(10000, 180000 / ppm));
-        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, Math.max(15000, 1800000 / ppm));
-        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, Math.max(30000, 3600000 / ppm));
+        final long newBusySleep = Math.max(100, 60000 / ppm);
+
+        // propagate to crawler
+        final BusyThread rct = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
+        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, newBusySleep);
+        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, Math.min(10000, newBusySleep * 10));
+        rct.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_BUSYSLEEP, 1000));
+        rct.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL_IDLESLEEP, 10000));
+
+        // propagate to loader
+        final BusyThread rcl = getThread(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER);
+        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, newBusySleep * 4);
+        setConfig(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, Math.min(10000, newBusySleep * 20));
+        rcl.setBusySleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_BUSYSLEEP, 1000));
+        rcl.setIdleSleep(getConfigLong(SwitchboardConstants.CRAWLJOB_REMOTE_CRAWL_LOADER_IDLESLEEP, 10000));
    }
 
    public void initMessages() throws IOException {
@@ -1952,7 +1963,6 @@ public final class Switchboard extends serverSwitch {
        if ((processCase == EventOrigin.GLOBAL_CRAWLING) && (queueEntry.initiator() != null)) {
            final yacySeed initiatorPeer = peers.get(new String(queueEntry.initiator()));
            if (initiatorPeer != null) {
-                log.logInfo("Sending crawl receipt for '" + queueEntry.url().toNormalform(false, true) + "' to " + initiatorPeer.getName());
                if (clusterhashes != null) initiatorPeer.setAlternativeAddress(clusterhashes.get(queueEntry.initiator()));
                // start a thread for receipt sending to avoid a blocking here
                new Thread(new receiptSending(initiatorPeer, newEntry), "sending receipt to " + new String(queueEntry.initiator())).start();
@@ -2036,7 +2046,14 @@ public final class Switchboard extends serverSwitch {
            this.reference = reference;
        }
        public void run() {
-            yacyClient.crawlReceipt(peers.mySeed(), initiatorPeer, "crawl", "fill", "indexed", reference, "");
+            long t = System.currentTimeMillis();
+            Map response = yacyClient.crawlReceipt(peers.mySeed(), initiatorPeer, "crawl", "fill", "indexed", reference, "");
+            if (response == null) {
log.logInfo("Sending crawl receipt for '" + reference.metadata().url().toNormalform(false, true) + "' to " + initiatorPeer.getName() + " FAILED, send time = " + (System.currentTimeMillis() - t)); + return; + } + String delay = response.get("delay"); + log.logInfo("Sending crawl receipt for '" + reference.metadata().url().toNormalform(false, true) + "' to " + initiatorPeer.getName() + " success, delay = " + delay + ", send time = " + (System.currentTimeMillis() - t)); } } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 03aa23af2..223333dce 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -316,7 +316,11 @@ public final class yacyClient { public static RSSFeed queryRemoteCrawlURLs(final yacySeedDB seedDB, final yacySeed target, final int maxCount, final long maxTime) { // returns a list of if (target == null) { return null; } - + int targetCount = Integer.parseInt(target.get(yacySeed.RCOUNT, "0")); + if (targetCount <= 0) { + yacyCore.log.logWarning("yacyClient.queryRemoteCrawlURLs wrong peer '" + target.getName() + "' selected: not enough links available"); + return null; + } // prepare request final String salt = crypt.randomSalt(); @@ -346,6 +350,9 @@ public final class yacyClient { //Log.logException(e); return null; } + // update number of remotely available links in seed + target.put(yacySeed.RCOUNT, Integer.toString(Math.max(0, targetCount - feed.size()))); + seedDB.update(target.hash, target); return feed; } catch (final IOException e) { yacyCore.log.logSevere("yacyClient.queryRemoteCrawlURLs error asking peer '" + target.getName() + "':" + e.toString()); diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 422fab54e..3c37c5e50 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -135,7 +135,7 @@ public class HTTPClient { connPerRoute.setMaxForRoute(new HttpRoute(localhost), maxcon); ConnManagerParams.setMaxConnectionsPerRoute(httpParams, connPerRoute); // how long to wait for getting a connection from manager in milliseconds - ConnManagerParams.setTimeout(httpParams, 3000L); + ConnManagerParams.setTimeout(httpParams, 9000L); /** * HTTP protocol settings */ @@ -147,13 +147,13 @@ public class HTTPClient { * HTTP connection settings */ // timeout in milliseconds until a connection is established in milliseconds - HttpConnectionParams.setConnectionTimeout(httpParams, 10000); + HttpConnectionParams.setConnectionTimeout(httpParams, 9500); // SO_LINGER affects the socket close operation in seconds // HttpConnectionParams.setLinger(httpParams, 6); // TODO: is default ok? 
        // HttpConnectionParams.setSocketBufferSize(httpParams, 8192);
        // SO_TIMEOUT: maximum period inactivity between two consecutive data packets in milliseconds
-        HttpConnectionParams.setSoTimeout(httpParams, 5000);
+        HttpConnectionParams.setSoTimeout(httpParams, 9900);
        // getting an I/O error when executing a request over a connection that has been closed at the server side
        HttpConnectionParams.setStaleCheckingEnabled(httpParams, true);
        // conserve bandwidth by minimizing the number of segments that are sent
diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java
index 8fcce0efd..4a35b2d7f 100644
--- a/source/net/yacy/kelondro/table/Table.java
+++ b/source/net/yacy/kelondro/table/Table.java
@@ -483,14 +483,7 @@ public class Table implements Index, Iterable {
    }
 
    public boolean has(final byte[] key) {
-        /*
-        try {
-            assert file.size() == index.size() : "file.size() = " + file.size() + ", index.size() = " + index.size();
-        } catch (final IOException e) {
-            Log.logSevere("Table", "", e);
-        }
-        assert table == null || table.size() == index.size() : "table.size() = " + table.size() + ", index.size() = " + index.size();
-        */
+        if (index == null) return false;
        return index.has(key);
    }
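
Note on the timing change: RemoteCrawl_p.java and PerformanceQueues_p.java now delegate to Switchboard.setRemotecrawlPPM(), so the pages-per-minute to sleep-time arithmetic lives in one place. The following standalone sketch only illustrates that arithmetic for review purposes; the RemoteCrawlTiming class and its field names are illustrative and not part of the patch.

// Illustrative sketch (not patch code): sleep values derived from pages per minute,
// mirroring the math in Switchboard.setRemotecrawlPPM(). All values are milliseconds.
public final class RemoteCrawlTiming {

    final long triggeredBusySleep; // pause between two remotely triggered crawl steps
    final long triggeredIdleSleep; // pause while the remote-triggered queue is empty
    final long loaderBusySleep;    // pause between two fetches of remote crawl lists
    final long loaderIdleSleep;    // pause while no remote crawl lists are needed

    RemoteCrawlTiming(final int ppm) {
        // 60000 ms per minute divided by the accepted pages per minute,
        // clamped to 100 ms so the busy thread cannot spin; callers pass ppm >= 1
        final long busySleep = Math.max(100, 60000 / ppm);
        this.triggeredBusySleep = busySleep;
        this.triggeredIdleSleep = Math.min(10000, busySleep * 10);
        this.loaderBusySleep = busySleep * 4;
        this.loaderIdleSleep = Math.min(10000, busySleep * 20);
    }

    public static void main(final String[] args) {
        // the new freeworld default: network.unit.remotecrawl.speed = 300
        final RemoteCrawlTiming t = new RemoteCrawlTiming(300);
        System.out.println("triggered crawl: busy=" + t.triggeredBusySleep + " idle=" + t.triggeredIdleSleep);
        System.out.println("crawl loader:    busy=" + t.loaderBusySleep + " idle=" + t.loaderIdleSleep);
    }
}

At the new default of 300 PPM this reproduces the figures now written into defaults/yacy.init: 200/2000 ms for 62_remotetriggeredcrawl and 800/4000 ms for 60_remotecrawlloader.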
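
A second note, on the yacyClient change: before a peer is asked for remote-crawl URLs its advertised link count (yacySeed.RCOUNT) is checked, and after a successful query the locally cached count is reduced by the size of the returned feed, so exhausted peers drop out of the rotation. A minimal sketch of that bookkeeping, assuming a plain map in place of the seed object; shouldQuery/afterQuery are made-up names, not YaCy API.

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch (not patch code) of the RCOUNT bookkeeping added in yacyClient.queryRemoteCrawlURLs.
public final class RcountSketch {

    // stands in for a yacySeed property map; "rcount" mirrors yacySeed.RCOUNT
    private static final Map<String, String> peer = new HashMap<String, String>();

    static boolean shouldQuery() {
        // peers that no longer advertise remote-crawl URLs are skipped entirely
        return Integer.parseInt(peer.containsKey("rcount") ? peer.get("rcount") : "0") > 0;
    }

    static void afterQuery(final int delivered) {
        // reduce the local estimate by what the peer actually delivered, never below zero
        final int before = Integer.parseInt(peer.containsKey("rcount") ? peer.get("rcount") : "0");
        peer.put("rcount", Integer.toString(Math.max(0, before - delivered)));
    }

    public static void main(final String[] args) {
        peer.put("rcount", "50");
        if (shouldQuery()) afterQuery(30);
        System.out.println(peer.get("rcount")); // 20
        if (shouldQuery()) afterQuery(30);
        System.out.println(peer.get("rcount")); // 0 -> the next shouldQuery() returns false
    }
}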