From 6ada0daae982745c42799e297d7849dd1f0ef2b5 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 21 Jan 2014 19:28:00 +0100 Subject: [PATCH] making latency_factor and maximum number of same hosts in loader queue settings available in Crawler_p.html servlet for steering. --- htroot/Crawler_p.html | 10 ++-- htroot/Crawler_p.java | 60 ++++++++++++++----- source/net/yacy/crawler/data/Latency.java | 13 ++-- .../kelondro/workflow/AbstractBusyThread.java | 3 +- .../yacy/kelondro/workflow/BusyThread.java | 2 +- source/net/yacy/search/Switchboard.java | 31 ---------- .../net/yacy/search/SwitchboardConstants.java | 5 +- source/net/yacy/server/serverSwitch.java | 4 ++ 8 files changed, 68 insertions(+), 60 deletions(-) diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index ab3c37058..2a3381e58 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -109,7 +109,7 @@
Progress -
+ @@ -119,9 +119,11 @@ diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index fce2fb4c0..b1271bb18 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -51,6 +51,7 @@ import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.FileUtils; +import net.yacy.kelondro.workflow.BusyThread; import net.yacy.peers.NewsPool; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; @@ -515,17 +516,56 @@ public class Crawler_p { } } + /* + * PPM + LF + MH + + (min/max) + + */ if (post != null && post.containsKey("crawlingPerformance")) { - setPerformance(sb, post); + final String crawlingPerformance = post.get("crawlingPerformance", "custom"); + final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); + int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1); + try { + wantedPPM = post.getInt("customPPM", wantedPPM); + } catch (final NumberFormatException e) {} + if ("minimum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 10; + if ("maximum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 30000; + + int wPPM = wantedPPM; + if ( wPPM <= 0 ) { + wPPM = 1; + } + if ( wPPM >= 30000 ) { + wPPM = 30000; + } + final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 + final float loadprereq = wantedPPM <= 10 ? 1.0f : wantedPPM <= 100 ? 2.0f : wantedPPM >= 1000 ? 
8.0f : 3.0f; + + BusyThread thread; + + thread = sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + if ( thread != null ) { + sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread.setBusySleep(newBusySleep)); + sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_LOADPREREQ, thread.setLoadPreReqisite(loadprereq)); + thread.setLoadPreReqisite(loadprereq); + thread.setIdleSleep(2000); + } + + float latencyFactor = post.getFloat("latencyFactor", 0.5f); + int MaxSameHostInQueue = post.getInt("MaxSameHostInQueue", 20); + env.setConfig(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, latencyFactor); + env.setConfig(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, MaxSameHostInQueue); } // performance settings final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); final int LCppm = (int) (60000L / Math.max(1,LCbusySleep)); - prop.put("crawlingSpeedMaxChecked", (LCppm >= 30000) ? "1" : "0"); - prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 30000)) ? "1" : "0"); - prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0"); prop.put("customPPMdefault", Integer.toString(LCppm)); + prop.put("latencyFactorDefault", env.getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)); + prop.put("MaxSameHostInQueueDefault", env.getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)); // generate crawl profile table int count = 0; @@ -590,16 +630,4 @@ public class Crawler_p { return 0L; } - private static void setPerformance(final Switchboard sb, final serverObjects post) { - final String crawlingPerformance = post.get("crawlingPerformance", "custom"); - final long LCbusySleep = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L); - int wantedPPM = (LCbusySleep == 0) ? 
30000 : (int) (60000L / LCbusySleep); - try { - wantedPPM = post.getInt("customPPM", wantedPPM); - } catch (final NumberFormatException e) {} - if ("minimum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 10; - if ("maximum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 30000; - sb.setPerformance(wantedPPM); - } - } diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index c99c06c98..c1c7842b6 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -37,6 +37,7 @@ import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxtEntry; import net.yacy.kelondro.util.MemoryControl; import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; public class Latency { @@ -160,10 +161,10 @@ public class Latency { // use the access latency as rule how fast we can access the server // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses - waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f))); + waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f))); // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting - if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000; + if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 5000; // the time since last access to the domain is the basis of the remaining calculation final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); @@ -203,10 
+204,10 @@ public class Latency { if (!local) waiting += host.flux(waiting); // use the access latency as rule how fast we can access the server - waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f))); + waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f))); // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting - if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000; + if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 5000; // the time since last access to the domain is the basis of the remaining calculation final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc()); @@ -244,11 +245,11 @@ public class Latency { // this applies also to localhost, but differently, because it is not necessary to // consider so many external accesses s.append(", host.average = ").append(host.average()); - waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f))); + waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f))); // if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()); - if (hostcount > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) { + if (hostcount > 
Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) { s.append(", hostcount = ").append(hostcount); waiting += 5000; } diff --git a/source/net/yacy/kelondro/workflow/AbstractBusyThread.java b/source/net/yacy/kelondro/workflow/AbstractBusyThread.java index 4b1fe7b01..c39b26403 100644 --- a/source/net/yacy/kelondro/workflow/AbstractBusyThread.java +++ b/source/net/yacy/kelondro/workflow/AbstractBusyThread.java @@ -85,9 +85,10 @@ public abstract class AbstractBusyThread extends AbstractThread implements BusyT memprereq = freeBytes; } - public void setLoadPreReqisite(final double load) { + public double setLoadPreReqisite(final double load) { // sets minimum required amount of memory for the job execution loadprereq = load; + return load; } public void setObeyIntermission(final boolean obey) { diff --git a/source/net/yacy/kelondro/workflow/BusyThread.java b/source/net/yacy/kelondro/workflow/BusyThread.java index 200f7bd79..f6e80894d 100644 --- a/source/net/yacy/kelondro/workflow/BusyThread.java +++ b/source/net/yacy/kelondro/workflow/BusyThread.java @@ -69,7 +69,7 @@ public interface BusyThread extends WorkflowThread { * sets maximimum load for the job execution * @param load */ - public void setLoadPreReqisite(final double load); + public double setLoadPreReqisite(final double load); /** * defines if the thread should obey the intermission command diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 4816e656c..0ae180ce0 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3341,37 +3341,6 @@ public final class Switchboard extends serverSwitch { } } - public void setPerformance(final int wantedPPM) { - int wPPM = wantedPPM; - // we consider 3 cases here - // wantedPPM <= 10: low performance - // 10 < wantedPPM < 30000: custom performance - // 30000 <= wantedPPM : maximum performance - if ( wPPM <= 0 ) { - wPPM = 1; - } - if ( 
wPPM >= 30000 ) { - wPPM = 30000; - } - final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60 - - BusyThread thread; - - thread = getThread(SwitchboardConstants.INDEX_DIST); - if ( thread != null ) { - setConfig( - SwitchboardConstants.INDEX_DIST_BUSYSLEEP, - thread.setBusySleep(Math.max(2000, thread.setBusySleep(newBusySleep * 2)))); - thread.setIdleSleep(30000); - } - - thread = getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - if ( thread != null ) { - setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread.setBusySleep(newBusySleep)); - thread.setIdleSleep(2000); - } - } - public String dhtShallTransfer() { final String cautionCause = onlineCaution(); if ( cautionCause != null ) { diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 0a41661af..28b5f2ec5 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -102,6 +102,7 @@ public final class SwitchboardConstants { public static final String CRAWLJOB_LOCAL_CRAWL_METHOD_FREEMEM = "freemem"; public static final String CRAWLJOB_LOCAL_CRAWL_IDLESLEEP = "50_localcrawl_idlesleep"; public static final String CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP = "50_localcrawl_busysleep"; + public static final String CRAWLJOB_LOCAL_CRAWL_LOADPREREQ = "50_localcrawl_loadprereq"; // 60_remotecrawlloader /** *

public static final String CRAWLJOB_REMOTE_CRAWL_LOADER = "60_remotecrawlloader"

@@ -334,10 +335,12 @@ public final class SwitchboardConstants { *

Name of the setting for the maximum number of crawler threads that may be active at the same time

*/ public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; - public static final String CRAWLER_SAME_HOST_IN_QUEUE = "crawler.MaxSameHostInQueue"; + public static final String CRAWLER_LATENCY_FACTOR = "crawler.latencyFactor"; + public static final String CRAWLER_MAX_SAME_HOST_IN_QUEUE = "crawler.MaxSameHostInQueue"; public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store + /** * debug flags */ diff --git a/source/net/yacy/server/serverSwitch.java b/source/net/yacy/server/serverSwitch.java index 06c25db7d..10ca7e007 100644 --- a/source/net/yacy/server/serverSwitch.java +++ b/source/net/yacy/server/serverSwitch.java @@ -190,6 +190,10 @@ public class serverSwitch setConfig(key, Float.toString(value)); } + public void setConfig(final String key, final double value) { + setConfig(key, Double.toString(value)); + } + public void setConfig(final String key, final String value) { // set the value final String oldValue = this.configProps.put(key, value);
Speed / PPM
(Pages Per Minute)
- - PPM - + PPM + LF + MH + + (min/max)