making latency_factor and maximum number of same hosts in loader queue

settings available in Crawler_p.html servlet for steering.
pull/1/head
Michael Peter Christen 11 years ago
parent 489c3fbc90
commit 6ada0daae9

@ -109,7 +109,7 @@
</fieldset>
<fieldset style="width:480px;height:190px;;float:left;">
<legend>Progress</legend>
<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<form action="Crawler_p.html" method="get" enctype="multipart/form-data" accept-charset="UTF-8">
<table border="0" cellpadding="2" cellspacing="1" class="watchCrawler">
<tbody>
<tr class="TableHeader">
@ -119,9 +119,11 @@
<tr class="TableCellLight">
<td align="left">Speed / PPM<br/>(Pages Per Minute)</td>
<td align="left" colspan="4">
<input #(crawlingSpeedMinChecked)#::class="TableCellDark"#(/crawlingSpeedMinChecked)# type="submit" name="crawlingPerformance" value="minimum" />
<input #(crawlingSpeedCustChecked)#::class="TableCellDark"#(/crawlingSpeedCustChecked)# id="customPPM" name="customPPM" type="text" size="5" maxlength="5" value="#[customPPMdefault]#" />PPM <input type="submit" name="crawlingPerformance" value="custom" />
<input #(crawlingSpeedMaxChecked)#::class="TableCellDark"#(/crawlingSpeedMaxChecked)# type="submit" name="crawlingPerformance" value="maximum" />
<input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:46px" value="#[customPPMdefault]#" />PPM
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH
<input type="submit" name="crawlingPerformance" value="set" />
(<a href="/Crawler_p.html?crawlingPerformance=minimum">min</a>/<a href="/Crawler_p.html?crawlingPerformance=maximum">max</a>)
</td>
</tr>
<tr class="TableCellLight">

@ -51,6 +51,7 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.peers.NewsPool;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
@ -515,17 +516,56 @@ public class Crawler_p {
}
}
/*
* <input id="customPPM" name="customPPM" type="number" min="10" max="30000" style="width:46px" value="#[customPPMdefault]#" />PPM
<input id="latencyFactor" name="latencyFactor" type="number" min="0.1" max="3.0" step="0.1" style="width:32px" value="#[latencyFactorDefault]#" />LF
<input id="MaxSameHostInQueue" name="MaxSameHostInQueue" type="number" min="1" max="30" style="width:32px" value="#[MaxSameHostInQueueDefault]#" />MH
<input type="submit" name="crawlingPerformance" value="set" />
(<a href="/Crawler_p.html?crawlingPerformance=minimum">min</a>/<a href="/Crawler_p.html?crawlingPerformance=maximum">max</a>)
</td>
*/
if (post != null && post.containsKey("crawlingPerformance")) {
setPerformance(sb, post);
final String crawlingPerformance = post.get("crawlingPerformance", "custom");
final long LCbusySleep1 = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
int wantedPPM = (LCbusySleep1 == 0) ? 30000 : (int) (60000L / LCbusySleep1);
try {
wantedPPM = post.getInt("customPPM", wantedPPM);
} catch (final NumberFormatException e) {}
if ("minimum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 10;
if ("maximum".equals(crawlingPerformance.toLowerCase())) wantedPPM = 30000;
int wPPM = wantedPPM;
if ( wPPM <= 0 ) {
wPPM = 1;
}
if ( wPPM >= 30000 ) {
wPPM = 30000;
}
final int newBusySleep = 60000 / wPPM; // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
final float loadprereq = wantedPPM <= 10 ? 1.0f : wantedPPM <= 100 ? 2.0f : wantedPPM >= 1000 ? 8.0f : 3.0f;
BusyThread thread;
thread = sb.getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
if ( thread != null ) {
sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, thread.setBusySleep(newBusySleep));
sb.setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_LOADPREREQ, thread.setLoadPreReqisite(loadprereq));
thread.setLoadPreReqisite(loadprereq);
thread.setIdleSleep(2000);
}
float latencyFactor = post.getFloat("latencyFactor", 0.5f);
int MaxSameHostInQueue = post.getInt("MaxSameHostInQueue", 20);
env.setConfig(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, latencyFactor);
env.setConfig(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, MaxSameHostInQueue);
}
// performance settings
final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
final int LCppm = (int) (60000L / Math.max(1,LCbusySleep));
prop.put("crawlingSpeedMaxChecked", (LCppm >= 30000) ? "1" : "0");
prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 30000)) ? "1" : "0");
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
prop.put("customPPMdefault", Integer.toString(LCppm));
prop.put("latencyFactorDefault", env.getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f));
prop.put("MaxSameHostInQueueDefault", env.getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20));
// generate crawl profile table
int count = 0;
@ -590,16 +630,4 @@ public class Crawler_p {
return 0L;
}
/**
 * Reads the requested crawl speed from the servlet parameters and applies it to the switchboard.
 * <p>
 * The speed is taken from the "customPPM" parameter; if that parameter is absent the speed
 * derived from the currently configured local crawl busy-sleep is used. A "crawlingPerformance"
 * value of "minimum" forces 10 PPM, "maximum" forces 30000 PPM; any other value leaves the
 * custom speed untouched.
 *
 * @param sb   the switchboard whose configuration is read and whose performance is set
 * @param post the request parameters of the servlet call
 */
private static void setPerformance(final Switchboard sb, final serverObjects post) {
    final String crawlingPerformance = post.get("crawlingPerformance", "custom");
    final long LCbusySleep = sb.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 1000L);
    // a busy-sleep of 0 would divide by zero; it is mapped to the highest supported rate
    int wantedPPM = (LCbusySleep == 0) ? 30000 : (int) (60000L / LCbusySleep);
    try {
        wantedPPM = post.getInt("customPPM", wantedPPM);
    } catch (final NumberFormatException ignored) {
        // unparsable customPPM: keep the speed derived from the current configuration
    }
    // equalsIgnoreCase instead of toLowerCase(): the latter depends on the default locale
    // (e.g. "MINIMUM" lower-cases to a dotless-i string under a Turkish locale)
    if ("minimum".equalsIgnoreCase(crawlingPerformance)) {
        wantedPPM = 10;
    } else if ("maximum".equalsIgnoreCase(crawlingPerformance)) {
        wantedPPM = 30000;
    }
    sb.setPerformance(wantedPPM);
}
}

@ -37,6 +37,7 @@ import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
public class Latency {
@ -160,10 +161,10 @@ public class Latency {
// use the access latency as rule how fast we can access the server
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000;
if (Switchboard.getSwitchboard().crawlQueues.hostcount(hostname) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 5000;
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -203,10 +204,10 @@ public class Latency {
if (!local) waiting += host.flux(waiting);
// use the access latency as rule how fast we can access the server
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) waiting += 5000;
if (Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost()) > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) waiting += 5000;
// the time since last access to the domain is the basis of the remaining calculation
final int timeSinceLastAccess = (int) (System.currentTimeMillis() - host.lastacc());
@ -244,11 +245,11 @@ public class Latency {
// this applies also to localhost, but differently, because it is not necessary to
// consider so many external accesses
s.append(", host.average = ").append(host.average());
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat("crawler.latencyFactor", 0.5f)));
waiting = Math.max(waiting, (int) (host.average() * Switchboard.getSwitchboard().getConfigFloat(SwitchboardConstants.CRAWLER_LATENCY_FACTOR, 0.5f)));
// if the number of same hosts as in the url in the loading queue is greater than MaxSameHostInQueue, then increase waiting
int hostcount = Switchboard.getSwitchboard().crawlQueues.hostcount(url.getHost());
if (hostcount > Switchboard.getSwitchboard().getConfigInt("crawler.MaxSameHostInQueue", 20)) {
if (hostcount > Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.CRAWLER_MAX_SAME_HOST_IN_QUEUE, 20)) {
s.append(", hostcount = ").append(hostcount);
waiting += 5000;
}

@ -85,9 +85,10 @@ public abstract class AbstractBusyThread extends AbstractThread implements BusyT
memprereq = freeBytes;
}
public void setLoadPreReqisite(final double load) {
public double setLoadPreReqisite(final double load) {
// sets the maximum load for the job execution
loadprereq = load;
return load;
}
public void setObeyIntermission(final boolean obey) {

@ -69,7 +69,7 @@ public interface BusyThread extends WorkflowThread {
* sets maximum load for the job execution
* @param load
*/
public void setLoadPreReqisite(final double load);
public double setLoadPreReqisite(final double load);
/**
* defines if the thread should obey the intermission command

@ -3341,37 +3341,6 @@ public final class Switchboard extends serverSwitch {
}
}
/**
 * Adjusts the busy/idle sleep times of the index distribution thread and the local crawl
 * thread to a wanted pages-per-minute rate and stores the resulting busy-sleep values in
 * the configuration.
 * <p>
 * Three ranges are distinguished by callers:
 * wantedPPM &lt;= 10 is low performance, 10 &lt; wantedPPM &lt; 30000 is custom performance
 * and wantedPPM &gt;= 30000 is maximum performance.
 *
 * @param wantedPPM the wanted pages per minute; values outside 1..30000 are clamped
 */
public void setPerformance(final int wantedPPM) {
    // clamp into [1, 30000] so that the division below is always defined
    final int clampedPPM = Math.max(1, Math.min(30000, wantedPPM));
    // for wantedPPM = 10: 6000; for wantedPPM = 1000: 60
    final int newBusySleep = 60000 / clampedPPM;

    final BusyThread indexDistThread = getThread(SwitchboardConstants.INDEX_DIST);
    if ( indexDistThread != null ) {
        // setBusySleep is called twice: first with twice the crawl busy-sleep, then with
        // the maximum of 2000 and whatever the first call returned
        setConfig(
            SwitchboardConstants.INDEX_DIST_BUSYSLEEP,
            indexDistThread.setBusySleep(Math.max(2000, indexDistThread.setBusySleep(newBusySleep * 2))));
        indexDistThread.setIdleSleep(30000);
    }

    final BusyThread localCrawlThread = getThread(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
    if ( localCrawlThread != null ) {
        setConfig(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, localCrawlThread.setBusySleep(newBusySleep));
        localCrawlThread.setIdleSleep(2000);
    }
}
public String dhtShallTransfer() {
final String cautionCause = onlineCaution();
if ( cautionCause != null ) {

@ -102,6 +102,7 @@ public final class SwitchboardConstants {
public static final String CRAWLJOB_LOCAL_CRAWL_METHOD_FREEMEM = "freemem";
public static final String CRAWLJOB_LOCAL_CRAWL_IDLESLEEP = "50_localcrawl_idlesleep";
public static final String CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP = "50_localcrawl_busysleep";
public static final String CRAWLJOB_LOCAL_CRAWL_LOADPREREQ = "50_localcrawl_loadprereq";
// 60_remotecrawlloader
/**
* <p><code>public static final String <strong>CRAWLJOB_REMOTE_CRAWL_LOADER</strong> = "60_remotecrawlloader"</code></p>
@ -334,10 +335,12 @@ public final class SwitchboardConstants {
* <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
*/
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
public static final String CRAWLER_SAME_HOST_IN_QUEUE = "crawler.MaxSameHostInQueue";
public static final String CRAWLER_LATENCY_FACTOR = "crawler.latencyFactor";
public static final String CRAWLER_MAX_SAME_HOST_IN_QUEUE = "crawler.MaxSameHostInQueue";
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store
/**
* debug flags
*/

@ -190,6 +190,10 @@ public class serverSwitch
setConfig(key, Float.toString(value));
}
/**
 * Stores a double value under the given configuration key by converting it to its
 * string representation and delegating to the string-valued setter.
 *
 * @param key   the configuration key
 * @param value the value to store
 */
public void setConfig(final String key, final double value) {
    final String asText = String.valueOf(value);
    setConfig(key, asText);
}
public void setConfig(final String key, final String value) {
// set the value
final String oldValue = this.configProps.put(key, value);

Loading…
Cancel
Save