- added more comments for the user in the site crawl servlet

- added an enable/disable function so that options which do not apply are disabled when 'sitemap' is selected
- better naming of menu items
- limited the default crawl depth

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7162 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 3f958ccc7e
commit 8c1da27347

@@ -43,7 +43,7 @@ public class CrawlStartExpert_p {
 //String repository = "http://" + ((a == null) ? "localhost:" + sb.getConfig("port", "8080") : a) + "/repository/";
 prop.put("starturl", /*(intranet) ? repository :*/ "http://");
 prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
-prop.put("crawlingDepth", env.getConfig("crawlingDepth", "0"));
+prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
 prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
 prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
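The hunk above caps only the pre-filled form value: whatever 'crawlingDepth' is stored in the configuration, the expert crawl start page now suggests a depth of at most 3. A minimal sketch of the same clamping pattern, using java.util.Properties as an assumed stand-in for YaCy's serverSwitch environment (the class and method names here are illustrative, not YaCy code):

import java.util.Properties;

public class CrawlDepthDefault {
    // hypothetical stand-in for env.getConfigLong("crawlingDepth", 0)
    static long configuredDepth(Properties config) {
        return Long.parseLong(config.getProperty("crawlingDepth", "0"));
    }

    // cap the suggested depth at 3, mirroring Math.min(3, ...) in the hunk above
    static long suggestedDepth(Properties config) {
        return Math.min(3, configuredDepth(config));
    }

    public static void main(String[] args) {
        Properties config = new Properties();
        config.setProperty("crawlingDepth", "8");
        System.out.println(suggestedDepth(config)); // prints 3: deep defaults are clamped
    }
}

Note that the stored configuration value itself is untouched; only the default shown in the form is limited.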

@@ -35,7 +35,8 @@
 <dt><label>Site</label></dt>
 <dd>
 <table border="0" cellpadding="0" cellspacing="0"><tr valign="top">
-<td valign="top"><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" />Start URL</td>
+<td valign="top"><input type="radio" name="crawlingMode" id="url" value="url" checked="checked"
+onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;document.getElementById('crawlingQ').disabled=false;"/>Start URL</td>
 <td valign="top">
 <input name="crawlingURL" type="text" size="50" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" style="font-size:16px"/><br/>
 <input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
@@ -44,7 +45,8 @@
 <span id="robotsOK"></span>
 <img align="top" src="/env/grafics/empty.gif" name="ajax" alt="empty" />
 </td></tr><tr>
-<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/>Sitemap URL</td>
+<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
+onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td>
 <td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td>
 </tr></table><br/>
 </dd>
@@ -52,7 +54,7 @@
 <dt><label>Scheduler</label></dt>
 <dd>
 <input type="radio" name="recrawl" value="nodoubles" #(crawlingIfOlderCheck)#checked="checked"::#(/crawlingIfOlderCheck)#/>run this crawl once<br/>
-<input type="radio" name="recrawl" value="scheduler"/>scheduled, repeat the crawl every
+<input type="radio" name="recrawl" value="scheduler"/>scheduled, look every
 <select name="repeat_time">
 <option value="1">1</option><option value="2">2</option><option value="3">3</option>
 <option value="4">4</option><option value="5">5</option><option value="6">6</option>
@@ -65,12 +67,12 @@
 <option value="selminutes">minutes</option>
 <option value="selhours">hours</option>
 <option value="seldays" selected="selected">days</option>
-</select> automatically.
+</select> for new documents automatically.
 </dd>
 <dt><label>Path in Domain</label></dt>
 <dd>
-<input type="radio" name="range" value="domain" checked="checked"/>full domain<br />
-<input type="radio" name="range" value="subpath" />only sub-path of given url
+<input type="radio" name="range" id="rangeDomain" value="domain" checked="checked"/>full domain<br />
+<input type="radio" name="range" id="rangeSubpath" value="subpath" />only sub-path of given url
 </dd>
 <input type="hidden" name="mustnotmatch" id="mustnotmatch" value="">
 <input type="hidden" name="crawlingDomFilterCheck" id="crawlingDomFilterCheck" value="off">
@@ -102,7 +104,15 @@
 </form>
 </fieldset>
+<h3>Hints</h3>
+<ul>
+<li><h4>Crawl Speed Limitation</h4> No more than two pages are loaded from the same host within one second (at most 120 documents per minute) to limit the load on the target server.</li>
+<li><h4>Target Balancer</h4> A second crawl for a different host increases the throughput to a maximum of 240 documents per minute, since the crawler balances the load over all hosts.</li>
+<li><h4>High Speed Crawling</h4> A 'shallow crawl' which is not limited to a single host (or site)
+can extend the pages per minute (ppm) rate to an unlimited number of documents per minute when the number of target hosts is high.
+This can be done using the <a href="CrawlStartExpert_p.html">Expert Crawl Start</a> servlet.</li>
+<li><h4>Scheduler Steering</h4> The scheduler for crawls can be changed or removed using the <a href="Table_API_p.html">API Steering</a>.</li>
+</ul>
 #%env/templates/footer.template%#
 </body>
 </html>
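The 120 ppm figure in the hints above follows from a per-host politeness delay of 500 ms: two requests per second per host is 2 × 60 = 120 documents per minute, and a second, independent host doubles that to 240. A minimal sketch of such a per-host throttle, with hypothetical names that are assumptions here, not YaCy's actual balancer API:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class HostThrottle {
    // 500 ms between requests to one host = 2 pages/s = 120 documents/minute
    private static final long MIN_DELTA_MS = 500;
    private final Map<String, Long> lastAccess = new ConcurrentHashMap<>();

    // blocks until the host may be contacted again, then records the access time
    public synchronized void acquire(String host) throws InterruptedException {
        Long last = lastAccess.get(host);
        if (last != null) {
            long wait = MIN_DELTA_MS - (System.currentTimeMillis() - last);
            if (wait > 0) Thread.sleep(wait);
        }
        lastAccess.put(host, System.currentTimeMillis());
    }
}

Because the delay is tracked per host, two crawls against different hosts never block each other, which is the balancing effect that lifts throughput to 240 ppm.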

@@ -87,7 +87,7 @@
 <h3>Peer Control</h3>
 <ul class="menu">
 <li><a href="/Status.html" class="MenuItemLink">Admin Console</a></li>
-<li><a href="/Table_API_p.html" class="MenuItemLink">Steering</a></li>
+<li><a href="/Table_API_p.html" class="MenuItemLink">API Action Steering</a></li>
 <li><a href="/Steering.html?restart=" class="MenuItemLink lock" onclick="return confirm('Confirm Restart')">Re-Start</a></li>
 <li><a href="/Steering.html?shutdown=" class="MenuItemLink lock" onclick="return confirm('Confirm Shutdown')">Shutdown</a></li>
 </ul>
