@ -35,7 +35,8 @@
< dt > < label > Site< / label > < / dt >
< dt > < label > Site< / label > < / dt >
< dd >
< dd >
< table border = "0" cellpadding = "0" cellspacing = "0" > < tr valign = "top" >
< table border = "0" cellpadding = "0" cellspacing = "0" > < tr valign = "top" >
< td valign = "top" > < input type = "radio" name = "crawlingMode" id = "url" value = "url" checked = "checked" / > Start URL< / td >
< td valign = "top" > < input type = "radio" name = "crawlingMode" id = "url" value = "url" checked = "checked"
onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;document.getElementById('crawlingQ').disabled=false;"/>Start URL< / td >
< td valign = "top" >
< td valign = "top" >
< input name = "crawlingURL" type = "text" size = "50" maxlength = "256" value = "#[starturl]#" onkeypress = "changed()" onfocus = "check('url')" style = "font-size:16px" / > < br / >
< input name = "crawlingURL" type = "text" size = "50" maxlength = "256" value = "#[starturl]#" onkeypress = "changed()" onfocus = "check('url')" style = "font-size:16px" / > < br / >
< input name = "bookmarkTitle" id = "bookmarkTitle" type = "text" size = "50" maxlength = "256" value = "" readonly = "readonly" style = "background:transparent; border:0px" / >
< input name = "bookmarkTitle" id = "bookmarkTitle" type = "text" size = "50" maxlength = "256" value = "" readonly = "readonly" style = "background:transparent; border:0px" / >
@ -44,7 +45,8 @@
< span id = "robotsOK" > < / span >
< span id = "robotsOK" > < / span >
< img align = "top" src = "/env/grafics/empty.gif" name = "ajax" alt = "empty" / >
< img align = "top" src = "/env/grafics/empty.gif" name = "ajax" alt = "empty" / >
< / td > < / tr > < tr >
< / td > < / tr > < tr >
< td > < input type = "radio" name = "crawlingMode" id = "sitemap" value = "sitemap" disabled = "disabled" / > Sitemap URL< / td >
< td > < input type = "radio" name = "crawlingMode" id = "sitemap" value = "sitemap" disabled = "disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL< / td >
< td > < input name = "sitemapURL" type = "text" size = "41" maxlength = "256" value = "" readonly = "readonly" style = "background:transparent; border:0px" / > < / td >
< td > < input name = "sitemapURL" type = "text" size = "41" maxlength = "256" value = "" readonly = "readonly" style = "background:transparent; border:0px" / > < / td >
< / tr > < / table > < br / >
< / tr > < / table > < br / >
< / dd >
< / dd >
@ -52,7 +54,7 @@
< dt > < label > Scheduler< / label > < / dt >
< dt > < label > Scheduler< / label > < / dt >
< dd >
< dd >
< input type = "radio" name = "recrawl" value = "nodoubles" # ( crawlingIfOlderCheck ) # checked = "checked" :: # ( / crawlingIfOlderCheck ) # / > run this crawl once< br / >
< input type = "radio" name = "recrawl" value = "nodoubles" # ( crawlingIfOlderCheck ) # checked = "checked" :: # ( / crawlingIfOlderCheck ) # / > run this crawl once< br / >
< input type = "radio" name = "recrawl" value = "scheduler" / > scheduled, repeat the crawl every
< input type = "radio" name = "recrawl" value = "scheduler" / > scheduled, look every
< select name = "repeat_time" >
< select name = "repeat_time" >
< option value = "1" > 1< / option > < option value = "2" > 2< / option > < option value = "3" > 3< / option >
< option value = "1" > 1< / option > < option value = "2" > 2< / option > < option value = "3" > 3< / option >
< option value = "4" > 4< / option > < option value = "5" > 5< / option > < option value = "6" > 6< / option >
< option value = "4" > 4< / option > < option value = "5" > 5< / option > < option value = "6" > 6< / option >
@ -65,12 +67,12 @@
< option value = "selminutes" > minutes< / option >
< option value = "selminutes" > minutes< / option >
< option value = "selhours" > hours< / option >
< option value = "selhours" > hours< / option >
< option value = "seldays" selected = "selected" > days< / option >
< option value = "seldays" selected = "selected" > days< / option >
< / select > automatically.
< / select > for new documents automatically.
< / dd >
< / dd >
< dt > < label > Path in Domain< / label > < / dt >
< dt > < label > Path in Domain< / label > < / dt >
< dd >
< dd >
< input type = "radio" name = "range" value= "domain" checked = "checked" / > full domain< br / >
< input type = "radio" name = "range" id= "rangeDomain" value= "domain" checked = "checked" / > full domain< br / >
< input type = "radio" name = "range" value= "subpath" / > only sub-path of given url
< input type = "radio" name = "range" id= "rangeSubpath" value= "subpath" / > only sub-path of given url
< / dd >
< / dd >
< input type = "hidden" name = "mustnotmatch" id = "mustnotmatch" value = "" >
< input type = "hidden" name = "mustnotmatch" id = "mustnotmatch" value = "" >
< input type = "hidden" name = "crawlingDomFilterCheck" id = "crawlingDomFilterCheck" value = "off" >
< input type = "hidden" name = "crawlingDomFilterCheck" id = "crawlingDomFilterCheck" value = "off" >
@ -102,7 +104,15 @@
< / form >
< / form >
< / fieldset >
< / fieldset >
< h3 > Hints< / h3 >
< ul >
< li > < h4 > Crawl Speed Limitation< / h4 > No more than two pages are loaded from the same host in one second (no more than 120 documents per minute) to limit the load on the target server.< / li >
< li > < h4 > Target Balancer< / h4 > A second crawl for a different host increases the throughput to a maximum of 240 documents per minute since the crawler balances the load over all hosts.< / li >
< li > < h4 > High Speed Crawling< / h4 > A 'shallow crawl' which is not limited to a single host (or site)
can extend the pages per minute (ppm) rate to unlimited documents per minute when the number of target hosts is high.
This can be done using the < a href = "CrawlStartExpert_p.html" > Expert Crawl Start< / a > servlet.< / li >
< li > < h4 > Scheduler Steering< / h4 > The scheduler on crawls can be changed or removed using the < a href = "Table_API_p.html" > API Steering< / a > .< / li >
< / ul >
#%env/templates/footer.template%#
#%env/templates/footer.template%#
< / body >
< / body >
< / html >
< / html >