- redesign of crawl start servlet

- for domain-limited crawls, the domain is deleted now by default before
the crawl is started
pull/1/head
orbiter 12 years ago
parent 1c66de4bd4
commit b55ea2197f
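
The deletion is only enabled when the crawl is actually restricted; a minimal sketch of that decision, using a hypothetical helper that mirrors the Crawler_p.java hunk at the end of this diff:

// Hypothetical helper mirroring the deletion default introduced in Crawler_p.java below:
// existing documents are only deleted when the crawl is restricted to the start domain(s),
// to sub-path(s), or by a non-catch-all must-match filter; the redesigned form pre-selects
// "Delete start host" whenever a domain or sub-path restriction is chosen.
public class DeleteOldDefaultSketch {
    static boolean deleteBeforeCrawl(String range, String mustMatch, String deleteold) {
        boolean fullDomain = "domain".equals(range);
        boolean subPath = "subpath".equals(range);
        boolean restricted = fullDomain || subPath || !".*".equals(mustMatch); // ".*" = catch-all
        return restricted && ("on".equals(deleteold) || "age".equals(deleteold));
    }

    public static void main(String[] args) {
        System.out.println(deleteBeforeCrawl("domain", ".*", "on")); // true: restricted to start domain
        System.out.println(deleteBeforeCrawl("wide", ".*", "on"));   // false: unrestricted crawl, nothing is deleted
    }
}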

@@ -47,17 +47,17 @@
<td>
<table cellpadding="0" cellspacing="0">
<tr>
<td><label for="url"><span class="nobr">From URL&nbsp;(must start with<br/>http:// https:// ftp:// smb:// file://)</span></label>:</td>
<td width="160"><label for="url">One Start URL or a list of URLs:<br/>(must start with http:// https:// ftp:// smb:// file://)</label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<textarea name="crawlingURL" id="crawlingURL" cols="41" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
<textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td>
</tr>
<tr>
@@ -71,13 +71,13 @@
<td><label for="url"><span class="nobr">From Sitemap</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td>
<td>
<input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly"/>
<input name="sitemapURL" type="text" size="48" maxlength="256" value="" readonly="readonly"/>
</td>
</tr>
<tr>
<td><label for="file"><span class="nobr">From File (enter a path<br/>within your local file system)</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
<td><input type="text" name="crawlingFile" size="41" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
<td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
</tr>
<tr>
<td colspan="3" class="commit">
@@ -99,7 +99,7 @@
<td>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents<br/>
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="30" maxlength="100" value="#[crawlingDepthExtension]#" />
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="40" maxlength="100" value="#[crawlingDepthExtension]#" />
</td>
<td>
This defines how many times the Crawler will follow links (of links, and so on) embedded in websites.
@@ -109,6 +109,75 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:<br/>
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)<br />
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="1000" value="#[mustmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100" value="#[ipMustmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100" value="#[indexmustmatch]#" /></td></tr>
</table>
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> the URLs that are to be crawled; the default is 'catch all'.
Example: to allow only URLs that contain the word 'science', set the filter to '.*science.*'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="1000" value="#[mustnotmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="1000" value="#[ipMustnotmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="1000" value="#[indexmustnotmatch]#" /></td></tr>
</table>
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> a URL if the content of that URL is to be indexed.
</td>
</tr>
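For illustration, a standalone sketch of how such must-match / must-not-match patterns behave with java.util.regex; the '.*science.*' pattern is the example from the help text, while the exclusion pattern, sample URLs, and class name are made up:

import java.util.regex.Pattern;

// Standalone illustration of the filter semantics described above: a URL is accepted
// only if it matches the must-match pattern and does not match the must-not-match pattern.
public class UrlFilterExample {
    public static void main(String[] args) {
        Pattern mustMatch = Pattern.compile(".*science.*"); // example from the help text
        Pattern mustNotMatch = Pattern.compile(".*\\.pdf"); // hypothetical exclusion

        String[] urls = {
            "http://example.org/science/article.html",
            "http://example.org/science/paper.pdf",
            "http://example.org/sports/news.html"
        };
        for (String url : urls) {
            boolean accepted = mustMatch.matcher(url).matches()
                    && !mustNotMatch.matcher(url).matches();
            System.out.println(url + " -> " + (accepted ? "accept" : "reject"));
        }
    }
}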
<tr valign="top" class="TableCellDark">
<td>Document Deletion</td>
<td>
<dl>
<dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt>
<dd>Do not delete any document before the crawl is started.</dd>
<dt>Delete start host<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start URL list, delete all documents from that host.</dd>
<dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt>
<dd>Treat documents that were loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14" selected="selected">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="deleteIfOlderUnit" id="deleteIfOlderUnit">
<option value="year">years</option>
<option value="month">months</option>
<option value="day" selected="selected">days</option>
<option value="hour">hours</option>
</select> ago as stale and delete them before the crawl is started.
</dd>
</dl>
</td>
<td>
After a crawl was done in the past, documents may become stale and eventually they are also deleted on the target host.
To remove such old files from the search index it is not sufficient to just consider them for re-load; it may be necessary
to delete them because they simply do not exist any more. Use this in combination with the re-crawl option, whose time span should be longer.
</td>
</tr>
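The 'Delete only old' option combines the deleteIfOlderNumber and deleteIfOlderUnit fields into a staleness cutoff. The servlet-side handling of these two fields is not part of this diff, so the following is only an assumed sketch of how such a cutoff date could be computed:

import java.util.Calendar;
import java.util.Date;

// Assumed sketch only: turn the deleteIfOlderNumber / deleteIfOlderUnit form values
// into a cutoff date; documents loaded before that date would count as stale.
public class StaleCutoffExample {
    static Date staleCutoff(int number, String unit) {
        Calendar cal = Calendar.getInstance();
        if ("year".equals(unit)) cal.add(Calendar.YEAR, -number);
        else if ("month".equals(unit)) cal.add(Calendar.MONTH, -number);
        else if ("day".equals(unit)) cal.add(Calendar.DAY_OF_MONTH, -number);
        else if ("hour".equals(unit)) cal.add(Calendar.HOUR_OF_DAY, -number);
        else throw new IllegalArgumentException("unknown unit: " + unit);
        return cal.getTime();
    }

    public static void main(String[] args) {
        // the form defaults to "14 days"
        System.out.println("stale if loaded before: " + staleCutoff(14, "day"));
    }
}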
<tr valign="top" class="TableCellLight">
<td>Document Double-Check</td>
<td>
<dl>
@@ -139,74 +208,7 @@
to use that check the 're-load' option.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter for URLs for crawling</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('deleteold').checked=false;document.getElementById('deleteold').disabled=true;"/>Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('deleteold').disabled=false;document.getElementById('deleteold').checked=true;"/>Restrict to start domain<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('deleteold').disabled=false;document.getElementById('deleteold').checked=true;" />Restrict to sub-path<br />
<input type="checkbox" name="deleteold" id="deleteold" disabled/>Delete all old documents in domain/subpath
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> with the URLs which are used to be crawled; default is 'catch all'.
Example: to allow only urls that contain the word 'science', set the filter to '.*science.*'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs for crawling</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> to allow that the page is accepted for crawling.
The empty string is a never-match filter which should do well for most cases.
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="indexmustmatch">Must-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustmatch" id="indexmustmatch" type="text" size="60" maxlength="100" value="#[indexmustmatch]#" /><br />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> with the URLs to allow that the content of the url is indexed.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="indexmustnotmatch">Must-Not-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="60" maxlength="100" value="#[indexmustnotmatch]#" />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> with the URLs to allow that the content of the url is indexed.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustmatch" id="ipMustmatch" type="text" size="60" maxlength="100" value="#[ipMustmatch]#" />
</td>
<td>
Like the MUST-Match Filter for URLs this filter must match, but only for the IP of the host.
YaCy performs a DNS lookup for each host and this filter restricts the crawl to specific IPs
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="ipMustnotmatch">Must-Not-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="60" maxlength="100" value="#[ipMustnotmatch]#" />
</td>
<td>
This filter must not match on the IP of the crawled host.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
@@ -218,7 +220,7 @@
the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by commas.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<tr valign="top" class="TableCellLight">
<td>Maximum Pages per Domain:</td>
<td>
<label for="crawlingDomMaxCheck">Use</label>:
@@ -232,7 +234,7 @@
the given depth. Domains outside the given depth are then sorted-out anyway.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<tr valign="top" class="TableCellDark">
<td><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label>:</td>
<td><input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /></td>
<td>
@@ -240,14 +242,14 @@
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<tr valign="top" class="TableCellLight">
<td><label for="storeHTCache">Store to Web Cache</label>:</td>
<td><input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
<td>
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<tr valign="top" class="TableCellDark">
<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
<td>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
@@ -263,7 +265,7 @@
<b>cache&nbsp;only</b>: never go online, use all content from cache. If no cache entry exists, treat the content as unavailable
</td>
</tr>
<tr valign="top" class="TableCellDark">
<tr valign="top" class="TableCellLight">
<td>Do Local Indexing:</td>
<td>
<label for="indexText">index text</label>:
@@ -276,7 +278,7 @@
Document Cache without indexing.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<tr valign="top" class="TableCellDark">
<td><label for="crawlOrder">Do Remote Indexing</label>:</td>
<td>
<table border="0" cellpadding="2" cellspacing="0">

@@ -80,6 +80,7 @@
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
<input type="hidden" name="directDocByURL" id="directDocByURL" value="off" />
<input type="hidden" name="recrawl" id="recrawl" value="nodoubles" />
<input type="hidden" name="deleteold" id="deleteold" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="indexText" id="indexText" value="on" />

@@ -151,7 +151,10 @@ public class Crawler_p {
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
final boolean deleteold = (fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) && post.getBoolean("deleteold");
final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch);
final boolean deleteold = restrictedcrawl && post.getBoolean("deleteold");
final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off"));
String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
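
The start URL field now takes a list: the hunk above splits crawlingURL on line breaks if any are present, otherwise on '|'. A standalone usage example of that splitting rule (class name and sample URLs are made up):

import java.util.Arrays;
import java.util.regex.Pattern;

// Standalone illustration of the start-URL splitting rule used above: if the submitted
// text contains line breaks, split on them; otherwise treat '|' as the separator.
public class RootUrlSplitExample {
    public static void main(String[] args) {
        String crawlingStart0 = "http://example.org/|http://example.net/".trim();
        String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0
                ? crawlingStart0.split("[\\r\\n]+")
                : crawlingStart0.split(Pattern.quote("|"));
        System.out.println(Arrays.toString(rootURLs0));
        // prints: [http://example.org/, http://example.net/]
    }
}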
