- redesign of crawl start servlet

- for domain-limited crawls, the domain is deleted now by default before
the crawl is started
pull/1/head
orbiter 12 years ago
parent 1c66de4bd4
commit b55ea2197f
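
The deletion is only enabled when the crawl is actually restricted; a minimal sketch of that decision, using a hypothetical helper that mirrors the Crawler_p.java hunk at the end of this diff:

// Hypothetical helper mirroring the deletion default introduced in Crawler_p.java below:
// existing documents are only deleted when the crawl is restricted to the start domain(s),
// to sub-path(s), or by a non-catch-all must-match filter; the redesigned form pre-selects
// "Delete start host" whenever a domain or sub-path restriction is chosen.
public class DeleteOldDefaultSketch {
    static boolean deleteBeforeCrawl(String range, String mustMatch, String deleteold) {
        boolean fullDomain = "domain".equals(range);
        boolean subPath = "subpath".equals(range);
        boolean restricted = fullDomain || subPath || !".*".equals(mustMatch); // ".*" = catch-all
        return restricted && ("on".equals(deleteold) || "age".equals(deleteold));
    }

    public static void main(String[] args) {
        System.out.println(deleteBeforeCrawl("domain", ".*", "on")); // true: restricted to start domain
        System.out.println(deleteBeforeCrawl("wide", ".*", "on"));   // false: unrestricted crawl, nothing is deleted
    }
}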

@@ -47,17 +47,17 @@
<td>
<table cellpadding="0" cellspacing="0">
<tr>
<td><label for="url"><span class="nobr">From URL&nbsp;(must start with<br/>http:// https:// ftp:// smb:// file://)</span></label>:</td>
<td width="160"><label for="url">One Start URL or a list of URLs:<br/>(must start with http:// https:// ftp:// smb:// file://)</label>:</td>
<td><input type="radio" name="crawlingMode" id="url" value="url" checked="checked" /></td>
<td>
<textarea name="crawlingURL" id="crawlingURL" cols="41" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
<textarea name="crawlingURL" id="crawlingURL" cols="42" rows="3" size="41" onkeypress="changed()" onfocus="check('url')" >#[starturl]#</textarea>
</td>
</tr>
<tr>
<td></td>
<td></td>
<td>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
</td>
</tr>
<tr>
@@ -71,13 +71,13 @@
<td><label for="url"><span class="nobr">From Sitemap</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"/></td>
<td>
<input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly"/>
<input name="sitemapURL" type="text" size="48" maxlength="256" value="" readonly="readonly"/>
</td>
</tr>
<tr>
<td><label for="file"><span class="nobr">From File (enter a path<br/>within your local file system)</span></label>:</td>
<td><input type="radio" name="crawlingMode" id="file" value="file" onclick="document.getElementById('Crawler').rangeDomain.checked = true;"/></td>
<td><input type="text" name="crawlingFile" size="41" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
<td><input type="text" name="crawlingFile" size="48" onfocus="check('file')"/><!--<input type="file" name="crawlingFile" size="18" onfocus="check('file')"/>--></td>
</tr>
<tr>
<td colspan="3" class="commit">
@@ -99,7 +99,7 @@
<td>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents<br/>
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="30" maxlength="100" value="#[crawlingDepthExtension]#" />
Unlimited crawl depth for URLs matching with: <input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="40" maxlength="100" value="#[crawlingDepthExtension]#" />
</td>
<td>
This defines how many times the Crawler will follow links (of links, and so on) embedded in websites.
@@ -109,6 +109,75 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:<br/>
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;"/>Restrict to start domain(s)<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('mustmatch').disabled=true;document.getElementById('deleteoldon').disabled=false;document.getElementById('deleteoldage').disabled=false;document.getElementById('deleteoldon').checked=true;" />Restrict to sub-path(s)<br />
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('mustmatch').disabled=false;document.getElementById('deleteoldoff').checked=true;document.getElementById('deleteoldon').disabled=true;document.getElementById('deleteoldage').disabled=true;"/>Use filter</td>
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="1000" value="#[mustmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100" value="#[ipMustmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100" value="#[indexmustmatch]#" /></td></tr>
</table>
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> the URLs that are to be crawled; the default is 'catch all'.
Example: to allow only URLs that contain the word 'science', set the filter to '.*science.*'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td>
<table border="0">
<tr><td width="160">on URLs for Crawling:</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="1000" value="#[mustnotmatch]#" /></td></tr>
<tr><td>on IPs for Crawling:</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="1000" value="#[ipMustnotmatch]#" /></td></tr>
<tr><td>on URLs for Indexing:</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="1000" value="#[indexmustnotmatch]#" /></td></tr>
</table>
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> a URL if the content of that URL is to be indexed.
</td>
</tr>
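For illustration, a standalone sketch of how such must-match / must-not-match patterns behave with java.util.regex; the '.*science.*' pattern is the example from the help text, while the exclusion pattern, sample URLs, and class name are made up:

import java.util.regex.Pattern;

// Standalone illustration of the filter semantics described above: a URL is accepted
// only if it matches the must-match pattern and does not match the must-not-match pattern.
public class UrlFilterExample {
    public static void main(String[] args) {
        Pattern mustMatch = Pattern.compile(".*science.*"); // example from the help text
        Pattern mustNotMatch = Pattern.compile(".*\\.pdf"); // hypothetical exclusion

        String[] urls = {
            "http://example.org/science/article.html",
            "http://example.org/science/paper.pdf",
            "http://example.org/sports/news.html"
        };
        for (String url : urls) {
            boolean accepted = mustMatch.matcher(url).matches()
                    && !mustNotMatch.matcher(url).matches();
            System.out.println(url + " -> " + (accepted ? "accept" : "reject"));
        }
    }
}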
<tr valign="top" class="TableCellDark">
<td>Document Deletion</td>
<td>
<dl>
<dt>No Deletion<input type="radio" name="deleteold" id="deleteoldoff" value="off" checked="checked"/></dt>
<dd>Do not delete any document before the crawl is started.</dd>
<dt>Delete start host<input type="radio" name="deleteold" id="deleteoldon" value="on" disabled="true"/></dt>
<dd>For each host in the start URL list, delete all documents from that host.</dd>
<dt>Delete only old<input type="radio" name="deleteold" id="deleteoldage" value="age" disabled="true"/></dt>
<dd>Treat documents that were loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14" selected="selected">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="deleteIfOlderUnit" id="deleteIfOlderUnit">
<option value="year">years</option>
<option value="month">months</option>
<option value="day" selected="selected">days</option>
<option value="hour">hours</option>
</select> ago as stale and delete them before the crawl is started.
</dd>
</dl>
</td>
<td>
After a crawl was done in the past, documents may become stale and eventually they are also deleted on the target host.
To remove such old files from the search index it is not sufficient to just consider them for re-load; it may be necessary
to delete them because they simply do not exist any more. Use this in combination with the re-crawl option, whose time span should be longer.
</td>
</tr>
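The 'Delete only old' option combines the deleteIfOlderNumber and deleteIfOlderUnit fields into a staleness cutoff. The servlet-side handling of these two fields is not part of this diff, so the following is only an assumed sketch of how such a cutoff date could be computed:

import java.util.Calendar;
import java.util.Date;

// Assumed sketch only: turn the deleteIfOlderNumber / deleteIfOlderUnit form values
// into a cutoff date; documents loaded before that date would count as stale.
public class StaleCutoffExample {
    static Date staleCutoff(int number, String unit) {
        Calendar cal = Calendar.getInstance();
        if ("year".equals(unit)) cal.add(Calendar.YEAR, -number);
        else if ("month".equals(unit)) cal.add(Calendar.MONTH, -number);
        else if ("day".equals(unit)) cal.add(Calendar.DAY_OF_MONTH, -number);
        else if ("hour".equals(unit)) cal.add(Calendar.HOUR_OF_DAY, -number);
        else throw new IllegalArgumentException("unknown unit: " + unit);
        return cal.getTime();
    }

    public static void main(String[] args) {
        // the form defaults to "14 days"
        System.out.println("stale if loaded before: " + staleCutoff(14, "day"));
    }
}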
<tr valign="top" class="TableCellLight">
<td>Document Double-Check</td>
<td>
<dl>
@@ -139,74 +208,7 @@
to use that check the 're-load' option.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter for URLs for crawling</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" onclick="document.getElementById('deleteold').checked=false;document.getElementById('deleteold').disabled=true;"/>Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
<input type="radio" name="range" id="rangeDomain" value="domain" onclick="document.getElementById('deleteold').disabled=false;document.getElementById('deleteold').checked=true;"/>Restrict to start domain<br />
<input type="radio" name="range" id="rangeSubpath" value="subpath" onclick="document.getElementById('deleteold').disabled=false;document.getElementById('deleteold').checked=true;" />Restrict to sub-path<br />
<input type="checkbox" name="deleteold" id="deleteold" disabled/>Delete all old documents in domain/subpath
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> with the URLs which are used to be crawled; default is 'catch all'.
Example: to allow only urls that contain the word 'science', set the filter to '.*science.*'.
You can also use an automatic domain-restriction to fully crawl a single domain.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs for crawling</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> to allow that the page is accepted for crawling.
The empty string is a never-match filter which should do well for most cases.
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="indexmustmatch">Must-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustmatch" id="indexmustmatch" type="text" size="60" maxlength="100" value="#[indexmustmatch]#" /><br />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must match</b> with the URLs to allow that the content of the url is indexed.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="indexmustnotmatch">Must-Not-Match Filter for URLs for indexing</label>:</td>
<td>
<input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="60" maxlength="100" value="#[indexmustnotmatch]#" />
</td>
<td>
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html">regular expression</a></b>
that <b>must not match</b> with the URLs to allow that the content of the url is indexed.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustmatch" id="ipMustmatch" type="text" size="60" maxlength="100" value="#[ipMustmatch]#" />
</td>
<td>
Like the MUST-Match Filter for URLs this filter must match, but only for the IP of the host.
YaCy performs a DNS lookup for each host and this filter restricts the crawl to specific IPs
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="ipMustnotmatch">Must-Not-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="60" maxlength="100" value="#[ipMustnotmatch]#" />
</td>
<td>
This filter must not match on the IP of the crawled host.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
@@ -218,7 +220,7 @@
the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by commas.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<tr valign="top" class="TableCellLight">
<td>Maximum Pages per Domain:</td>
<td>
<label for="crawlingDomMaxCheck">Use</label>:
@@ -232,7 +234,7 @@
the given depth. Domains outside the given depth are then sorted-out anyway.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<tr valign="top" class="TableCellDark">
<td><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label>:</td>
<td><input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /></td>
<td>
@@ -240,14 +242,14 @@
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<tr valign="top" class="TableCellLight">
<td><label for="storeHTCache">Store to Web Cache</label>:</td>
<td><input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
<td>
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<tr valign="top" class="TableCellDark">
<td><label for="mustmatch">Policy for usage of Web Cache</label>:</td>
<td>
<input type="radio" name="cachePolicy" value="nocache" />no&nbsp;cache&nbsp;&nbsp;&nbsp;
@@ -263,7 +265,7 @@
<b>cache&nbsp;only</b>: never go online, use all content from cache. If no cache entry exists, treat the content as unavailable
</td>
</tr>
<tr valign="top" class="TableCellDark">
<tr valign="top" class="TableCellLight">
<td>Do Local Indexing:</td>
<td>
<label for="indexText">index text</label>:
@@ -276,7 +278,7 @@
Document Cache without indexing.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<tr valign="top" class="TableCellDark">
<td><label for="crawlOrder">Do Remote Indexing</label>:</td>
<td>
<table border="0" cellpadding="2" cellspacing="0">

@@ -80,6 +80,7 @@
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
<input type="hidden" name="directDocByURL" id="directDocByURL" value="off" />
<input type="hidden" name="recrawl" id="recrawl" value="nodoubles" />
<input type="hidden" name="deleteold" id="deleteold" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="indexText" id="indexText" value="on" />

@@ -151,7 +151,10 @@ public class Crawler_p {
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted
final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start
final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start
final boolean deleteold = (fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch)) && post.getBoolean("deleteold");
final boolean restrictedcrawl = fullDomain || subPath || !CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch);
final boolean deleteold = restrictedcrawl && post.getBoolean("deleteold");
final boolean deleteage = restrictedcrawl && "age".equals(post.get("deleteold","off"));
String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url
String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
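
The start URL field now takes a list: the hunk above splits crawlingURL on line breaks if any are present, otherwise on '|'. A standalone usage example of that splitting rule (class name and sample URLs are made up):

import java.util.Arrays;
import java.util.regex.Pattern;

// Standalone illustration of the start-URL splitting rule used above: if the submitted
// text contains line breaks, split on them; otherwise treat '|' as the separator.
public class RootUrlSplitExample {
    public static void main(String[] args) {
        String crawlingStart0 = "http://example.org/|http://example.net/".trim();
        String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0
                ? crawlingStart0.split("[\\r\\n]+")
                : crawlingStart0.split(Pattern.quote("|"));
        System.out.println(Arrays.toString(rootURLs0));
        // prints: [http://example.org/, http://example.net/]
    }
}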
