- removed scheduled crawling options from the crawl start page because they are
superfluous there; scheduling can be changed in the scheduler servlet. They are
also confusing in the presence of the delete-option, which will be
implemented next.
- removed unused crawl start servlet
- some refactoring to make the time parser reusable
pull/1/head
orbiter 12 years ago
parent 2e7219f9fd
commit 1c66de4bd4

@@ -109,14 +109,14 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Scheduled re-crawl</td>
<td>Document Double-Check</td>
<td>
<dl>
<dt>no&nbsp;doubles<input type="radio" name="recrawl" value="nodoubles" #(crawlingIfOlderCheck)#checked="checked"::#(/crawlingIfOlderCheck)#/></dt>
<dd>run this crawl once and never load any page that is already known, only the start-url may be loaded again.</dd>
<dt>re-load<input type="radio" name="recrawl" value="reload" #(crawlingIfOlderCheck)#::checked="checked"#(/crawlingIfOlderCheck)# /></dt>
<dd>run this crawl once, but treat urls that are known since<br/>
<select name="crawlingIfOlderNumber" id="crawlingIfOlderNumber">
<dt>No&nbsp;Doubles<input type="radio" name="recrawl" value="nodoubles" checked="checked"/></dt>
<dd>Never load any page that is already known.<br/>Only the start-url may be loaded again.</dd>
<dt>Re-load<input type="radio" name="recrawl" value="reload"/></dt>
<dd>Treat documents that are loaded
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
@@ -124,36 +124,19 @@
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="crawlingIfOlderUnit">
<option value="year" #(crawlingIfOlderUnitYearCheck)#::selected="selected"#(/crawlingIfOlderUnitYearCheck)#>years</option>
<option value="month" #(crawlingIfOlderUnitMonthCheck)#::selected="selected"#(/crawlingIfOlderUnitMonthCheck)#>months</option>
<option value="day" #(crawlingIfOlderUnitDayCheck)#::selected="selected"#(/crawlingIfOlderUnitDayCheck)#>days</option>
<option value="hour" #(crawlingIfOlderUnitHourCheck)#::selected="selected"#(/crawlingIfOlderUnitHourCheck)#>hours</option>
</select> not as double and load them again. No scheduled re-crawl.
</dd>
<dt>scheduled<input type="radio" name="recrawl" value="scheduler"/></dt>
<dd>after starting this crawl, repeat the crawl every<br/>
<select name="repeat_time">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="repeat_unit">
<option value="selminutes">minutes</option>
<option value="selhours">hours</option>
<option value="seldays" selected="selected">days</option>
</select> automatically.
<select name="reloadIfOlderUnit" id="reloadIfOlderUnit">
<option value="year">years</option>
<option value="month">months</option>
<option value="day" selected="selected">days</option>
<option value="hour">hours</option>
</select> ago as stale and load them again. If they are younger, they are ignored.
</dd>
</dl>
</td>
<td>
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
to use that, check the 're-load' option. If you want this web crawl to be repeated automatically, check the 'scheduled' option.
In this case the crawl is repeated after the given time and no url from the previous crawl is omitted as double.
to use that, check the 're-load' option.
</td>
</tr>
<tr valign="top" class="TableCellLight">

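The help text above reduces the recrawl choice to two modes that both collapse into a single epoch-millisecond cutoff. A minimal sketch of that double-check, with hypothetical names (shouldLoadAgain and documentLoadTimeMillis are illustrations, not YaCy's actual API):

// Minimal sketch of the double-check semantics described above.
// crawlingIfOlder is an absolute epoch-ms cutoff: 0 means "no doubles"
// (never re-load a known URL), any later value means "re-load documents
// that were loaded before the cutoff".
public class DoubleCheckSketch {
    static boolean shouldLoadAgain(long documentLoadTimeMillis, long crawlingIfOlderCutoff) {
        if (crawlingIfOlderCutoff == 0L) return false;          // no doubles: known URL stays skipped
        return documentLoadTimeMillis < crawlingIfOlderCutoff;  // re-load: stale document, fetch again
    }

    public static void main(String[] args) {
        long cutoff = System.currentTimeMillis() - 7L * 24L * 60L * 60L * 1000L; // the form's "7 days" default
        System.out.println(shouldLoadAgain(cutoff - 1000L, cutoff));             // true: older than 7 days
        System.out.println(shouldLoadAgain(System.currentTimeMillis(), cutoff)); // false: fresh enough
    }
}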
@@ -53,13 +53,6 @@ public class CrawlStartExpert_p {
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");
prop.put("crawlingIfOlderUnitMonthCheck", "0");
prop.put("crawlingIfOlderUnitDayCheck", "1");
prop.put("crawlingIfOlderUnitHourCheck", "0");
prop.put("crawlingIfOlderNumber", "7");
final int crawlingDomFilterDepth = env.getConfigInt("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);

@@ -58,24 +58,6 @@
</tr>
</table><br/>
<input type="hidden" name="crawlingDepth" id="crawlingDepth" value="99" />
</dd>
<dt><label>Scheduler</label></dt>
<dd>
<input type="radio" name="recrawl" value="nodoubles" #(crawlingIfOlderCheck)#checked="checked"::#(/crawlingIfOlderCheck)#/>run this crawl once<br/>
<input type="radio" name="recrawl" value="scheduler"/>scheduled, look every
<select name="repeat_time">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="repeat_unit">
<option value="selminutes">minutes</option>
<option value="selhours">hours</option>
<option value="seldays" selected="selected">days</option>
</select> for new documents automatically.
</dd>
<dt><label>Path</label></dt>
<dd>
@@ -97,6 +79,7 @@
<dd>
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
<input type="hidden" name="directDocByURL" id="directDocByURL" value="off" />
<input type="hidden" name="recrawl" id="recrawl" value="nodoubles" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="indexText" id="indexText" value="on" />

@@ -214,35 +214,14 @@ public class Crawler_p {
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
int crawlingIfOlderNumber = post.getInt("crawlingIfOlderNumber", -1);
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
int repeat_time = post.getInt("repeat_time", -1);
final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
if ("scheduler".equals(recrawl) && repeat_time > 0) {
// set crawlingIfOlder attributes that are appropriate for scheduled crawling
crawlingIfOlderCheck = true;
crawlingIfOlderNumber = "selminutes".equals(repeat_unit) ? 1 : "selhours".equals(repeat_unit) ? repeat_time / 2 : repeat_time * 12;
crawlingIfOlderUnit = "hour";
} else if ("reload".equals(recrawl)) {
repeat_time = -1;
crawlingIfOlderCheck = true;
} else if ("nodoubles".equals(recrawl)) {
repeat_time = -1;
crawlingIfOlderCheck = false;
long crawlingIfOlder = 0;
if ("reload".equals(recrawl)) {
crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // year, month, day, hour
}
final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
// store this call as api call
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
}
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
@@ -564,13 +543,14 @@ public class Crawler_p {
return prop;
}
private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
if (!recrawlIfOlderCheck) return 0L;
if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L;
return System.currentTimeMillis() - recrawlIfOlderNumber;
if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L;
if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
return 0L;
}
private static void setPerformance(final Switchboard sb, final serverObjects post) {

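For reference, a standalone copy of the timeParser logic introduced above, together with a usage sketch. In the commit the method is a private static member of Crawler_p, so the wrapper class here is purely illustrative:

// Standalone copy of Crawler_p.timeParser, for illustration only.
public class TimeParserSketch {
    static long timeParser(final boolean enabled, final int number, final String unit) {
        if (!enabled) return 0L;
        if ("year".equals(unit))   return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
        if ("month".equals(unit))  return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
        if ("day".equals(unit))    return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
        if ("hour".equals(unit))   return System.currentTimeMillis() - number * 1000L * 60L * 60L;
        if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
        return 0L;
    }

    public static void main(String[] args) {
        // The form default: treat documents loaded more than 7 days ago as stale.
        long cutoff = timeParser(true, 7, "day");
        System.out.println((System.currentTimeMillis() - cutoff) / 86400000L + " days ago"); // prints "7 days ago"
    }
}

Compared with the old recrawlIfOlderC, the fall-through case now returns 0L instead of subtracting the raw number, and a "minute" branch was added.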
@@ -1,71 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Index Creation with a Web Crawl for a Single Domain</title>
#%env/templates/metas.template%#
<script type="text/javascript" src="/js/ajax.js"></script>
<script type="text/javascript" src="/js/IndexCreate.js"></script>
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Easy Crawl Start</h2>
<p id="startCrawling">
<strong>Start Crawling Job:</strong>&nbsp;
You can define URLs as start points for Web page crawling and start crawling here.
"Crawling" means that YaCy will download the given web-site, extract all links in it
and then download the content behind these links.
This is repeated as long as specified under "Crawling Depth".
</p>
<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" name="crawlingFilter" value=".*" />
<input type="hidden" name="crawlingIfOlderCheck" value="off" />
<input type="hidden" name="crawlingDomFilterCheck" value="off" />
<input type="hidden" name="crawlingDomMaxCheck" value="off" />
<input type="hidden" name="crawlingQ" value="off" />
<input type="hidden" name="storeHTCache" value="on" />
<input type="hidden" name="indexText" value="on" />
<input type="hidden" name="indexMedia" value="on" />
<input type="hidden" name="crawlOrder" value="on" />
<input type="hidden" name="intention" value="simple web crawl" />
<input type="hidden" name="xsstopw" value="off" />
<table border="0" cellpadding="5" cellspacing="1">
<tr class="TableHeader">
<td><strong>Attribut</strong></td>
<td><strong>Value</strong></td>
<td><strong>Description</strong></td>
</tr>
<tr valign="top" class="TableCellSummary">
<td>Starting Point:</td>
<td>
<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()" />
<span id="robotsOK"></span><br />
<span id="title"><br/></span>
<img src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
<td>
Enter here the start url of the web crawl.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingDepth">Crawling Range</label>:</td>
<td>
<input type="radio" name="range" value="wide" checked="checked" />Wide: depth <input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;|&nbsp;&nbsp;
<input type="radio" name="range" value="domain" />Complete Domain
</td>
<td>
The range defines if the crawl shall consider a complete domain, or a wide crawl up to a specific depth.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td colspan="3"><input type="submit" name="crawlingstart" value="Start New Distributed Crawl" class="submitready" style="width:240px;"/></td>
</tr>
</table>
</form>
#%env/templates/footer.template%#
</body>
</html>

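The introduction of the removed page describes depth-limited crawling: download the start URL, extract its links, and repeat up to the configured "Crawling Depth". A rough illustration of that loop follows; it is not YaCy's actual crawler, and fetchAndExtractLinks is a hypothetical stand-in for the loader and parser:

// Rough sketch of the depth-limited crawling described in the removed page.
import java.util.*;

public class DepthCrawlSketch {
    static void crawl(String startUrl, int crawlingDepth) {
        Deque<Map.Entry<String, Integer>> queue = new ArrayDeque<>();
        Set<String> seen = new HashSet<>();
        queue.add(Map.entry(startUrl, 0));
        while (!queue.isEmpty()) {
            Map.Entry<String, Integer> next = queue.poll();
            if (!seen.add(next.getKey())) continue; // double-check: skip already-known URLs
            for (String link : fetchAndExtractLinks(next.getKey())) {
                // only follow links while the configured depth is not exhausted
                if (next.getValue() < crawlingDepth) queue.add(Map.entry(link, next.getValue() + 1));
            }
        }
    }

    static List<String> fetchAndExtractLinks(String url) { return List.of(); } // placeholder
}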
@@ -1,96 +0,0 @@
// IndexCreateDomainCrawl_p
// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 02.12.2004 as IndexCreate_p.java on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class IndexCreateDomainCrawl_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
//Switchboard sb = (Switchboard) env;
serverObjects prop = new serverObjects();
// define visible variables
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == -1) ? "0" : "1");
prop.put("crawlingIfOlderUnitYearCheck", "0");
prop.put("crawlingIfOlderUnitMonthCheck", "0");
prop.put("crawlingIfOlderUnitDayCheck", "0");
prop.put("crawlingIfOlderUnitHourCheck", "0");
prop.put("crawlingIfOlderUnitMinuteCheck", "0");
if ((crawlingIfOlder == -1) || (crawlingIfOlder == Integer.MAX_VALUE)) {
prop.put("crawlingIfOlderNumber", "-1");
prop.put("crawlingIfOlderUnitYearCheck", "1");
} else if (crawlingIfOlder >= 60*24*365) {
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*365)));
prop.put("crawlingIfOlderUnitYearCheck", "1");
} else if (crawlingIfOlder >= 60*24*30) {
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*30)));
prop.put("crawlingIfOlderUnitMonthCheck", "1");
} else if (crawlingIfOlder >= 60*24) {
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24)));
prop.put("crawlingIfOlderUnitDayCheck", "1");
} else if (crawlingIfOlder >= 60) {
prop.put("crawlingIfOlderNumber", Math.round(crawlingIfOlder / 60f));
prop.put("crawlingIfOlderUnitHourCheck", "1");
} else {
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
prop.put("crawlingIfOlderUnitMinuteCheck", "1");
}
int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", false) ? "1" : "0");
prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", false) ? "1" : "0");
prop.put("indexingTextChecked", env.getConfigBool("indexText", false) ? "1" : "0");
prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", false) ? "1" : "0");
prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", false) ? "1" : "0");
long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L);
int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0");
prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0");
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : "");
prop.put("xsstopwChecked", env.getConfigBool("xsstopw", false) ? "1" : "0");
prop.put("xdstopwChecked", env.getConfigBool("xdstopw", false) ? "1" : "0");
prop.put("xpstopwChecked", env.getConfigBool("xpstopw", false) ? "1" : "0");
// return rewrite properties
return prop;
}
}
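The removed servlet treats crawlingIfOlder as a relative age in minutes and buckets it into the largest fitting display unit, whereas the refactored timeParser in Crawler_p now produces an absolute epoch-millisecond cutoff. A standalone sketch of the old bucketing, for illustration only (the class and method names are hypothetical):

// Sketch of the removed servlet's display bucketing: an age in minutes is
// mapped to the largest unit that fits, mirroring the if/else chain above.
public class IfOlderBucketSketch {
    static String bucket(int crawlingIfOlderMinutes) {
        if (crawlingIfOlderMinutes == -1 || crawlingIfOlderMinutes == Integer.MAX_VALUE) return "-1 year";
        if (crawlingIfOlderMinutes >= 60 * 24 * 365) return Math.round((float) crawlingIfOlderMinutes / (60 * 24 * 365)) + " year";
        if (crawlingIfOlderMinutes >= 60 * 24 * 30)  return Math.round((float) crawlingIfOlderMinutes / (60 * 24 * 30)) + " month";
        if (crawlingIfOlderMinutes >= 60 * 24)       return Math.round((float) crawlingIfOlderMinutes / (60 * 24)) + " day";
        if (crawlingIfOlderMinutes >= 60)            return Math.round(crawlingIfOlderMinutes / 60f) + " hour";
        return crawlingIfOlderMinutes + " minute";
    }

    public static void main(String[] args) {
        System.out.println(bucket(60 * 24 * 7)); // prints "7 day"
    }
}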