- removed scheduled crawling options from the crawl start page because they are
superfluous there; scheduling can be changed in the scheduler servlet. They are
also confusing in the presence of the delete-option, which will be
implemented next.
- removed unused crawl start servlet
- some refactoring to make the time parser reusable
pull/1/head
orbiter 12 years ago
parent 2e7219f9fd
commit 1c66de4bd4

@@ -109,14 +109,14 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Scheduled re-crawl</td>
<td>Document Double-Check</td>
<td>
<dl>
<dt>no&nbsp;doubles<input type="radio" name="recrawl" value="nodoubles" #(crawlingIfOlderCheck)#checked="checked"::#(/crawlingIfOlderCheck)#/></dt>
<dd>run this crawl once and never load any page that is already known, only the start-url may be loaded again.</dd>
<dt>re-load<input type="radio" name="recrawl" value="reload" #(crawlingIfOlderCheck)#::checked="checked"#(/crawlingIfOlderCheck)# /></dt>
<dd>run this crawl once, but treat urls that are known since<br/>
<select name="crawlingIfOlderNumber" id="crawlingIfOlderNumber">
<dt>No&nbsp;Doubles<input type="radio" name="recrawl" value="nodoubles" checked="checked"/></dt>
<dd>Never load any page that is already known.<br/>Only the start-url may be loaded again.</dd>
<dt>Re-load<input type="radio" name="recrawl" value="reload"/></dt>
<dd>Treat documents that are loaded
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
@@ -124,36 +124,19 @@
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="crawlingIfOlderUnit">
<option value="year" #(crawlingIfOlderUnitYearCheck)#::selected="selected"#(/crawlingIfOlderUnitYearCheck)#>years</option>
<option value="month" #(crawlingIfOlderUnitMonthCheck)#::selected="selected"#(/crawlingIfOlderUnitMonthCheck)#>months</option>
<option value="day" #(crawlingIfOlderUnitDayCheck)#::selected="selected"#(/crawlingIfOlderUnitDayCheck)#>days</option>
<option value="hour" #(crawlingIfOlderUnitHourCheck)#::selected="selected"#(/crawlingIfOlderUnitHourCheck)#>hours</option>
</select> not as double and load them again. No scheduled re-crawl.
</dd>
<dt>scheduled<input type="radio" name="recrawl" value="scheduler"/></dt>
<dd>after starting this crawl, repeat the crawl every<br/>
<select name="repeat_time">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="repeat_unit">
<option value="selminutes">minutes</option>
<option value="selhours">hours</option>
<option value="seldays" selected="selected">days</option>
</select> automatically.
<select name="reloadIfOlderUnit" id="reloadIfOlderUnit">
<option value="year">years</option>
<option value="month">months</option>
<option value="day" selected="selected">days</option>
<option value="hour">hours</option>
</select> ago as stale and load them again. If they are younger, they are ignored.
</dd>
</dl>
</td>
<td>
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
to use that, check the 're-load' option. If you want this web crawl to be repeated automatically, check the 'scheduled' option.
In this case the crawl is repeated after the given time and no url from the previous crawl is omitted as double.
to use that, check the 're-load' option.
</td>
</tr>
<tr valign="top" class="TableCellLight">

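The help text above reduces the recrawl choice to two modes that both collapse into a single epoch-millisecond cutoff. A minimal sketch of that double-check, with hypothetical names (shouldLoadAgain and documentLoadTimeMillis are illustrations, not YaCy's actual API):

// Minimal sketch of the double-check semantics described above.
// crawlingIfOlder is an absolute epoch-ms cutoff: 0 means "no doubles"
// (never re-load a known URL), any later value means "re-load documents
// that were loaded before the cutoff".
public class DoubleCheckSketch {
    static boolean shouldLoadAgain(long documentLoadTimeMillis, long crawlingIfOlderCutoff) {
        if (crawlingIfOlderCutoff == 0L) return false;          // no doubles: known URL stays skipped
        return documentLoadTimeMillis < crawlingIfOlderCutoff;  // re-load: stale document, fetch again
    }

    public static void main(String[] args) {
        long cutoff = System.currentTimeMillis() - 7L * 24L * 60L * 60L * 1000L; // the form's "7 days" default
        System.out.println(shouldLoadAgain(cutoff - 1000L, cutoff));             // true: older than 7 days
        System.out.println(shouldLoadAgain(System.currentTimeMillis(), cutoff)); // false: fresh enough
    }
}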
@@ -53,13 +53,6 @@ public class CrawlStartExpert_p {
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");
prop.put("crawlingIfOlderUnitMonthCheck", "0");
prop.put("crawlingIfOlderUnitDayCheck", "1");
prop.put("crawlingIfOlderUnitHourCheck", "0");
prop.put("crawlingIfOlderNumber", "7");
final int crawlingDomFilterDepth = env.getConfigInt("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);

@@ -58,24 +58,6 @@
</tr>
</table><br/>
<input type="hidden" name="crawlingDepth" id="crawlingDepth" value="99" />
</dd>
<dt><label>Scheduler</label></dt>
<dd>
<input type="radio" name="recrawl" value="nodoubles" #(crawlingIfOlderCheck)#checked="checked"::#(/crawlingIfOlderCheck)#/>run this crawl once<br/>
<input type="radio" name="recrawl" value="scheduler"/>scheduled, look every
<select name="repeat_time">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="repeat_unit">
<option value="selminutes">minutes</option>
<option value="selhours">hours</option>
<option value="seldays" selected="selected">days</option>
</select> for new documents automatically.
</dd>
<dt><label>Path</label></dt>
<dd>
@@ -97,6 +79,7 @@
<dd>
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
<input type="hidden" name="directDocByURL" id="directDocByURL" value="off" />
<input type="hidden" name="recrawl" id="recrawl" value="nodoubles" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="indexText" id="indexText" value="on" />

@@ -214,35 +214,14 @@ public class Crawler_p {
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
int crawlingIfOlderNumber = post.getInt("crawlingIfOlderNumber", -1);
String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
int repeat_time = post.getInt("repeat_time", -1);
final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
if ("scheduler".equals(recrawl) && repeat_time > 0) {
// set crawlingIfOlder attributes that are appropriate for scheduled crawling
crawlingIfOlderCheck = true;
crawlingIfOlderNumber = "selminutes".equals(repeat_unit) ? 1 : "selhours".equals(repeat_unit) ? repeat_time / 2 : repeat_time * 12;
crawlingIfOlderUnit = "hour";
} else if ("reload".equals(recrawl)) {
repeat_time = -1;
crawlingIfOlderCheck = true;
} else if ("nodoubles".equals(recrawl)) {
repeat_time = -1;
crawlingIfOlderCheck = false;
long crawlingIfOlder = 0;
if ("reload".equals(recrawl)) {
crawlingIfOlder = timeParser(true, post.getInt("reloadIfOlderNumber", -1), post.get("reloadIfOlderUnit","year")); // year, month, day, hour
}
final long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
env.setConfig("crawlingIfOlder", crawlingIfOlder);
// store this call as api call
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
}
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true)));
final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
@@ -564,13 +543,14 @@ public class Crawler_p {
return prop;
}
private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) {
private static long timeParser(final boolean recrawlIfOlderCheck, final int number, final String unit) {
if (!recrawlIfOlderCheck) return 0L;
if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L;
if ("month".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 30L;
if ("day".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L;
if ("hour".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L;
return System.currentTimeMillis() - recrawlIfOlderNumber;
if ("year".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
if ("month".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
if ("day".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
if ("hour".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L * 60L;
if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
return 0L;
}
private static void setPerformance(final Switchboard sb, final serverObjects post) {

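For reference, a standalone copy of the timeParser logic introduced above, together with a usage sketch. In the commit the method is a private static member of Crawler_p, so the wrapper class here is purely illustrative:

// Standalone copy of Crawler_p.timeParser, for illustration only.
public class TimeParserSketch {
    static long timeParser(final boolean enabled, final int number, final String unit) {
        if (!enabled) return 0L;
        if ("year".equals(unit))   return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 365L;
        if ("month".equals(unit))  return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L * 30L;
        if ("day".equals(unit))    return System.currentTimeMillis() - number * 1000L * 60L * 60L * 24L;
        if ("hour".equals(unit))   return System.currentTimeMillis() - number * 1000L * 60L * 60L;
        if ("minute".equals(unit)) return System.currentTimeMillis() - number * 1000L * 60L;
        return 0L;
    }

    public static void main(String[] args) {
        // The form default: treat documents loaded more than 7 days ago as stale.
        long cutoff = timeParser(true, 7, "day");
        System.out.println((System.currentTimeMillis() - cutoff) / 86400000L + " days ago"); // prints "7 days ago"
    }
}

Compared with the old recrawlIfOlderC, the fall-through case now returns 0L instead of subtracting the raw number, and a "minute" branch was added.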
@@ -1,71 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Index Creation with a Web Crawl for a Single Domain</title>
#%env/templates/metas.template%#
<script type="text/javascript" src="/js/ajax.js"></script>
<script type="text/javascript" src="/js/IndexCreate.js"></script>
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<h2>Easy Crawl Start</h2>
<p id="startCrawling">
<strong>Start Crawling Job:</strong>&nbsp;
You can define URLs as start points for Web page crawling and start crawling here.
"Crawling" means that YaCy will download the given web-site, extract all links in it
and then download the content behind these links.
This is repeated as long as specified under "Crawling Depth".
</p>
<form action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<input type="hidden" name="crawlingFilter" value=".*" />
<input type="hidden" name="crawlingIfOlderCheck" value="off" />
<input type="hidden" name="crawlingDomFilterCheck" value="off" />
<input type="hidden" name="crawlingDomMaxCheck" value="off" />
<input type="hidden" name="crawlingQ" value="off" />
<input type="hidden" name="storeHTCache" value="on" />
<input type="hidden" name="indexText" value="on" />
<input type="hidden" name="indexMedia" value="on" />
<input type="hidden" name="crawlOrder" value="on" />
<input type="hidden" name="intention" value="simple web crawl" />
<input type="hidden" name="xsstopw" value="off" />
<table border="0" cellpadding="5" cellspacing="1">
<tr class="TableHeader">
<td><strong>Attribut</strong></td>
<td><strong>Value</strong></td>
<td><strong>Description</strong></td>
</tr>
<tr valign="top" class="TableCellSummary">
<td>Starting Point:</td>
<td>
<input name="crawlingURL" type="text" size="41" maxlength="256" value="http://" onkeypress="changed()" />
<span id="robotsOK"></span><br />
<span id="title"><br/></span>
<img src="/env/grafics/empty.gif" name="ajax" alt="empty" />
</td>
<td>
Enter here the start url of the web crawl.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingDepth">Crawling Range</label>:</td>
<td>
<input type="radio" name="range" value="wide" checked="checked" />Wide: depth <input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;|&nbsp;&nbsp;
<input type="radio" name="range" value="domain" />Complete Domain
</td>
<td>
The range defines if the crawl shall consider a complete domain, or a wide crawl up to a specific depth.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td colspan="3"><input type="submit" name="crawlingstart" value="Start New Distributed Crawl" class="submitready" style="width:240px;"/></td>
</tr>
</table>
</form>
#%env/templates/footer.template%#
</body>
</html>

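The introduction of the removed page describes depth-limited crawling: download the start URL, extract its links, and repeat up to the configured "Crawling Depth". A rough illustration of that loop follows; it is not YaCy's actual crawler, and fetchAndExtractLinks is a hypothetical stand-in for the loader and parser:

// Rough sketch of the depth-limited crawling described in the removed page.
import java.util.*;

public class DepthCrawlSketch {
    static void crawl(String startUrl, int crawlingDepth) {
        Deque<Map.Entry<String, Integer>> queue = new ArrayDeque<>();
        Set<String> seen = new HashSet<>();
        queue.add(Map.entry(startUrl, 0));
        while (!queue.isEmpty()) {
            Map.Entry<String, Integer> next = queue.poll();
            if (!seen.add(next.getKey())) continue; // double-check: skip already-known URLs
            for (String link : fetchAndExtractLinks(next.getKey())) {
                // only follow links while the configured depth is not exhausted
                if (next.getValue() < crawlingDepth) queue.add(Map.entry(link, next.getValue() + 1));
            }
        }
    }

    static List<String> fetchAndExtractLinks(String url) { return List.of(); } // placeholder
}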
@@ -1,96 +0,0 @@
// IndexCreateDomainCrawl_p
// (C) 2004 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 02.12.2004 as IndexCreate_p.java on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
public class IndexCreateDomainCrawl_p {
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, @SuppressWarnings("unused") final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
//Switchboard sb = (Switchboard) env;
serverObjects prop = new serverObjects();
// define visible variables
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("crawlingFilter", env.getConfig("crawlingFilter", "0"));
int crawlingIfOlder = (int) env.getConfigLong("crawlingIfOlder", -1);
prop.put("crawlingIfOlderCheck", (crawlingIfOlder == -1) ? "0" : "1");
prop.put("crawlingIfOlderUnitYearCheck", "0");
prop.put("crawlingIfOlderUnitMonthCheck", "0");
prop.put("crawlingIfOlderUnitDayCheck", "0");
prop.put("crawlingIfOlderUnitHourCheck", "0");
prop.put("crawlingIfOlderUnitMinuteCheck", "0");
if ((crawlingIfOlder == -1) || (crawlingIfOlder == Integer.MAX_VALUE)) {
prop.put("crawlingIfOlderNumber", "-1");
prop.put("crawlingIfOlderUnitYearCheck", "1");
} else if (crawlingIfOlder >= 60*24*365) {
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*365)));
prop.put("crawlingIfOlderUnitYearCheck", "1");
} else if (crawlingIfOlder >= 60*24*30) {
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24*30)));
prop.put("crawlingIfOlderUnitMonthCheck", "1");
} else if (crawlingIfOlder >= 60*24) {
prop.put("crawlingIfOlderNumber", Math.round((float)crawlingIfOlder / (float)(60*24)));
prop.put("crawlingIfOlderUnitDayCheck", "1");
} else if (crawlingIfOlder >= 60) {
prop.put("crawlingIfOlderNumber", Math.round(crawlingIfOlder / 60f));
prop.put("crawlingIfOlderUnitHourCheck", "1");
} else {
prop.put("crawlingIfOlderNumber", crawlingIfOlder);
prop.put("crawlingIfOlderUnitMinuteCheck", "1");
}
int crawlingDomFilterDepth = (int) env.getConfigLong("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
int crawlingDomMaxPages = (int) env.getConfigLong("crawlingDomMaxPages", -1);
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", false) ? "1" : "0");
prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", false) ? "1" : "0");
prop.put("indexingTextChecked", env.getConfigBool("indexText", false) ? "1" : "0");
prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", false) ? "1" : "0");
prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", false) ? "1" : "0");
long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L);
int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0");
prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0");
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : "");
prop.put("xsstopwChecked", env.getConfigBool("xsstopw", false) ? "1" : "0");
prop.put("xdstopwChecked", env.getConfigBool("xdstopw", false) ? "1" : "0");
prop.put("xpstopwChecked", env.getConfigBool("xpstopw", false) ? "1" : "0");
// return rewrite properties
return prop;
}
}
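The removed servlet treats crawlingIfOlder as a relative age in minutes and buckets it into the largest fitting display unit, whereas the refactored timeParser in Crawler_p now produces an absolute epoch-millisecond cutoff. A standalone sketch of the old bucketing, for illustration only (the class and method names are hypothetical):

// Sketch of the removed servlet's display bucketing: an age in minutes is
// mapped to the largest unit that fits, mirroring the if/else chain above.
public class IfOlderBucketSketch {
    static String bucket(int crawlingIfOlderMinutes) {
        if (crawlingIfOlderMinutes == -1 || crawlingIfOlderMinutes == Integer.MAX_VALUE) return "-1 year";
        if (crawlingIfOlderMinutes >= 60 * 24 * 365) return Math.round((float) crawlingIfOlderMinutes / (60 * 24 * 365)) + " year";
        if (crawlingIfOlderMinutes >= 60 * 24 * 30)  return Math.round((float) crawlingIfOlderMinutes / (60 * 24 * 30)) + " month";
        if (crawlingIfOlderMinutes >= 60 * 24)       return Math.round((float) crawlingIfOlderMinutes / (60 * 24)) + " day";
        if (crawlingIfOlderMinutes >= 60)            return Math.round(crawlingIfOlderMinutes / 60f) + " hour";
        return crawlingIfOlderMinutes + " minute";
    }

    public static void main(String[] args) {
        System.out.println(bucket(60 * 24 * 7)); // prints "7 day"
    }
}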