You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
519 lines
31 KiB
519 lines
31 KiB
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >
|
|
<head>
|
|
<title>YaCy '#[clientname]#': Crawl Start</title>
|
|
#%env/templates/metas.template%#
|
|
<script type="text/javascript" src="js/ajax.js"></script>
|
|
<script type="text/javascript" src="js/IndexCreate.js"></script>
|
|
<script type="text/javascript">
|
|
//<![CDATA[
|
|
/**
 * Bring the enabled / checked / focus state of all dependent form
 * controls in line with the current radio button and checkbox selection.
 * The sections run in a fixed sequence; do not reorder them.
 * @param {String} cId id of the element that has just changed its state;
 *                     undefined when no specific element triggered the call
 *                     (e.g. the initial call after page load)
 */
function setStates(cId) {
  var isInitialCall = (typeof cId === 'undefined');
  var startPoints = [
    { radio: 'url',     field: '#crawlingURL',  others: '#sitemapURL, #crawlingFile' },
    { radio: 'sitemap', field: '#sitemapURL',   others: '#crawlingURL, #crawlingFile' },
    { radio: 'file',    field: '#crawlingFile', others: '#crawlingURL, #sitemapURL' }
  ];
  var i, entry, rangeRelevant, countryFilterOn, domMaxOn, remoteOn;
  var domainText, subpathText;

  /** Enable the matched elements when enabled is truthy, disable them otherwise. */
  function setEnabled(selector, enabled) {
    if (enabled) {
      $(selector).enable();
    } else {
      $(selector).disable();
    }
  }

  // crawl start points: only the input that belongs to the first
  // checked mode stays editable
  for (i = 0; i < startPoints.length; i += 1) {
    entry = startPoints[i];
    if ($('#' + entry.radio).isChecked()) {
      $(entry.field).enable();
      $(entry.others).disable();
      if (cId === entry.radio) { $(entry.field).focus(); }
      break;
    }
  }

  // Load Filters: evaluated when one of the range radios changed or
  // when no element id was passed
  rangeRelevant = isInitialCall || cId === "rangeDomain" ||
      cId === "rangeSubpath" || cId === "rangeWide";
  if (rangeRelevant) {
    if ($('#rangeDomain').isChecked() || $('#rangeSubpath').isChecked()) {
      // restrict to sub-path / domain
      $('#mustmatch').disable();
      // the clean-up preselection is skipped on initial load
      if (!isInitialCall) {
        $('#deleteoldoff, #deleteoldage').uncheck();
        $('#deleteoldon').check();
      }
    } else if ($('#rangeWide').isChecked()) {
      // use Filter
      $('#mustmatch').enable();
      // the clean-up preselection is skipped on initial load
      if (!isInitialCall) {
        $('#deleteoldon, #deleteoldage').uncheck();
        $('#deleteoldoff').check();
      }
      if (cId === "rangeWide") { $('#mustmatch').focus(); }
    }
  }

  // crawl start: From Link-List
  if ($("#sitelist").isChecked()) {
    domainText = 'Restrict to the domains in the link-list';
    subpathText = 'Restrict to the subpaths in the link-list';
    document.getElementById('rangeDomainDescription').innerHTML = domainText;
    document.getElementById('rangeSubpathDescription').innerHTML = subpathText;

    if ($("#rangeWide").isChecked()) {
      // we allow also #rangeSubpath
      $('#rangeDomain').check();
    }
  }

  // Delete only old
  setEnabled('#deleteIfOlderNumber, #deleteIfOlderUnit',
      $('#deleteoldage').isChecked());

  // Reload if old
  setEnabled('#reloadIfOlderNumber, #reloadIfOlderUnit',
      $('#reloadoldage').isChecked());

  // Use Must-Match List for Country Codes?
  countryFilterOn = !$('#noCountryMustMatchSwitch').isChecked();
  setEnabled('#countryMustMatchList', countryFilterOn);
  if (countryFilterOn && cId === "countryMustMatchSwitch") {
    $('#countryMustMatchList').focus();
  }

  // Maximum pages per domain
  domMaxOn = $('#crawlingDomMaxCheck').isChecked();
  setEnabled('#crawlingDomMaxPages', domMaxOn);
  if (domMaxOn && cId === "crawlingDomMaxCheck") {
    $('#crawlingDomMaxPages').focus();
  }

  // Remote crawl
  remoteOn = $('#crawlOrder').isChecked();
  setEnabled('#intention', remoteOn);
  if (remoteOn && cId === "crawlOrder") { $('#intention').focus(); }
}
|
|
|
|
/**
 * Disable the element with the given id when its current value equals val.
 * @param {String} id element id (without the leading '#')
 * @param {String} val value to compare to the element's value
 */
function disableIf(id, val) {
  var field = $('#' + id);
  if (field.val() !== val) {
    return;
  }
  field.disable();
}
|
|
|
|
$(document).ready(function() {
  (function($) {
    /** Disable a form element. */
    $.fn.disable = function() {
      return this.prop('disabled', true);
    };

    /** Enable a form element. */
    $.fn.enable = function() {
      return this.prop('disabled', false);
    };

    /**
     * Tell whether the first matched element is currently checked.
     * Only the live "checked" property is consulted: the "checked"
     * attribute mirrors the initial markup and can stay set after the
     * user has selected a different radio button, which would report a
     * stale state.
     * @returns {Boolean} true if checked, false otherwise (also for an
     *                    empty selection)
     */
    $.fn.isChecked = function() {
      return this.prop("checked") === true;
    };

    /** Set checked state for checkboxes/radio buttons. */
    $.fn.check = function() {
      return this.attr("checked", "checked").prop("checked", true);
    };

    /** Unset checked state for checkboxes/radio buttons. */
    $.fn.uncheck = function() {
      return this.removeAttr("checked").prop("checked", false);
    };
  })(jQuery);

  /**
   * On form submission disable text fields that still hold their default
   * value, so they are left out of the request; per the original note,
   * YaCy sets missing values to those defaults itself.
   * @param {eventObject} ev */
  $('#Crawler').on('submit', function(ev){
    var defaultMatchAll = "#[matchAllStr]#";
    var defaultMatchNone = "#[matchNoneStr]#";

    // remove empty textfields
    disableIf('crawlingDepthExtension', '');
    disableIf('intention', '');

    // remove if MATCH_NEVER_STRING
    disableIf('mustnotmatch', defaultMatchNone);
    disableIf('ipMustnotmatch', defaultMatchNone);
    disableIf('indexmustnotmatch', defaultMatchNone);
    disableIf('indexcontentmustnotmatch', defaultMatchNone);

    // remove if MATCH_ALL_STRING
    disableIf('mustmatch', defaultMatchAll);
    disableIf('ipMustmatch', defaultMatchAll);
    disableIf('indexmustmatch', defaultMatchAll);
    disableIf('indexcontentmustmatch', defaultMatchAll);

    // remove default collection name
    disableIf('collection', '#[defaultCollection]#');
  });

  // add event handlers to all checkboxes & radio buttons
  $(document).on('change', 'input:checkbox,input:radio', function() {
    setStates($(this).attr("id"));
  });

  // set initial states
  if ($('#crawlingURL').val() !== '') { changed(); }
  setStates();
});
|
|
//]]>
|
|
</script>
|
|
<style type="text/css">
|
|
.nobr {
|
|
white-space: nowrap;
|
|
}
|
|
</style>
|
|
</head>
|
|
<body id="IndexCreate">
|
|
|
|
#%env/templates/header.template%#
|
|
#%env/templates/submenuIndexCreate.template%#
|
|
|
|
<div id="api">
|
|
<a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API#Managing_crawl_jobs" id="apilink" target="_blank"><img src="env/grafics/api.png" width="60" height="40" alt="API"/></a>
|
|
<span>Click on this API button to see a documentation of the POST request parameter for crawl starts.</span>
|
|
</div>
|
|
|
|
<h2>Expert Crawl Start</h2>
|
|
|
|
<p id="startCrawling">
|
|
<strong>Start Crawling Job:</strong>
|
|
You can define URLs as start points for Web page crawling and start crawling here.
|
|
"Crawling" means that YaCy will download the given website, extract all links in it and then download the content behind these links.
|
|
This is repeated as long as specified under "Crawling Depth".
|
|
A crawl can also be started using wget and the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API#Managing_crawl_jobs" target="_blank">post arguments</a> for this web page.
|
|
</p>
|
|
|
|
<form id="Crawler" action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
|
|
<fieldset>
|
|
<legend>Crawl Job</legend>
|
|
<p>A Crawl Job consist of one or more start point, crawl limitations and document freshness rules.</p>
|
|
<fieldset>
|
|
<legend>Start Point</legend>
|
|
<dl>
|
|
<dt>One Start URL or a list of URLs:<br/>(must start with http:// https:// ftp:// smb:// file://)</dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">Define the start-url(s) here. You can submit more than one URL, each line one URL please.
|
|
Each of these URLs are the root for a crawl start, existing start URLs are always re-loaded.
|
|
Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option.
|
|
</span></span>
|
|
<input type="radio" align="top" name="crawlingMode" id="url" value="url" #(crawlingMode_url)#::checked="checked"#(/crawlingMode_url)# />
|
|
<textarea name="crawlingURL" id="crawlingURL" cols="64" rows="3" size="41" onkeypress="changed()">#[starturl]#</textarea>
|
|
|
|
<span id="robotsOK"></span>
|
|
<span id="title"><br/></span>
|
|
<img id="ajax" src="env/grafics/empty.gif" alt="empty" />
|
|
</dd>
|
|
<dt></dt>
|
|
<dd>
|
|
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="#[bookmarkTitle]#" readonly="readonly" style="background:transparent; border:0px"/>
|
|
</dd>
|
|
<dt>From Link-List of URL</dt>
|
|
<dd>
|
|
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
|
|
<div id="sitelistURLs"></div>
|
|
</dd>
|
|
<dt>From Sitemap</dt>
|
|
<dd>
|
|
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" #(crawlingMode_sitemap)#::checked="checked"#(/crawlingMode_sitemap)# #(has_sitemapURL)#disabled="disabled"::#(/has_sitemapURL)#/><input name="sitemapURL" id="sitemapURL" type="text" size="71" maxlength="256" value="#[sitemapURL]#"/>
|
|
</dd>
|
|
<dt>From File (enter a path<br/>within your local file system)</dt>
|
|
<dd>
|
|
<input type="radio" name="crawlingMode" id="file" value="file" #(crawlingMode_file)#::checked="checked"#(/crawlingMode_file)#/><input type="text" name="crawlingFile" id="crawlingFile" value="#[crawlingFile]#" size="71" maxlength="256"/>
|
|
</dd>
|
|
</dl>
|
|
</fieldset>
|
|
<fieldset>
|
|
<legend>Crawler Filter</legend>
|
|
<p>These are limitations on the crawl stacker. The filters will be applied before a web page is loaded.</p>
|
|
<dl>
|
|
<dt>Crawling Depth</dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
This defines how often the Crawler will follow links (of links..) embedded in websites.
|
|
0 means that only the page you enter under "Starting Point" will be added
|
|
to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will
|
|
index approximately 25.600.000.000 pages, maybe this is the whole WWW.
|
|
</span></span>
|
|
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />
|
|
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
|
|
</dd>
|
|
<dt>Unlimited crawl depth for URLs matching with</dt>
|
|
<dd>
|
|
<input name="crawlingDepthExtension" id="crawlingDepthExtension" type="text" size="40" maxlength="100" value="#[crawlingDepthExtension]#" />
|
|
</dd>
|
|
|
|
<dt>Maximum Pages per Domain</dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
You can limit the maximum number of pages that are fetched and indexed from a single domain with this option.
|
|
You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within
|
|
the given depth. Domains outside the given depth are then sorted-out anyway.
|
|
</span></span>
|
|
<label for="crawlingDomMaxCheck">Use</label>:
|
|
<input type="checkbox" name="crawlingDomMaxCheck" id="crawlingDomMaxCheck" #(crawlingDomMaxCheck)#::checked="checked"#(/crawlingDomMaxCheck)# />
|
|
<label for="crawlingDomMaxPages">Page-Count</label>:
|
|
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
|
|
</dd>
|
|
|
|
<dt><label for="Constraints">misc. Constraints</label></dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled.
|
|
However, there are sometimes web pages with static content that
|
|
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
|
|
Following frames is NOT done by Gxxg1e, but we do by default to have a richer content. 'nofollow' in robots metadata can be overridden; this does not affect obeying of the robots.txt which is never ignored.
|
|
</span></span>
|
|
Accept URLs with query-part ('?'): <input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />
|
|
Obey html-robots-noindex: <input type="checkbox" name="obeyHtmlRobotsNoindex" id="obeyHtmlRobotsNoindex" #(obeyHtmlRobotsNoindexChecked)#::checked="checked"#(/obeyHtmlRobotsNoindexChecked)# /><!--
|
|
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# /> -->
|
|
</dd>
|
|
<dt>Load Filter on URLs</dt>
|
|
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
|
|
Example: to allow only urls that contain the word 'science', set the must-match filter to '.*science.*'.
|
|
You can also use an automatic domain-restriction to fully crawl a single domain.
|
|
</span></span>
|
|
<table border="0">
|
|
<tr><td width="110"><img src="env/grafics/plus.gif" alt=""/> must-match</td><td></td></tr>
|
|
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" #(range_domain)#::checked="checked"#(/range_domain)#/><div id="rangeDomainDescription" style="display:inline">Restrict to start domain(s)</div></td></tr>
|
|
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" #(range_subpath)#::checked="checked"#(/range_subpath)#/><div id="rangeSubpathDescription" style="display:inline">Restrict to sub-path(s)</div></td></tr>
|
|
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" #(range_wide)#::checked="checked"#(/range_wide)#/>Use filter</td>
|
|
<td valign="bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onblur="if (this.value=='') this.value='.*';"/></td><td>(must not be empty)</td></tr>
|
|
<tr><td><img src="env/grafics/minus.gif" alt=""/> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
|
|
</table>
|
|
</dd>
|
|
<dt>Load Filter on IPs</dt>
|
|
<dd>
|
|
<table border="0">
|
|
<tr><td width="110"><img src="env/grafics/plus.gif" alt=""/> must-match</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" onblur="if (this.value=='') this.value='.*';"/></td><td>(must not be empty)</td></tr>
|
|
<tr><td><img src="env/grafics/minus.gif" alt=""/> must-not-match</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
|
|
</table>
|
|
</dd>
|
|
<dt><label for="countryMustMatchList">Must-Match List for Country Codes</label>
|
|
</dt>
|
|
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
Crawls can be restricted to specific countries. This uses the country code that can be computed from
|
|
the IP of the server that hosts the page. The filter is not a regular expressions but a list of country codes, separated by comma.
|
|
</span></span>
|
|
<input type="radio" name="countryMustMatchSwitch" id="noCountryMustMatchSwitch" value="0" #(countryMustMatchSwitchChecked)#checked="checked"::#(/countryMustMatchSwitchChecked)# />no country code restriction<br />
|
|
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="1" #(countryMustMatchSwitchChecked)#::checked="checked"#(/countryMustMatchSwitchChecked)# />Use filter
|
|
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" />
|
|
</dd>
|
|
</dl>
|
|
</fieldset>
|
|
<fieldset>
|
|
<legend>Document Filter</legend>
|
|
<p>These are limitations on index feeder. The filters will be applied after a web page was loaded.</p>
|
|
<dl>
|
|
<dt>Filter on URLs</dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
The filter is a <b><a href="http://download.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>
|
|
that <b>must not match</b> with the URLs to allow that the content of the url is indexed.
|
|
</span></span>
|
|
<table border="0">
|
|
<tr><td width="110"><img src="env/grafics/plus.gif" alt=""/> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/></td><td>(must not be empty)</td></tr>
|
|
<tr><td><img src="env/grafics/minus.gif" alt=""/> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
|
|
</table>
|
|
</dd>
|
|
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
|
|
<dd>
|
|
<table border="0">
|
|
<tr><td width="110"><img src="env/grafics/plus.gif" alt=""/> must-match</td><td><input name="indexcontentmustmatch" id="indexcontentmustmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustmatch]#" onblur="if (this.value=='') this.value='.*';"/></td><td>(must not be empty)</td></tr>
|
|
<tr><td><img src="env/grafics/minus.gif" alt=""/> must-not-match</td><td><input name="indexcontentmustnotmatch" id="indexcontentmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustnotmatch]#" /></td></tr>
|
|
</table>
|
|
</dd>
|
|
</dl>
|
|
</fieldset>
|
|
<fieldset>
|
|
<legend>Clean-Up before Crawl Start</legend>
|
|
<dl>
|
|
<dt>No Deletion</dt>
|
|
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
After a crawl was done in the past, document may become stale and eventually they are also deleted on the target host.
|
|
To remove old files from the search index it is not sufficient to just consider them for re-load but it may be necessary
|
|
to delete them because they simply do not exist any more. Use this in combination with re-crawl while this time should be longer.
|
|
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" #(deleteold_off)#::checked="checked"#(/deleteold_off)#/>Do not delete any document before the crawl is started.</dd>
|
|
<dt>Delete sub-path</dt>
|
|
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" #(deleteold_on)#::checked="checked"#(/deleteold_on)#/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
|
|
<dt>Delete only old</dt>
|
|
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" #(deleteold_age)#::checked="checked"#(/deleteold_age)#/>Treat documents that are loaded
|
|
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
|
|
#(deleteIfOlderSelect)#::
|
|
#{list}#<option value="#[name]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
|
#(/deleteIfOlderSelect)#
|
|
</select>
|
|
<select name="deleteIfOlderUnit" id="deleteIfOlderUnit">
|
|
#(deleteIfOlderUnitSelect)#::
|
|
#{list}#<option value="#[value]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
|
#(/deleteIfOlderUnitSelect)#
|
|
</select> ago as stale and delete them before the crawl is started.
|
|
</dd>
|
|
</dl>
|
|
</fieldset>
|
|
<fieldset>
|
|
<legend>Double-Check Rules</legend>
|
|
<dl>
|
|
<dt>No Doubles</dt>
|
|
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
A web crawl performs a double-check on all links found in the internet against the internal database. If the same url is found again,
|
|
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
|
|
to use that check the 're-load' option.
|
|
</span></span><input type="radio" name="recrawl" id="reloadoldoff" value="nodoubles" #(recrawl_nodoubles)#::checked="checked"#(/recrawl_nodoubles)#/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
|
|
<dt>Re-load</dt>
|
|
<dd><input type="radio" name="recrawl" id="reloadoldage" value="reload" #(recrawl_reload)#::checked="checked"#(/recrawl_reload)#/>Treat documents that are loaded
|
|
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
|
|
#(reloadIfOlderSelect)#::
|
|
#{list}#<option value="#[name]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
|
#(/reloadIfOlderSelect)#
|
|
</select>
|
|
<select name="reloadIfOlderUnit" id="reloadIfOlderUnit">
|
|
#(reloadIfOlderUnitSelect)#::
|
|
#{list}#<option value="#[value]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
|
|
#(/reloadIfOlderUnitSelect)#
|
|
</select> ago as stale and load them again. If they are younger, they are ignored.
|
|
</dd>
|
|
</dl>
|
|
</fieldset>
|
|
<fieldset>
|
|
<legend>Document Cache</legend>
|
|
<dl><dt><label for="storeHTCache">Store to Web Cache</label></dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
|
|
</span></span>
|
|
<input type="checkbox" name="storeHTCache" id="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# />
|
|
</dd>
|
|
|
|
<dt><label>Policy for usage of Web Cache</label></dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
The caching policy states when to use the cache during crawling:
|
|
<b>no cache</b>: never use the cache, all content from fresh internet source;
|
|
<b>if fresh</b>: use the cache if the cache exists and is fresh using the proxy-fresh rules;
|
|
<b>if exist</b>: use the cache if the cache exist. Do no check freshness. Otherwise use online source;
|
|
<b>cache only</b>: never go online, use all content from cache. If no cache exist, treat content as unavailable
|
|
</span></span>
|
|
<input type="radio" name="cachePolicy" value="nocache" #(cachePolicy_nocache)#::checked="checked"#(/cachePolicy_nocache)#/>no cache
|
|
<input type="radio" name="cachePolicy" value="iffresh" #(cachePolicy_iffresh)#::checked="checked"#(/cachePolicy_iffresh)# />if fresh
|
|
<input type="radio" name="cachePolicy" value="ifexist" #(cachePolicy_ifexist)#::checked="checked"#(/cachePolicy_ifexist)#/>if exist
|
|
<input type="radio" name="cachePolicy" value="cacheonly" #(cachePolicy_cacheonly)#::checked="checked"#(/cachePolicy_cacheonly)#/>cache only
|
|
</dd>
|
|
</dl>
|
|
</fieldset>
|
|
#(agentSelect)#<input type="hidden" name="agentName" id="agentName" value="#[defaultAgentName]#" />::
|
|
<fieldset>
|
|
<legend>Robot Behaviour</legend>
|
|
<dl>
|
|
<dt><label for="agentName">Use Special User Agent and robot identification</label></dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances
|
|
(like the GSA) the user must be able to crawl all web pages that are granted to such commercial plattforms.
|
|
Not having this option would be a strong handicap for professional usage of this software. Therefore you are able to select
|
|
alternative user agents here which have different crawl timings and also identify itself with another user agent and obey the corresponding robots rule.
|
|
</span></span>
|
|
<select name="agentName" id="agentName">
|
|
#{list}#
|
|
<option value="#[name]#">#[name]#</option>
|
|
#{/list}#
|
|
</select>
|
|
</dd>
|
|
</dl>
|
|
</fieldset>
|
|
#(/agentSelect)#
|
|
<fieldset>
|
|
<legend>Index Administration</legend>
|
|
<dl>
|
|
<dt>Do Local Indexing</dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
This enables indexing of the webpages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the
|
|
Document Cache without indexing.
|
|
</span></span>
|
|
<label for="indexText">index text</label>:
|
|
<input type="checkbox" name="indexText" id="indexText" #(indexingTextChecked)#::checked="checked"#(/indexingTextChecked)# />
|
|
<label for="indexMedia">index media</label>:
|
|
<input type="checkbox" name="indexMedia" id="indexMedia" #(indexingMediaChecked)#::checked="checked"#(/indexingMediaChecked)# />
|
|
</dd>
|
|
|
|
<dt><label for="crawlOrder">Do Remote Indexing</label></dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
If checked, the crawler will contact other peers and use them as remote indexers for your crawl.
|
|
If you need your crawling results locally, you should switch this off.
|
|
Only senior and principal peers can initiate or receive remote crawls.
|
|
<strong>A YaCyNews message will be created to inform all peers about a global crawl</strong>,
|
|
so they can omit starting a crawl with the same start point.
|
|
</span></span>
|
|
<table border="0">
|
|
<tr>
|
|
<td>
|
|
<input type="checkbox" name="crawlOrder" id="crawlOrder" #(crawlOrderChecked)#::checked="checked"#(/crawlOrderChecked)# />
|
|
</td>
|
|
<td>
|
|
<label for="intention">Describe your intention to start this global crawl (optional)</label>:<br />
|
|
<input name="intention" id="intention" type="text" size="40" maxlength="100" value="#[intention]#" /><br />
|
|
This message will appear in the 'Other Peer Crawl Start' table of other peers.
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
</dd>
|
|
|
|
<dt><label for="collection">Add Crawl result to collection(s)</label></dt>
|
|
<dd>
|
|
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
|
|
A crawl result can be tagged with names which are candidates for a collection request.
|
|
These tags can be selected with the <a href="gsa/search?q=www&amp;site=#[collection]#">GSA interface</a> using the 'site' operator.
|
|
To use this option, the 'collection_sxt'-field must be switched on in the <a href="IndexFederated_p.html">Solr Schema</a>
|
|
</span></span>
|
|
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
|
|
</dd>
|
|
|
|
</dl>
|
|
</fieldset>
|
|
|
|
<dl>
<dt><input type="hidden" name="crawlingstart" value="1"/><input type="submit" value="Start New Crawl Job" class="btn btn-primary"/></dt><dd></dd>
|
|
</dl>
|
|
</fieldset>
|
|
</form>
|
|
|
|
#%env/templates/footer.template%#
|
|
</body>
|
|
</html>
|