enabling all crawl profiles in all network modes

also: increased default internet crawl speed to
4 URLs/s/host
pull/402/head
Michael Peter Christen 4 years ago
parent 32ca669bfb
commit d0abb0cedb

@ -56,13 +56,13 @@
if ($("#sitelist").isChecked()) {
document.getElementById('rangeDomainDescription').innerHTML ='Restrict to the domains in the link-list';
document.getElementById('rangeSubpathDescription').innerHTML ='Restrict to the subpaths in the link-list';
if ($("#rangeWide").isChecked()) {
// we allow also #rangeSubpath
$('#rangeDomain').check();
}
}
// Delete only old
if ($('#deleteoldage').isChecked()) {
$('#deleteIfOlderNumber, #deleteIfOlderUnit').enable();
@ -100,15 +100,15 @@
// Remote crawl
var remoteCrawlerDisabledInfo = document.getElementById('remoteCrawlerDisabledInfo');
if ($('#crawlOrder').isChecked()) {
if(remoteCrawlerDisabledInfo != null) {
remoteCrawlerDisabledInfo.className = '';
}
if(remoteCrawlerDisabledInfo != null) {
remoteCrawlerDisabledInfo.className = '';
}
$('#intention').enable();
if (cId === "crawlOrder") { $('#intention').focus(); }
} else {
if(remoteCrawlerDisabledInfo != null) {
remoteCrawlerDisabledInfo.className = 'hidden';
}
if(remoteCrawlerDisabledInfo != null) {
remoteCrawlerDisabledInfo.className = 'hidden';
}
$('#intention').disable();
}
}
@ -206,23 +206,23 @@
//]]>
</script>
<style type="text/css">
.nobr {
white-space: nowrap;
}
.nobr {
white-space: nowrap;
}
</style>
</head>
<body id="IndexCreate">
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
<div id="api">
<a href="http://www.yacy-websearch.net/wiki/index.php/Dev:APICrawler" id="apilink" target="_blank"><img src="env/grafics/api.png" width="60" height="40" alt="API"/></a>
<span>Click on this API button to see a documentation of the POST request parameter for crawl starts.</span>
</div>
<h2>Expert Crawl Start</h2>
<p id="startCrawling">
<strong>Start Crawling Job:</strong>&nbsp;
You can define URLs as start points for Web page crawling and start crawling here.
@ -230,7 +230,7 @@
This is repeated as long as specified under "Crawling Depth".
A crawl can also be started using wget and the <a href="http://www.yacy-websearch.net/wiki/index.php/Dev:APICrawler" target="_blank">post arguments</a> for this web page.
</p>
<form id="Crawler" action="Crawler_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>Crawl Job</legend>
@ -248,30 +248,30 @@
<textarea name="crawlingURL" id="crawlingURL" cols="64" rows="3" onkeydown="changed()">#[starturl]#</textarea>
&nbsp;
<span id="robotsOK"></span>
<span id="title"><br/></span>
<img id="ajax" src="env/grafics/empty.gif" alt="empty" />
</dd>
<dt></dt>
<dd>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="#[bookmarkTitle]#" readonly="readonly" style="background:transparent; border:0px"/>
</dd>
<dt>From Link-List of URL</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
<span id="title"><br/></span>
<img id="ajax" src="env/grafics/empty.gif" alt="empty" />
</dd>
<dt></dt>
<dd>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="46" maxlength="256" value="#[bookmarkTitle]#" readonly="readonly" style="background:transparent; border:0px"/>
</dd>
<dt>From Link-List of URL</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitelist" value="sitelist" #(has_url)#disabled="disabled"::#(/has_url)# #(crawlingMode_sitelist)#::checked="checked"#(/crawlingMode_sitelist)#/><br />
<div id="sitelistURLs"></div>
<button id="expandSiteListBtn" style="visibility:hidden" type="button" onclick="this.disabled = true;loadInfos(true);" class="btn btn-default btn-xs" title="Show all links">
<span class="glyphicon glyphicon-option-horizontal"></span>
<span class="glyphicon glyphicon-option-horizontal"></span>
</button>
</dd>
<dt>From Sitemap</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" #(crawlingMode_sitemap)#::checked="checked"#(/crawlingMode_sitemap)# #(has_sitemapURL)#disabled="disabled"::#(/has_sitemapURL)#/><input name="sitemapURL" id="sitemapURL" type="text" size="71" maxlength="256" value="#[sitemapURL]#"/>
</dd>
<dt>From File (enter a path<br/>within your local file system)</dt>
<dd>
<input type="radio" name="crawlingMode" id="file" value="file" #(crawlingMode_file)#::checked="checked"#(/crawlingMode_file)#/><input type="text" name="crawlingFile" id="crawlingFile" value="#[crawlingFile]#" size="71" maxlength="256"/>
</dd>
</dl>
</dd>
<dt>From Sitemap</dt>
<dd>
<input type="radio" name="crawlingMode" id="sitemap" value="sitemap" #(crawlingMode_sitemap)#::checked="checked"#(/crawlingMode_sitemap)# #(has_sitemapURL)#disabled="disabled"::#(/has_sitemapURL)#/><input name="sitemapURL" id="sitemapURL" type="text" size="71" maxlength="256" value="#[sitemapURL]#"/>
</dd>
<dt>From File (enter a path<br/>within your local file system)</dt>
<dd>
<input type="radio" name="crawlingMode" id="file" value="file" #(crawlingMode_file)#::checked="checked"#(/crawlingMode_file)#/><input type="text" name="crawlingFile" id="crawlingFile" value="#[crawlingFile]#" size="71" maxlength="256"/>
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Crawler Filter</legend>
@ -305,7 +305,7 @@
<label for="crawlingDomMaxPages">Page-Count</label>:
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
</dd>
<dt><label>misc. Constraints</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
@ -319,29 +319,29 @@
Obey html-robots-nofollow: <input type="checkbox" name="obeyHtmlRobotsNofollow" id="obeyHtmlRobotsNofollow" #(obeyHtmlRobotsNofollowChecked)#::checked="checked"#(/obeyHtmlRobotsNofollowChecked)# /><!--<br/>
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />-->
</dd>
<dt>Media Type detection</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type checking info"/>
<span style="right:0px; width:30em;" id="mediaTypeCheckingInfo">
Not loading URLs with unsupported file extension is faster but less accurate.
Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
<ul>
<li><a href="https://en.wikipedia.org/wiki/.de" target="_blank">https://en.wikipedia.org/wiki/.de</a> : the .de extension is unknown, but the actual Media Type of this page is text/html</li>
<li><a href="https://en.wikipedia.org/wiki/Ask.com" target="_blank">https://en.wikipedia.org/wiki/Ask.com</a> : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html</li>
<li><a href="https://commons.wikimedia.org/wiki/File:YaCy_logo.png" target="_blank">https://commons.wikimedia.org/wiki/File:YaCy_logo.png</a> : the .png extension is a supported image format, but the actual Media Type of this page is text/html</li>
</ul>
</span>
</div>
<label>
<input type="radio" aria-describedby="mediaTypeCheckingInfo" name="crawlerAlwaysCheckMediaType" value="false" #(crawlerAlwaysCheckMediaType)#checked="checked"::#(/crawlerAlwaysCheckMediaType)# /> Do not load URLs with an unsupported file extension
</label>
<label>
<input type="radio" name="crawlerAlwaysCheckMediaType" value="true" #(crawlerAlwaysCheckMediaType)#::checked="checked"#(/crawlerAlwaysCheckMediaType)# /> Always cross check file extension against Content-Type header
</label>
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
<dt>Media Type detection</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type checking info"/>
<span style="right:0px; width:30em;" id="mediaTypeCheckingInfo">
Not loading URLs with unsupported file extension is faster but less accurate.
Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
<ul>
<li><a href="https://en.wikipedia.org/wiki/.de" target="_blank">https://en.wikipedia.org/wiki/.de</a> : the .de extension is unknown, but the actual Media Type of this page is text/html</li>
<li><a href="https://en.wikipedia.org/wiki/Ask.com" target="_blank">https://en.wikipedia.org/wiki/Ask.com</a> : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html</li>
<li><a href="https://commons.wikimedia.org/wiki/File:YaCy_logo.png" target="_blank">https://commons.wikimedia.org/wiki/File:YaCy_logo.png</a> : the .png extension is a supported image format, but the actual Media Type of this page is text/html</li>
</ul>
</span>
</div>
<label>
<input type="radio" aria-describedby="mediaTypeCheckingInfo" name="crawlerAlwaysCheckMediaType" value="false" #(crawlerAlwaysCheckMediaType)#checked="checked"::#(/crawlerAlwaysCheckMediaType)# /> Do not load URLs with an unsupported file extension
</label>
<label>
<input type="radio" name="crawlerAlwaysCheckMediaType" value="true" #(crawlerAlwaysCheckMediaType)#::checked="checked"#(/crawlerAlwaysCheckMediaType)# /> Always cross check file extension against Content-Type header
</label>
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
Example: to allow only urls that contain the word 'science', set the must-match filter to '.*science.*'.
You can also use an automatic domain-restriction to fully crawl a single domain.
@ -349,159 +349,159 @@
</span></span>
<table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td></td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" #(range_domain)#::checked="checked"#(/range_domain)#/><div id="rangeDomainDescription" style="display:inline">Restrict to start domain(s)</div></td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" #(range_subpath)#::checked="checked"#(/range_subpath)#/><div id="rangeSubpathDescription" style="display:inline">Restrict to sub-path(s)</div></td></tr>
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" #(range_wide)#::checked="checked"#(/range_wide)#/>Use filter</td>
<td style="vertical-align: bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeDomain" value="domain" #(range_domain)#::checked="checked"#(/range_domain)#/><div id="rangeDomainDescription" style="display:inline">Restrict to start domain(s)</div></td></tr>
<tr><td colspan="2"><input type="radio" name="range" id="rangeSubpath" value="subpath" #(range_subpath)#::checked="checked"#(/range_subpath)#/><div id="rangeSubpathDescription" style="display:inline">Restrict to sub-path(s)</div></td></tr>
<tr><td><input type="radio" name="range" id="rangeWide" value="wide" #(range_wide)#::checked="checked"#(/range_wide)#/>Use filter</td>
<td style="vertical-align: bottom"><input name="mustmatch" id="mustmatch" type="text" size="55" maxlength="100000" value="#[mustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Load Filter on URL origin of links</dt>
<dd>
<span class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="info"/>
<span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
Example: to allow loading only links from pages on example.org domain, set the must-match filter to '.*example.org.*'.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span>
</span>
<table style="border-width: 0px">
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustMatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td>
</tr>
<tr>
<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td><input name="crawlerOriginURLMustNotMatch" id="crawlerOriginURLMustNotMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustNotMatch]#" /></td>
</tr>
</table>
</dd>
<dt>Load Filter on IPs</dt>
<dd>
</table>
</dd>
<dt>Load Filter on URL origin of links</dt>
<dd>
<span class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="info"/>
<span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
Example: to allow loading only links from pages on example.org domain, set the must-match filter to '.*example.org.*'.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span>
</span>
<table style="border-width: 0px">
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustMatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td>
</tr>
<tr>
<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td><input name="crawlerOriginURLMustNotMatch" id="crawlerOriginURLMustNotMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustNotMatch]#" /></td>
</tr>
</table>
</dd>
<dt>Load Filter on IPs</dt>
<dd>
<table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="ipMustmatch" id="ipMustmatch" type="text" size="55" maxlength="100000" value="#[ipMustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
</table>
</dd>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="55" maxlength="100000" value="#[ipMustnotmatch]#" /></td></tr>
</table>
</dd>
<dt><label>Must-Match List for Country Codes</label>
</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by comma.
</span></span>
<input type="radio" name="countryMustMatchSwitch" id="noCountryMustMatchSwitch" value="0" #(countryMustMatchSwitchChecked)#checked="checked"::#(/countryMustMatchSwitchChecked)# />no country code restriction<br />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="1" #(countryMustMatchSwitchChecked)#::checked="checked"#(/countryMustMatchSwitchChecked)# />Use filter&nbsp;&nbsp;
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" />
</dd>
</dl>
<input type="radio" name="countryMustMatchSwitch" id="noCountryMustMatchSwitch" value="0" #(countryMustMatchSwitchChecked)#checked="checked"::#(/countryMustMatchSwitchChecked)# />no country code restriction<br />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="1" #(countryMustMatchSwitchChecked)#::checked="checked"#(/countryMustMatchSwitchChecked)# />Use filter&nbsp;&nbsp;
<input name="countryMustMatchList" id="countryMustMatchList" type="text" size="60" maxlength="256" value="#[countryMustMatch]#" />
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Document Filter</legend>
<p>These are limitations on index feeder. The filters will be applied after a web page was loaded.</p>
<dl>
<dt>Filter on URLs</dt>
<dd>
<dt>Filter on URLs</dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>
that <b>must not match</b> with the URLs to allow that the content of the url is indexed.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span></span>
<table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
<dd>
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
<dd>
<table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexcontentmustmatch" id="indexcontentmustmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexcontentmustnotmatch" id="indexcontentmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Filter on Document Media Type (aka MIME type)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type filter info"/>
<span style="right:0px;" id="mediaTypeMustMatchInfo">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>
that <b>must match</b> with the document Media Type (also known as MIME Type) to allow the URL to be indexed.
Standard Media Types are described at the <a href="https://www.iana.org/assignments/media-types/media-types.xhtml" target="_blank">IANA registry</a>.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span>
</div>
<table style="border-width: 0px">
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td><input name="indexMediaTypeMustMatch" id="indexMediaTypeMustMatch" type="text" size="55" maxlength="100000" value="#[indexMediaTypeMustMatch]#" aria-describedby="mediaTypeMustMatchInfo" /></td>
</tr>
<tr>
<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td><input name="indexMediaTypeMustNotMatch" id="indexMediaTypeMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexMediaTypeMustNotMatch]#" /></td>
</tr>
</table>
</dd>
<dt>Solr query filter on any active <a href="IndexSchema_p.html" target="_blank">indexed</a> field(s)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Solr query filter info"/>
<span style="right:0px;" id="indexSolrQueryInfo">
Each parsed document is checked against the given Solr query before being added to the index.
The query must be written in respect to the <a href="https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html#the-standard-query-parser" target="_blank">standard</a> Solr query syntax.
</span>
</div>
<table style="border-width: 0px" role="presentation">
#(embeddedSolrConnected)#
<tr>
<td>
<div class="info"><p>The embedded local Solr index must be connected to use this kind of filter.</p>
<p>You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</p></div>
</td>
</tr>
::
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td>
<input name="indexSolrQueryMustMatch" id="indexSolrQueryMustMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
<tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
#(/embeddedSolrConnected)#
</table>
</dd>
</dl>
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexcontentmustmatch" id="indexcontentmustmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexcontentmustnotmatch" id="indexcontentmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Filter on Document Media Type (aka MIME type)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type filter info"/>
<span style="right:0px;" id="mediaTypeMustMatchInfo">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>
that <b>must match</b> with the document Media Type (also known as MIME Type) to allow the URL to be indexed.
Standard Media Types are described at the <a href="https://www.iana.org/assignments/media-types/media-types.xhtml" target="_blank">IANA registry</a>.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span>
</div>
<table style="border-width: 0px">
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td><input name="indexMediaTypeMustMatch" id="indexMediaTypeMustMatch" type="text" size="55" maxlength="100000" value="#[indexMediaTypeMustMatch]#" aria-describedby="mediaTypeMustMatchInfo" /></td>
</tr>
<tr>
<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td><input name="indexMediaTypeMustNotMatch" id="indexMediaTypeMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexMediaTypeMustNotMatch]#" /></td>
</tr>
</table>
</dd>
<dt>Solr query filter on any active <a href="IndexSchema_p.html" target="_blank">indexed</a> field(s)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Solr query filter info"/>
<span style="right:0px;" id="indexSolrQueryInfo">
Each parsed document is checked against the given Solr query before being added to the index.
The query must be written in respect to the <a href="https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html#the-standard-query-parser" target="_blank">standard</a> Solr query syntax.
</span>
</div>
<table style="border-width: 0px" role="presentation">
#(embeddedSolrConnected)#
<tr>
<td>
<div class="info"><p>The embedded local Solr index must be connected to use this kind of filter.</p>
<p>You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</p></div>
</td>
</tr>
::
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td>
<input name="indexSolrQueryMustMatch" id="indexSolrQueryMustMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
<tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
#(/embeddedSolrConnected)#
</table>
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Content Filter</legend>
<p>These are limitations on parts of a document. The filter will be applied after a web page was loaded.</p>
<dl>
<dt>Filter div or nav class names</dt>
<dd>
<dt>Filter div or nav class names</dt>
<dd>
<table style="border-width: 0px">
<tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out</td></tr>
</table>
</dd>
</dl>
<tr><td style="width:110px">set of CSS class names</td><td><input name="ignoreclassname" id="ignoreclassname" type="text" size="55" maxlength="100000" value="#[ignoreclassname]#" onblur="if (this.value=='') this.value='';"/></td><td>comma-separated list of &lt;div&gt; or &lt;nav&gt; element class names which should be filtered out</td></tr>
</table>
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Clean-Up before Crawl Start</legend>
<dl>
<dt><label for="cleanSearchCache">Clean up search events cache</label></dt>
<dd>
<input type="checkbox" name="cleanSearchCache" id="cleanSearchCache" #(cleanSearchCacheChecked)#::checked="checked"#(/cleanSearchCacheChecked)# aria-describedby="cleanSearchCacheInfo"/>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Clean up search events cache info"/>
<span style="right:0px;" id="cleanSearchCacheInfo">
Check this option to be sure to get fresh search results including newly crawled documents. Beware that it will also interrupt any refreshing/resorting of search results currently requested from browser-side.
</span>
</div>
<input type="checkbox" name="cleanSearchCache" id="cleanSearchCache" #(cleanSearchCacheChecked)#::checked="checked"#(/cleanSearchCacheChecked)# aria-describedby="cleanSearchCacheInfo"/>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Clean up search events cache info"/>
<span style="right:0px;" id="cleanSearchCacheInfo">
Check this option to be sure to get fresh search results including newly crawled documents. Beware that it will also interrupt any refreshing/resorting of search results currently requested from browser-side.
</span>
</div>
</dd>
<dt>No Deletion</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
@ -511,9 +511,9 @@
</span></span><input type="radio" name="deleteold" id="deleteoldoff" value="off" #(deleteold_off)#::checked="checked"#(/deleteold_off)#/>Do not delete any document before the crawl is started.</dd>
<dt>Delete sub-path</dt>
<dd><input type="radio" name="deleteold" id="deleteoldon" value="on" #(deleteold_on)#::checked="checked"#(/deleteold_on)#/>For each host in the start url list, delete all documents (in the given subpath) from that host.</dd>
<dt>Delete only old</dt>
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" #(deleteold_age)#::checked="checked"#(/deleteold_age)#/>Treat documents that are loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
<dt>Delete only old</dt>
<dd><input type="radio" name="deleteold" id="deleteoldage" value="age" #(deleteold_age)#::checked="checked"#(/deleteold_age)#/>Treat documents that are loaded
<select name="deleteIfOlderNumber" id="deleteIfOlderNumber">
#(deleteIfOlderSelect)#::
#{list}#<option value="#[name]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
#(/deleteIfOlderSelect)#
@ -523,8 +523,8 @@
#{list}#<option value="#[value]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
#(/deleteIfOlderUnitSelect)#
</select> ago as stale and delete them before the crawl is started.
</dd>
</dl>
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Double-Check Rules</legend>
@ -535,8 +535,8 @@
then the url is treated as double when you check the 'no doubles' option. A url may be loaded again when it has reached a specific age,
to use that check the 're-load' option.
</span></span><input type="radio" name="recrawl" id="reloadoldoff" value="nodoubles" #(recrawl_nodoubles)#::checked="checked"#(/recrawl_nodoubles)#/>Never load any page that is already known. Only the start-url may be loaded again.</dd>
<dt>Re-load</dt>
<dd><input type="radio" name="recrawl" id="reloadoldage" value="reload" #(recrawl_reload)#::checked="checked"#(/recrawl_reload)#/>Treat documents that are loaded
<dt>Re-load</dt>
<dd><input type="radio" name="recrawl" id="reloadoldage" value="reload" #(recrawl_reload)#::checked="checked"#(/recrawl_reload)#/>Treat documents that are loaded
<select name="reloadIfOlderNumber" id="reloadIfOlderNumber">
#(reloadIfOlderSelect)#::
#{list}#<option value="#[name]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
@ -547,8 +547,8 @@
#{list}#<option value="#[value]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>#{/list}#
#(/reloadIfOlderUnitSelect)#
</select> ago as stale and load them again. If they are younger, they are ignored.
</dd>
</dl>
</dd>
</dl>
</fieldset>
<fieldset>
<legend>Document Cache</legend>
@ -569,36 +569,35 @@
<b>if&nbsp;exist</b>: use the cache if the cache exists. Do not check freshness. Otherwise use online source;
<b>cache&nbsp;only</b>: never go online, use all content from cache. If no cache exists, treat content as unavailable
</span></span>
<input type="radio" name="cachePolicy" value="nocache" #(cachePolicy_nocache)#::checked="checked"#(/cachePolicy_nocache)#/>no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" #(cachePolicy_iffresh)#::checked="checked"#(/cachePolicy_iffresh)# />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" #(cachePolicy_ifexist)#::checked="checked"#(/cachePolicy_ifexist)#/>if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" #(cachePolicy_cacheonly)#::checked="checked"#(/cachePolicy_cacheonly)#/>cache&nbsp;only
</dd>
</dl>
<input type="radio" name="cachePolicy" value="nocache" #(cachePolicy_nocache)#::checked="checked"#(/cachePolicy_nocache)#/>no&nbsp;cache&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="iffresh" #(cachePolicy_iffresh)#::checked="checked"#(/cachePolicy_iffresh)# />if&nbsp;fresh&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="ifexist" #(cachePolicy_ifexist)#::checked="checked"#(/cachePolicy_ifexist)#/>if&nbsp;exist&nbsp;&nbsp;&nbsp;
<input type="radio" name="cachePolicy" value="cacheonly" #(cachePolicy_cacheonly)#::checked="checked"#(/cachePolicy_cacheonly)#/>cache&nbsp;only
</dd>
</dl>
</fieldset>
#(agentSelect)#<input type="hidden" name="agentName" id="agentName" value="#[defaultAgentName]#" />::
<fieldset>
<legend>Robot Behaviour</legend>
<dl>
<dt><label>Use Special User Agent and robot identification</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
You are running YaCy in non-p2p mode and because YaCy can be used as replacement for commercial search appliances
(like the GSA) the user must be able to crawl all web pages that are granted to such commercial plattforms.
Because YaCy can be used as replacement for commercial search appliances
(like the Google Search Appliance aka GSA) the user must be able to crawl all web pages that are granted to such commercial platforms.
Not having this option would be a strong handicap for professional usage of this software. Therefore you are able to select
alternative user agents here which have different crawl timings and also identify themselves with another user agent and obey the corresponding robots rules.
</span></span>
<select name="agentName" id="agentName">
#{list}#
<select name="agentName" id="agentName">
#{list}#
<option value="#[name]#" #(default)#::selected="selected"#(/default)#>#[name]#</option>
#{/list}#
</select>
</dd>
</dl>
#{/list}#
</select>
</dd>
</dl>
</fieldset>
#(/agentSelect)#
#(vocabularySelect)#::
#(vocabularySelect)#::
<fieldset>
<legend>Enrich Vocabulary</legend>
<dl>
@ -609,18 +608,18 @@
</span></span>
<table class="table table-condensed">
<tr><th>Vocabulary</th><th>Class</th></tr>
#{vocabularyset}#
<tr>
<td>#[name]#</td>
<td><input name="vocabulary_#[name]#_class" id="vocabulary_#[name]#_class" type="text" size="55" maxlength="1028" value="#[value]#" /></td>
</tr>
#{vocabularyset}#
<tr>
<td>#[name]#</td>
<td><input name="vocabulary_#[name]#_class" id="vocabulary_#[name]#_class" type="text" size="55" maxlength="1028" value="#[value]#" /></td>
</tr>
#{/vocabularyset}#
</table>
</dd>
</dl>
</dd>
</dl>
</fieldset>
#(/vocabularySelect)#
#(/vocabularySelect)#
<fieldset>
<legend>Snapshot Creation</legend>
<dl>
@ -633,12 +632,12 @@
means a snapshot is only generated if the crawl depth of a document is smaller or equal to the given number here. If the number is set to -1,
no snapshots are generated.
</span></span>
<input type="text" name="snapshotsMaxDepth" id="snapshotsMaxDepth" size="2" maxlength="2" value="#[snapshotsMaxDepth]#" />
</dd>
<input type="text" name="snapshotsMaxDepth" id="snapshotsMaxDepth" size="2" maxlength="2" value="#[snapshotsMaxDepth]#" />
</dd>
<dt><label>Multiple Snapshot Versions</label></dt>
<dd>
<input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp;
<input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
<input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
</dd>
<dt><label for="snapshotsMustnotmatch">must-not-match filter for snapshot generation</label></dt>
<dd><input name="snapshotsMustnotmatch" id="snapshotsMustnotmatch" type="text" size="55" maxlength="100000" value="#[snapshotsMustnotmatch]#" /></dd>
@ -647,14 +646,14 @@
<dt><label>Image Creation</label></dt>
<dd>
<div class="info">Only XML snapshots can be generated, as the <a href="https://wkhtmltopdf.org/" target="_blank">wkhtmltopdf</a> util is not found by YaCy on your system.
It is required to generate PDF snapshots from crawled pages that can then be converted to images.</div>
It is required to generate PDF snapshots from crawled pages that can then be converted to images.</div>
</dd>::
<dt><label>Image Creation</label></dt>
<dd>
<input type="checkbox" name="snapshotsLoadImage" id="snapshotsLoadImage"#(snapshotsLoadImageChecked)#:: checked="checked"#(/snapshotsLoadImageChecked)#/>
</dd>
#(/snapshotEnableImages)#
</dl>
#(/snapshotEnableImages)#
</dl>
</fieldset>
<fieldset>
<legend>Index Attributes</legend>
@ -683,11 +682,11 @@
<table style="border-width: 0px">
#(remoteCrawlerDisabled)#::
<tr #(crawlOrderChecked)#class="hidden"::#(/crawlOrderChecked)# id="remoteCrawlerDisabledInfo">
<td colspan="2"><div class="info"><p>Remote crawl results won't be added to the local index as the remote crawler is disabled on this peer.</p>
<p>You can activate it in the <a href="RemoteCrawl_p.html">Remote Crawl Configuration</a> page.</p></div>
</td>
</tr>
#(/remoteCrawlerDisabled)#
<td colspan="2"><div class="info"><p>Remote crawl results won't be added to the local index as the remote crawler is disabled on this peer.</p>
<p>You can activate it in the <a href="RemoteCrawl_p.html">Remote Crawl Configuration</a> page.</p></div>
</td>
</tr>
#(/remoteCrawlerDisabled)#
<tr>
<td>
<input type="checkbox" name="crawlOrder" id="crawlOrder" #(crawlOrderChecked)#::checked="checked"#(/crawlOrderChecked)#/>
@ -709,9 +708,9 @@
These tags can be selected with the <a href="gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator.
To use this option, the 'collection_sxt'-field must be switched on in the <a href="IndexFederated_p.html">Solr Schema</a>
</span></span>
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
</dd>
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
</dd>
<dt><label for="collection">Time Zone Offset</label></dt>
<dd>
<span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
@ -720,18 +719,18 @@
from dates without time zones to UTC, this offset must be given here. The offset is given in minutes;
Time zone offsets for locations east of UTC must be negative; offsets for zones west of UTC must be positive.
</span></span>
<input id="timezoneOffset" type="text" size="4" maxlength="4" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
</dd>
</dl>
<input id="timezoneOffset" type="text" size="4" maxlength="4" name="timezoneOffset" value=""><script>document.getElementById("timezoneOffset").value = new Date().getTimezoneOffset();</script>
</dd>
</dl>
</fieldset>
<dl>
<dl>
<dt><input type="hidden" name="crawlingstart" value="1"/><input type="submit" value="Start New Crawl Job" class="btn btn-primary"/></dt><dd></dd>
</dl>
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -71,7 +71,7 @@ public class CrawlStartExpert {
} else {
prop.put("starturl", "");
}
// sitemap URL
if (post != null && post.containsKey("sitemapURL")) {
final String sitemapURL = post.get("sitemapURL", "");
@ -83,7 +83,7 @@ public class CrawlStartExpert {
} else {
prop.put("sitemapURL", "");
}
// crawling file
if (post != null && post.containsKey("crawlingFile")) {
final String crawlingFile = post.get("crawlingFile", "");
@ -144,12 +144,10 @@ public class CrawlStartExpert {
} else {
prop.put("bookmarkTitle", "");
}
// ---------- Crawling filter
final int crawlingDomMaxPages = env.getConfigInt(
"crawlingDomMaxPages", -1);
final int crawlingDomMaxPages = env.getConfigInt("crawlingDomMaxPages", -1);
// crawling depth
if (post != null && post.containsKey("crawlingDepth")) {
final Integer depth = post.getInt("crawlingDepth", -1);
@ -213,13 +211,13 @@ public class CrawlStartExpert {
prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
}
// always cross-check URL file extension against actual Media Type ?
if (post == null) {
prop.put("crawlerAlwaysCheckMediaType", true);
} else {
prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
}
if (post == null) {
prop.put("crawlerAlwaysCheckMediaType", true);
} else {
prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
}
// Load Filter on URLs (range)
if (post != null && post.containsKey("range")) {
@ -248,22 +246,22 @@ public class CrawlStartExpert {
} else {
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
// Filter on URL origin of links: must match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
}
// Filter on URL origin of links: must-not-match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// Filter on URL origin of links: must match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
}
// Filter on URL origin of links: must-not-match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// Load Filter on IPs: must match
if (post != null && post.containsKey("ipMustmatch")) {
@ -329,45 +327,45 @@ public class CrawlStartExpert {
} else {
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
// Filter on Media Type of Document: must match
if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)) {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
}
// Filter on Media Type of Document: must-not-match
if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)) {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// Filter with a Solr syntax query
/* Check that the embedded local Solr index is connected, as its schema is required to apply the eventual Solr filter query */
final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
prop.put("embeddedSolrConnected", embeddedSolrConnected);
if(embeddedSolrConnected) {
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
}
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
}
}
// Filter on Media Type of Document: must match
if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)) {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
}
// Filter on Media Type of Document: must-not-match
if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)) {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// Filter with a Solr syntax query
/* Check that the embedded local Solr index is connected, as its schema is required to apply the eventual Solr filter query */
final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
prop.put("embeddedSolrConnected", embeddedSolrConnected);
if(embeddedSolrConnected) {
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
}
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
}
}
// ---------- Clean-Up before Crawl Start
@ -433,18 +431,17 @@ public class CrawlStartExpert {
} else {
prop.put("deleteIfOlderUnitSelect_list_2_default", 1);
}
// clean up search events cache ?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
} else {
/*
* no parameter passed : the checkbox is proposed unchecked
* when JavaScript search resort is enabled, as it heavily relies on search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
/*
* no parameter passed : the checkbox is proposed unchecked
* when JavaScript search resort is enabled, as it heavily relies on search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
}
// delete any document before the crawl is started?
@ -547,7 +544,7 @@ public class CrawlStartExpert {
prop.put("storeHTCacheChecked",
post.getBoolean("storeHTCache") ? 1 : 0);
}
// Policy for usage of Web Cache
if (post != null && post.containsKey("cachePolicy")) {
final String cachePolicy = post.get("cachePolicy", "");
@ -565,35 +562,29 @@ public class CrawlStartExpert {
}
// ---------- Agent name
if (sb.isP2PMode()) {
prop.put("agentSelect", 0);
} else {
prop.put("agentSelect", 1);
List<String> agentNames = new ArrayList<String>();
if (sb.isIntranetMode()) {
agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
}
if (sb.isGlobalMode()) {
agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
}
agentNames.add(ClientIdentification.googleAgentName);
if (sb.isAllIPMode()) {
agentNames.add(ClientIdentification.browserAgentName);
if (ClientIdentification.getAgent(ClientIdentification.customAgentName) != null) agentNames.add(ClientIdentification.customAgentName);
}
String defaultAgentName = agentNames.get(0);
if (post != null && post.containsKey("agentName")) {
String agentName = post.get("agentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
if (agentNames.contains(agentName)) defaultAgentName = agentName;
}
for (int i = 0; i < agentNames.size(); i++) {
prop.put("agentSelect_list_" + i + "_name", agentNames.get(i));
prop.put("agentSelect_list_" + i + "_default", agentNames.get(i).equals(defaultAgentName) ? 1 : 0);
}
prop.put("agentSelect_list", agentNames.size());
List<String> agentNames = new ArrayList<String>();
if (sb.isIntranetMode()) {
agentNames.add(ClientIdentification.yacyIntranetCrawlerAgentName);
}
if (sb.isGlobalMode()) {
agentNames.add(ClientIdentification.yacyInternetCrawlerAgentName);
}
agentNames.add(ClientIdentification.googleAgentName);
if (sb.isAllIPMode()) {
agentNames.add(ClientIdentification.browserAgentName);
if (ClientIdentification.getAgent(ClientIdentification.customAgentName) != null) agentNames.add(ClientIdentification.customAgentName);
}
prop.put("agentSelect_defaultAgentName", ClientIdentification.yacyInternetCrawlerAgentName);
String defaultAgentName = agentNames.get(0);
if (post != null && post.containsKey("agentName")) {
String agentName = post.get("agentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
if (agentNames.contains(agentName)) defaultAgentName = agentName;
}
for (int i = 0; i < agentNames.size(); i++) {
prop.put("list_" + i + "_name", agentNames.get(i));
prop.put("list_" + i + "_default", agentNames.get(i).equals(defaultAgentName) ? 1 : 0);
}
prop.put("list", agentNames.size());
prop.put("defaultAgentName", sb.isIntranetMode() ? ClientIdentification.yacyIntranetCrawlerAgentName : ClientIdentification.yacyInternetCrawlerAgentName);
// ---------- Ignore Class Name
if (post != null && post.containsKey("ignoreclassname")) {
@ -602,7 +593,7 @@ public class CrawlStartExpert {
} else {
prop.put("ignoreclassname", "");
}
// ---------- Enrich Vocabulary
Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
if (vocs.size() == 0) {
@ -618,7 +609,7 @@ public class CrawlStartExpert {
}
prop.put("vocabularySelect_vocabularyset", count);
}
// ---------- Snapshot generation
boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
//boolean convertAvailable = Html2Image.convertAvailable();
@ -643,9 +634,9 @@ public class CrawlStartExpert {
// Do Remote Indexing?
if (sb.isP2PMode()) {
prop.put("remoteindexing", 1);
prop.put("remoteindexing_remoteCrawlerDisabled",
!sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
prop.put("remoteindexing_remoteCrawlerDisabled_crawlOrderChecked", env.getConfigBool("crawlOrder", true));
prop.put("remoteindexing_remoteCrawlerDisabled",
!sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
prop.put("remoteindexing_remoteCrawlerDisabled_crawlOrderChecked", env.getConfigBool("crawlOrder", true));
prop.put("remoteindexing_crawlOrderChecked", env.getConfigBool("crawlOrder", true));
prop.put("remoteindexing_intention", "");
} else {
@ -658,9 +649,9 @@ public class CrawlStartExpert {
post.getBoolean("indexMedia") ? 1 : 0);
if (sb.isP2PMode()) {
prop.put("remoteindexing", 1);
prop.put("remoteindexing_remoteCrawlerDisabled",
!sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
prop.put("remoteindexing_remoteCrawlerDisabled_crawlOrderChecked", post.getBoolean("crawlOrder"));
prop.put("remoteindexing_remoteCrawlerDisabled",
!sb.getConfigBool(SwitchboardConstants.CRAWLJOB_REMOTE, false));
prop.put("remoteindexing_remoteCrawlerDisabled_crawlOrderChecked", post.getBoolean("crawlOrder"));
prop.put("remoteindexing_crawlOrderChecked", post.getBoolean("crawlOrder"));
prop.put("remoteindexing_intention", post.get("intention", ""));
} else {
@ -681,7 +672,7 @@ public class CrawlStartExpert {
prop.put("collection", collectionEnabled ? defaultCollection : "");
}
}
// return rewrite properties
return prop;
}

@ -33,7 +33,7 @@ public class ClientIdentification {
public static final int clientTimeoutInit = 10000;
public static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
public static final int minimumGlobalDeltaInit = 500; // the minimum time difference between access of the same global domain
public static final int minimumGlobalDeltaInit = 250; // the minimum time difference between access of the same global domain
public static class Agent {
public final String userAgent; // the name that is send in http request to identify the agent

Loading…
Cancel
Save