Issue #156: new option to clean up (or not) search cache on crawl start

Also prevent unnecessary search event cache clean-up on each access to
the crawl monitor page (Crawler_p.html).
pull/167/head
luccioman 7 years ago
parent eeb5fbb160
commit 519fc9a600

@@ -381,6 +381,16 @@
<fieldset>
<legend>Clean-Up before Crawl Start</legend>
<dl>
<dt><label for="cleanSearchCache">Clean up search events cache</label></dt>
<dd>
<input type="checkbox" name="cleanSearchCache" id="cleanSearchCache" #(cleanSearchCacheChecked)#::checked="checked"#(/cleanSearchCacheChecked)# aria-describedby="cleanSearchCacheInfo"/>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Clean up search events cache info"/>
<span style="right:0px;" id="cleanSearchCacheInfo">
Check this option to make sure that search results are fresh and include newly crawled documents. Be aware that it will also interrupt any refreshing or resorting of search results currently requested from the browser side.
</span>
</div>
</dd>
<dt>No Deletion</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
After a crawl was done in the past, documents may become stale and may eventually also be deleted on the target host.
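For readers unfamiliar with YaCy's server-side templates: a #(key)#A::B#(/key)# block renders part A when the servlet sets the property to 0/false and part B when it sets it to 1/true, which is how cleanSearchCacheChecked toggles the checked attribute of the checkbox above. A minimal Java stand-in for that selection (a simplified sketch, not YaCy's actual template engine):

public class CleanSearchCacheCheckboxSketch {
    // Mimics #(cleanSearchCacheChecked)#::checked="checked"#(/cleanSearchCacheChecked)#
    static String renderCheckbox(boolean cleanSearchCacheChecked) {
        final String conditional = cleanSearchCacheChecked ? "checked=\"checked\"" : "";
        return "<input type=\"checkbox\" name=\"cleanSearchCache\" id=\"cleanSearchCache\" "
                + conditional + "/>";
    }

    public static void main(String[] args) {
        System.out.println(renderCheckbox(true));  // pre-checked checkbox
        System.out.println(renderCheckbox(false)); // unchecked checkbox
    }
}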

@@ -35,6 +35,7 @@ import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.LibraryProvider;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -364,6 +365,19 @@ public class CrawlStartExpert {
} else {
prop.put("deleteIfOlderUnitSelect_list_2_default", 1);
}
// clean up the search events cache?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
} else {
/*
* no parameter passed: the checkbox is presented unchecked
* when JavaScript search resort is enabled, as that feature relies heavily on the search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
}
// delete any document before the crawl is started?
if (post != null && post.containsKey("deleteold")) {
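The default above simply inverts the JavaScript-resort setting: while JavaScript resorting of results is enabled it keeps working from the cached SearchEvent, so the clean-up is not pre-selected. A standalone sketch of that rule (jsResortEnabled stands in for the SEARCH_JS_RESORT configuration value; this is an illustration, not the servlet code):

public class CleanSearchCacheDefaultSketch {
    // Default state of the "Clean up search events cache" checkbox when no
    // cleanSearchCache parameter is passed to CrawlStartExpert.
    static boolean defaultChecked(boolean jsResortEnabled) {
        // JavaScript result resorting relies on the cached SearchEvent,
        // so the clean-up is left unchecked by default while it is enabled.
        return !jsResortEnabled;
    }

    public static void main(String[] args) {
        System.out.println(defaultChecked(false)); // true  -> checkbox pre-checked
        System.out.println(defaultChecked(true));  // false -> checkbox left unchecked
    }
}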

@@ -86,6 +86,7 @@
<input type="hidden" name="recrawl" id="recrawl" value="reload" />
<input type="hidden" name="reloadIfOlderNumber" id="reloadIfOlderNumber" value="3" />
<input type="hidden" name="reloadIfOlderUnit" id="reloadIfOlderUnit" value="day" />
<input type="hidden" name="cleanSearchCache" id="cleanSearchCache" value=#(cleanSearchCacheChecked)#"off"::"on"#(/cleanSearchCacheChecked)# />
<input type="hidden" name="deleteold" id="deleteold" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />

@@ -74,20 +74,18 @@ import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
* This servlet does NOT create the Crawler servlet page content! It controls
* the start of a web crawl and the crawl monitor page (Crawler_p.html). The interfaces for entering the web crawl parameters are
* in CrawlStartSite.html and CrawlStartExpert.html.
*/
public class Crawler_p {
// this servlet does NOT create the Crawler servlet page content!
// this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
// inital values for AJAX Elements (without JavaScript)
final serverObjects prop = new serverObjects();
prop.put("rejected", 0);
@@ -220,6 +218,12 @@ public class Crawler_p {
if (sb.peers == null) {
prop.put("info", "3");
} else {
if (post.getBoolean("cleanSearchCache")) {
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
}
// remove crawlingFileContent before we record the call
String crawlingFileName = post.get("crawlingFile");
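With this change, cleaning the search events cache happens only when a crawl start explicitly asks for it; merely opening Crawler_p.html no longer clears anything. Below is a hypothetical crawl start that opts out of the clean-up, assuming a local peer on YaCy's default port 8090 with authentication omitted; apart from cleanSearchCache, the parameter names and the minimal parameter set are assumptions based on the crawl start form and may be incomplete:

import java.net.HttpURLConnection;
import java.net.URL;

public class StartCrawlKeepSearchCache {
    public static void main(String[] args) throws Exception {
        // cleanSearchCache=off keeps cached search events alive, e.g. an ongoing
        // browser-side resort of results; cleanSearchCache=on restores the old behaviour.
        URL url = new URL("http://localhost:8090/Crawler_p.html"
                + "?crawlingMode=url"
                + "&crawlingURL=http%3A%2F%2Fexample.org%2F"
                + "&cleanSearchCache=off");
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        System.out.println("HTTP " + connection.getResponseCode());
    }
}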

@@ -34,6 +34,7 @@
<input type="hidden" name="mustnotmatch" value="(.*Recentchangeslinked.*)|(.*Whatlinkshere.*)|(.*MediaWiki.*)" />
<input type="hidden" name="range" value="subpath" />
<input type="hidden" name="crawlingIfOlderCheck" value="on"/>
<input type="hidden" name="cleanSearchCache" id="cleanSearchCache" value=#(cleanSearchCacheChecked)#"off"::"on"#(/cleanSearchCacheChecked)# />
<input type="hidden" name="crawlingIfOlderNumber" value="1" />
<input type="hidden" name="crawlingIfOlderUnit" value="day" />
<input type="hidden" name="crawlingDomFilterCheck" value="off" />

@@ -26,6 +26,7 @@
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -45,6 +46,18 @@ public class Load_MediawikiWiki {
}
prop.put("starturl", "http://");
prop.put("address", a);
// hidden form parameter: clean up the search events cache?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
} else {
/*
* no parameter passed: default to no search event cache clean-up
* when JavaScript search resort is enabled, as that feature relies heavily on the search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
}
// return rewrite properties
return prop;

@@ -45,6 +45,7 @@
<input type="hidden" name="mustmatch" value=".*" />
<input type="hidden" name="mustnotmatch" value=".*memberlist.*|.*previous.*|.*next.*|.*start=.*|.*p=.*" />
<input type="hidden" name="range" value="subpath" />
<input type="hidden" name="cleanSearchCache" id="cleanSearchCache" value=#(cleanSearchCacheChecked)#"off"::"on"#(/cleanSearchCacheChecked)# />
<input type="hidden" name="crawlingIfOlderCheck" value="on"/>
<input type="hidden" name="crawlingIfOlderNumber" value="1" />
<input type="hidden" name="crawlingIfOlderUnit" value="day" />

@@ -44,6 +44,18 @@ public class Load_PHPBB3 {
final String repository = "http://" + a + "/";
prop.put("starturl", (intranet) ? repository : "http://");
prop.put("address", a);
// hidden form parameter: clean up the search events cache?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
} else {
/*
* no parameter passed: default to no search event cache clean-up
* when JavaScript search resort is enabled, as that feature relies heavily on the search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
}
// return rewrite properties
return prop;
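Unlike the visible checkbox on the expert page, which submits nothing at all when unchecked, the hidden inputs in the MediaWiki and phpBB3 quick-start forms always submit a value, so the template emits an explicit "off" or "on" that Crawler_p later reads with post.getBoolean("cleanSearchCache"). A tiny stand-in for that round trip (the parser below is an assumption about how checkbox-style values are interpreted and does not reproduce serverObjects.getBoolean):

public class CleanSearchCacheRoundTripSketch {
    // Assumed stand-in for serverObjects.getBoolean: usual HTML form values map to true.
    static boolean getBoolean(String value) {
        return "on".equalsIgnoreCase(value) || "true".equalsIgnoreCase(value) || "1".equals(value);
    }

    public static void main(String[] args) {
        // cleanSearchCacheChecked = false -> the hidden field renders value="off"
        System.out.println(getBoolean("off")); // false: keep the search events cache
        // cleanSearchCacheChecked = true  -> the hidden field renders value="on"
        System.out.println(getBoolean("on"));  // true: clean up before the crawl starts
    }
}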
