Added a specific default crawl profile for the recrawl job.

- with only a light constraint on the load date of already indexed documents,
as it can already be controlled by the selection query, and the goal of the
job is indeed to recrawl the selected documents now
- using the IFFRESH cache strategy
pull/154/head
luccioman 7 years ago
parent adf3fa493d
commit b712a0671e
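To make the intended division of labor concrete: the recrawl job's Solr selection query does the precise date filtering, while the profile built in this commit only keeps a coarse one-hour limit as a safety net. A minimal sketch, where the query string and variable names are illustrative, not part of this commit:

// Sketch only: the selection query picks the documents to recrawl ...
String selectionQuery = "load_date_dt:[* TO NOW-1DAY]"; // hypothetical example: documents loaded more than a day ago
// ... while the profile's recrawl date acts as a light default lower bound (one hour, see below).
java.util.Date profileLimit = CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE);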

@@ -61,6 +61,7 @@ public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_RECRAWL_JOB = "recrawlJob";
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
@@ -88,7 +89,13 @@ public final class CrawlSwitchboard {
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";
/** Default time cycle in minutes before an indexed URL by a given crawl profile can be accepted for recrawl */
/**
* The default recrawl time cycle in minutes for recrawl jobs. The recrawl date
* limit can be set up by the recrawl job selection query, but a default limit
* prevents unwanted overload on targets.
*/
public static final long CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE = 60L; // one hour
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; // one day
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
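For reference, a minimal sketch of how such a cycle value becomes a concrete date limit, assuming CrawlProfile.getRecrawlDate() subtracts the given number of minutes from the current time (the method itself is real and is used in the RecrawlBusyThread hunk below; this body is a sketch):

public static Date getRecrawlDate(final long oldTimeMinutes) {
    // subtract the cycle, given in minutes, from the current time
    return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
}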
@@ -104,7 +111,7 @@ public final class CrawlSwitchboard {
private final Map<String, RowHandleSet> profilesActiveCrawlsCounter;
public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile;
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile;
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile, defaultRecrawlJobProfile;
private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile
private final File queuesRoot;
private Switchboard switchboard;
@@ -466,6 +473,13 @@ public final class CrawlSwitchboard {
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
// generate new default entry for RecrawlBusyThread
this.defaultRecrawlJobProfile = RecrawlBusyThread.buildDefaultCrawlProfile();
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRecrawlJobProfile.handle()),
this.defaultRecrawlJobProfile);
// generate new default entry for greedy learning
this.defaultTextGreedyLearningProfile =
new CrawlProfile(

@@ -34,6 +34,8 @@ import org.apache.solr.common.SolrDocumentList;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
@@ -183,7 +185,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
int added = 0;
if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
final CrawlProfile profile = sb.crawler.defaultRecrawlJobProfile;
for (final DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true);
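The profile swap above also changes the caching behavior of these requests: the snippet profile used before is set to CacheStrategy.IFEXIST (see the CrawlSwitchboard hunk above), which accepts any existing cache entry, while the dedicated recrawl profile is built with CacheStrategy.IFFRESH, which accepts a cache entry only while it is still fresh. A rough sketch of the distinction, where the cachedEntry variable and isExpired() helper are hypothetical:

// IFEXIST (previous behavior): any cached copy is good enough.
boolean useCacheIfExist = cachedEntry != null;
// IFFRESH (new behavior): a stale cache entry forces a fresh network load,
// which is exactly what a recrawl job wants.
boolean useCacheIfFresh = cachedEntry != null && !cachedEntry.isExpired(); // isExpired() is hypothetical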
@@ -302,6 +304,28 @@ public class RecrawlBusyThread extends AbstractBusyThread {
}
return true;
}
/**
* @return a new default CrawlProfile instance to be used for recrawl jobs.
*/
public static CrawlProfile buildDefaultCrawlProfile() {
    CrawlProfile profile = new CrawlProfile(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
            CrawlProfile.MATCH_ALL_STRING, // crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING, // crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING, // indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
            0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
            "robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
            ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
    return profile;
}
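As shown in the CrawlSwitchboard hunk above, this factory method is consumed once at startup, roughly:

// Usage sketch, mirroring the CrawlSwitchboard change above:
this.defaultRecrawlJobProfile = RecrawlBusyThread.buildDefaultCrawlProfile();
this.profilesActiveCrawls.put(
        UTF8.getBytes(this.defaultRecrawlJobProfile.handle()),
        this.defaultRecrawlJobProfile);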
@Override
public int getJobCount() {
