Added a specific default crawl profile for the recrawl job.

- with only a light constraint on the load date of already indexed documents,
as it can already be controlled by the selection query, and the goal of the
job is indeed to recrawl the selected documents now
- using the IFFRESH cache strategy
pull/154/head
luccioman 7 years ago
parent adf3fa493d
commit b712a0671e
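To make the intended division of labor concrete: the recrawl job's Solr selection query does the precise date filtering, while the profile built in this commit only keeps a coarse one-hour limit as a safety net. A minimal sketch, where the query string and variable names are illustrative, not part of this commit:

// Sketch only: the selection query picks the documents to recrawl ...
String selectionQuery = "load_date_dt:[* TO NOW-1DAY]"; // hypothetical example: documents loaded more than a day ago
// ... while the profile's recrawl date acts as a light default lower bound (one hour, see below).
java.util.Date profileLimit = CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE);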

@@ -61,6 +61,7 @@ public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep";
public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow";
public static final String CRAWL_PROFILE_RECRAWL_JOB = "recrawlJob";
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
@@ -88,7 +89,13 @@ public final class CrawlSwitchboard {
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive1.heap";
/** Default time cycle in minutes before an indexed URL by a given crawl profile can be accepted for recrawl */
/**
* The default recrawl time cycle in minutes for recrawl jobs. The recrawl date
* limit can be set up by the recrawl job selection query, but a default limit
* prevents unwanted overload on targets.
*/
public static final long CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE = 60L; // one hour
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; // one day
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; // 30 days
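For reference, a minimal sketch of how such a cycle value becomes a concrete date limit, assuming CrawlProfile.getRecrawlDate() subtracts the given number of minutes from the current time (the method itself is real and is used in the RecrawlBusyThread hunk below; this body is a sketch):

public static Date getRecrawlDate(final long oldTimeMinutes) {
    // subtract the cycle, given in minutes, from the current time
    return new Date(System.currentTimeMillis() - (60000L * oldTimeMinutes));
}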
@@ -104,7 +111,7 @@ public final class CrawlSwitchboard {
private final Map<String, RowHandleSet> profilesActiveCrawlsCounter;
public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile;
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile;
public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile, defaultRecrawlJobProfile;
private Map<String, CrawlProfile> defaultPushProfiles; // for each collection one profile
private final File queuesRoot;
private Switchboard switchboard;
@@ -466,6 +473,13 @@ public final class CrawlSwitchboard {
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
// generate new default entry for RecrawlBusyThread
this.defaultRecrawlJobProfile = RecrawlBusyThread.buildDefaultCrawlProfile();
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRecrawlJobProfile.handle()),
this.defaultRecrawlJobProfile);
// generate new default entry for greedy learning
this.defaultTextGreedyLearningProfile =
new CrawlProfile(

@@ -34,6 +34,8 @@ import org.apache.solr.common.SolrDocumentList;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL;
@@ -183,7 +185,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
int added = 0;
if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
final CrawlProfile profile = sb.crawler.defaultRecrawlJobProfile;
for (final DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true);
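The profile swap above also changes the caching behavior of these requests: the snippet profile used before is set to CacheStrategy.IFEXIST (see the CrawlSwitchboard hunk above), which accepts any existing cache entry, while the dedicated recrawl profile is built with CacheStrategy.IFFRESH, which accepts a cache entry only while it is still fresh. A rough sketch of the distinction, where the cachedEntry variable and isExpired() helper are hypothetical:

// IFEXIST (previous behavior): any cached copy is good enough.
boolean useCacheIfExist = cachedEntry != null;
// IFFRESH (new behavior): a stale cache entry forces a fresh network load,
// which is exactly what a recrawl job wants.
boolean useCacheIfFresh = cachedEntry != null && !cachedEntry.isExpired(); // isExpired() is hypothetical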
@@ -302,6 +304,28 @@ public class RecrawlBusyThread extends AbstractBusyThread {
}
return true;
}
/**
* @return a new default CrawlProfile instance to be used for recrawl jobs.
*/
public static CrawlProfile buildDefaultCrawlProfile() {
    CrawlProfile profile = new CrawlProfile(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
            CrawlProfile.MATCH_ALL_STRING, // crawlerUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING, // crawlerIpMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerIpMustNotMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerCountryMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // crawlerNoDepthLimitMatch
            CrawlProfile.MATCH_ALL_STRING, // indexUrlMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
            CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
            CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
            0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
            true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow
            true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
            "robot_" + CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB,
            ClientIdentification.yacyInternetCrawlerAgentName, null, null, 0);
    return profile;
}
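As shown in the CrawlSwitchboard hunk above, this factory method is consumed once at startup, roughly:

// Usage sketch, mirroring the CrawlSwitchboard change above:
this.defaultRecrawlJobProfile = RecrawlBusyThread.buildDefaultCrawlProfile();
this.profilesActiveCrawls.put(
        UTF8.getBytes(this.defaultRecrawlJobProfile.handle()),
        this.defaultRecrawlJobProfile);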
@Override
public int getJobCount() {
