From 1728cd30c69875ca9539b45610090851c088183e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ryszard=20Go=C5=84?= Date: Tue, 12 Jan 2016 16:28:34 +0100 Subject: [PATCH] Create autocrawl profiles --- defaults/yacy.init | 9 +++ source/net/yacy/crawler/CrawlSwitchboard.java | 76 ++++++++++++++++++- .../net/yacy/search/SwitchboardConstants.java | 9 +++ 3 files changed, 92 insertions(+), 2 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 2525beda4..4b9907589 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -538,6 +538,15 @@ proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1 proxyURL.rewriteURLs=domainlist proxyURL.useforresults=false +# Autocrawl configuration +autocrawl=false +autocrawl.index.text=true +autocrawl.index.media=true +autocrawl.deep.depth=3 +autocrawl.deep.recrawl=43200 +autocrawl.shallow.depth=1 +autocrawl.shallow.recrawl=1440 + # From the 'IndexCreate' menu point you can also define a crawling start point. # The crawling works the same way as the prefetch, but it is possible to # assign a different crawling depth. 
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 6a1c5476b..d2101eee7 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -58,7 +58,9 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; public final class CrawlSwitchboard { - + + public static final String CRAWL_PROFILE_AUTOCRAWL_DEEP = "autocrawlDeep"; + public static final String CRAWL_PROFILE_AUTOCRAWL_SHALLOW = "autocrawlShallow"; public static final String CRAWL_PROFILE_PROXY = "proxy"; public static final String CRAWL_PROFILE_REMOTE = "remote"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText"; @@ -70,6 +72,8 @@ public final class CrawlSwitchboard { public static Set DEFAULT_PROFILES = new HashSet(); static { + DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_DEEP); + DEFAULT_PROFILES.add(CRAWL_PROFILE_AUTOCRAWL_SHALLOW); DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY); DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE); DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT); @@ -98,6 +102,7 @@ public final class CrawlSwitchboard { private final Map profilesActiveCrawlsCounter; public CrawlProfile defaultProxyProfile, defaultRemoteProfile, defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; public CrawlProfile defaultTextGreedyLearningProfile, defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile, defaultSurrogateProfile; + public CrawlProfile defaultAutocrawlDeepProfile, defaultAutocrawlShallowProfile; private Map defaultPushProfiles; // for each collection one profile private final File queuesRoot; private Switchboard switchboard; @@ -268,8 +273,75 @@ public final class CrawlSwitchboard { private void initActiveCrawlProfiles() { - // generate new default entry for proxy crawling final Switchboard sb = Switchboard.getSwitchboard(); + + // generate new default entry for deep auto crawl + 
this.defaultAutocrawlDeepProfile = + new CrawlProfile( + CRAWL_PROFILE_AUTOCRAWL_DEEP, + CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch + CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")), + true, + CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_RECRAWL, "43200"))), + -1, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, + sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true), + sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true), + false, + false, + -1, + false, true, CrawlProfile.MATCH_NEVER_STRING, + CacheStrategy.NOCACHE, + "robot_" + CRAWL_PROFILE_AUTOCRAWL_DEEP, + ClientIdentification.yacyInternetCrawlerAgentName, + null, + 0); + this.profilesActiveCrawls.put( + UTF8.getBytes(this.defaultAutocrawlDeepProfile.handle()), + this.defaultAutocrawlDeepProfile); + // generate new default entry for shallow auto crawl + this.defaultAutocrawlShallowProfile = + new CrawlProfile( + CRAWL_PROFILE_AUTOCRAWL_SHALLOW, + CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch + CrawlProfile.MATCH_ALL_STRING, 
//indexUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")), + true, + CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_RECRAWL, "1440"))), + -1, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, + sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_TEXT, true), + sb.getConfigBool(SwitchboardConstants.AUTOCRAWL_INDEX_MEDIA, true), + false, + false, + -1, + false, true, CrawlProfile.MATCH_NEVER_STRING, + CacheStrategy.NOCACHE, + "robot_" + CRAWL_PROFILE_AUTOCRAWL_SHALLOW, + ClientIdentification.yacyInternetCrawlerAgentName, + null, + 0); + this.profilesActiveCrawls.put( + UTF8.getBytes(this.defaultAutocrawlShallowProfile.handle()), + this.defaultAutocrawlShallowProfile); + // generate new default entry for proxy crawling this.defaultProxyProfile = new CrawlProfile( CRAWL_PROFILE_PROXY, diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 0c7f979a5..d11fa1776 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -308,6 +308,15 @@ public final class SwitchboardConstants { * @see Switchboard#PROXY_CACHE_LAYOUT_HASH */ public static final String PROXY_YACY_ONLY = "proxyYacyOnly"; + + public static final String AUTOCRAWL = "autocrawl"; + public static final String AUTOCRAWL_INDEX_TEXT = "autocrawl.index.text"; + public static final String AUTOCRAWL_INDEX_MEDIA = "autocrawl.index.media"; + public static final String AUTOCRAWL_DEEP_DEPTH = "autocrawl.deep.depth"; + public static final String AUTOCRAWL_DEEP_RECRAWL = "autocrawl.deep.recrawl"; + public static final String AUTOCRAWL_SHALLOW_DEPTH = "autocrawl.shallow.depth"; + 
public static final String AUTOCRAWL_SHALLOW_RECRAWL = "autocrawl.shallow.recrawl"; + ////////////////////////////////////////////////////////////////////////////////////////////// // Cluster settings