diff --git a/defaults/yacy.init b/defaults/yacy.init index e94de3478..c7e219a6a 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -557,6 +557,12 @@ xpstopw=true # Change to false if requesting hits from peers with modified stopwords-file and using the unchanged client-version filterOutStopwordsFromTopwords=true +# crawling steering: must-match/must-not-match +crawlingIPMustMatch=.* +crawlingIPMustNotMatch= +# the default country codes are all codes for countries in Europe +crawlingCountryMustMatch=AD,AL,AT,BA,BE,BG,BY,CH,CY,CZ,DE,DK,EE,ES,FI,FO,FR,GG,GI,GR,HR,HU,IE,IM,IS,IT,JE,LI,LT,LU,LV,MC,MD,MK,MT,NL,NO,PL,PT,RO,RU,SE,SI,SJ,SK,SM,TR,UA,UK,VA,YU + # performance-settings # delay-times for permanent loops (milliseconds) # the idlesleep is the pause that an proces sleeps if the last call to the diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 3f229bbcd..d1b0e7e00 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -86,8 +86,8 @@ public class CrawlProfileEditor_p { static { labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING)); - labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); @@ -159,8 +159,8 @@ public class CrawlProfileEditor_p { if ((post != null) && (selentry != null)) { if (post.containsKey("submit")) { try { - Pattern.compile(post.get(CrawlProfile.FILTER_MUSTMATCH, CrawlProfile.MATCH_ALL)); - Pattern.compile(post.get(CrawlProfile.FILTER_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER)); + Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL)); + Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER)); final Iterator lit = labels.iterator(); eentry tee; while (lit.hasNext()) { diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index 5000e0fd8..91ffbb23f 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -136,7 +136,7 @@ - : + : Use filter  
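The three new yacy.init keys above default to a catch-all IP must-match filter (.*), an empty IP must-not-match filter, and a list of European country codes. The must-match and must-not-match values are treated as regular expressions and, as the CrawlProfileEditor_p hunk shows, are checked with Pattern.compile before being accepted. As a rough standalone sketch of that validate-or-fall-back idea (the class and method names below are illustrative, not YaCy API):

import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

// Illustrative sketch, not part of the patch: keep a configured filter string only if it
// compiles as a regular expression, otherwise fall back to a safe default such as ".*"
// (the value used for crawlingIPMustMatch above).
public final class FilterRegexCheck {

    static String validOrDefault(final String regex, final String fallback) {
        if (regex == null) return fallback;
        try {
            Pattern.compile(regex); // same check CrawlProfileEditor_p applies to the URL filters
            return regex;
        } catch (final PatternSyntaxException e) {
            return fallback;
        }
    }

    public static void main(final String[] args) {
        System.out.println(validOrDefault(".*", ".*"));   // valid pattern -> kept
        System.out.println(validOrDefault("[a-", ".*"));  // broken pattern -> falls back to ".*"
    }
}
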
@@ -151,7 +151,7 @@ - : + : @@ -162,6 +162,37 @@ If you don't know what this means, please leave this field empty. + + : + + + + + Like the MUST-Match Filter for URLs, this filter must match, but it is applied to the IP of the host. + YaCy performs a DNS lookup for each host, and this filter restricts the crawl to specific IPs. + + + + : + + + + + This filter must not match on the IP of the crawled host. + + + + : + + Use filter   + + no country code restriction + + + Crawls can be restricted to specific countries. This uses the country code that can be computed from + the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by commas. + + Maximum Pages per Domain: diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index 4bca093ca..812f17f59 100644 --- a/htroot/CrawlStartExpert_p.java +++ b/htroot/CrawlStartExpert_p.java @@ -9,7 +9,7 @@ // $LastChangedBy: orbiter $ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -25,32 +25,36 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import net.yacy.cora.protocol.RequestHeader; +import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import de.anomic.crawler.CrawlProfile; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; public class CrawlStartExpert_p { - + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements - //final Switchboard sb = (Switchboard) env; + final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); - + // define visible variables prop.put("starturl", /*(intranet) ? repository :*/ "http://"); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0))); prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL); prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER); - + prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL)); + prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER)); + prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", "")); + prop.put("crawlingIfOlderCheck", "0"); prop.put("crawlingIfOlderUnitYearCheck", "0"); prop.put("crawlingIfOlderUnitMonthCheck", "0"); prop.put("crawlingIfOlderUnitDayCheck", "1"); prop.put("crawlingIfOlderUnitHourCheck", "0"); prop.put("crawlingIfOlderNumber", "7"); - + final int crawlingDomFilterDepth = env.getConfigInt("crawlingDomFilterDepth", -1); prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1"); prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth); @@ -62,18 +66,18 @@ public class CrawlStartExpert_p { prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0"); prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0"); prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", true) ? "1" : "0"); - + final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L); final int LCppm = (LCbusySleep == 0) ? 
1000 : (int) (60000L / LCbusySleep); prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0"); prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0"); prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0"); prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : ""); - + prop.put("xsstopwChecked", env.getConfigBool("xsstopw", true) ? "1" : "0"); prop.put("xdstopwChecked", env.getConfigBool("xdstopw", true) ? "1" : "0"); prop.put("xpstopwChecked", env.getConfigBool("xpstopw", true) ? "1" : "0"); - + // return rewrite properties return prop; } diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index d703ad38a..0dadc3ce4 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -156,6 +156,14 @@ public class Crawler_p { String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted + String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL); + final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER); + if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL; + final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("countryMustMatchList", "") : ""; + sb.setConfig("crawlingIPMustMatch", ipMustMatch); + sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch); + if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch); + // special cases: if (crawlingStartURL!= null && fullDomain) { if (crawlingStartURL.isFile()) { @@ -249,7 +257,10 @@ public class Crawler_p { crawlingStart, crawlingStartURL, newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, + newcrawlingMustNotMatch, + ipMustMatch, + ipMustNotMatch, + countryMustMatch, newcrawlingdepth, crawlingIfOlder, crawlingDomMaxPages, @@ -306,6 +317,9 @@ public class Crawler_p { crawlingStartURL, newcrawlingMustMatch, newcrawlingMustNotMatch, + ipMustMatch, + ipMustNotMatch, + countryMustMatch, newcrawlingdepth, crawlingIfOlder, crawlingDomMaxPages, @@ -426,6 +440,9 @@ public class Crawler_p { crawlURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, + ipMustMatch, + ipMustNotMatch, + countryMustMatch, newcrawlingdepth, crawlingIfOlder, crawlingDomMaxPages, @@ -463,6 +480,9 @@ public class Crawler_p { sitemapURL, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + ipMustMatch, + ipMustNotMatch, + countryMustMatch, 0, crawlingIfOlder, crawlingDomMaxPages, @@ -504,6 +524,9 @@ public class Crawler_p { sitelistURL, newcrawlingMustMatch, CrawlProfile.MATCH_NEVER, + ipMustMatch, + ipMustNotMatch, + countryMustMatch, newcrawlingdepth, crawlingIfOlder, crawlingDomMaxPages, diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index d1ca69db6..a61d07de2 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -149,6 +149,9 @@ public class QuickCrawlLink_p { crawlingStartURL.getHost(), crawlingStartURL, crawlingMustMatch, + CrawlProfile.MATCH_ALL, + CrawlProfile.MATCH_NEVER, + "", crawlingMustNotMatch, CrawlingDepth, 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index b574183e6..4705fa7c2 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ 
b/source/de/anomic/crawler/CrawlProfile.java @@ -48,8 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String HANDLE = "handle"; public static final String NAME = "name"; public static final String START_URL = "startURL"; - public static final String FILTER_MUSTMATCH = "generalFilter"; - public static final String FILTER_MUSTNOTMATCH = "nevermatch"; public static final String DEPTH = "generalDepth"; public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; public static final String DOM_MAX_PAGES = "domMaxPages"; @@ -63,6 +61,11 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String XDSTOPW = "xdstopw"; public static final String XPSTOPW = "xpstopw"; public static final String CACHE_STRAGEGY = "cacheStrategy"; + public static final String FILTER_URL_MUSTMATCH = "generalFilter"; // for URLs + public static final String FILTER_URL_MUSTNOTMATCH = "nevermatch"; // for URLs + public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch"; + public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch"; + public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch"; private Pattern mustmatch = null, mustnotmatch = null; @@ -70,8 +73,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M * Constructor which creates CrawlPofile from parameters. * @param name name of the crawl profile * @param startURL root URL of the crawl - * @param mustmatch URLs which do not match this regex will be ignored - * @param mustnotmatch URLs which match this regex will be ignored + * @param urlMustMatch URLs which do not match this regex will be ignored + * @param urlMustNotMatch URLs which match this regex will be ignored * @param depth height of the tree which will be created by the crawler * @param recrawlIfOlder documents which have been indexed in the past will * be indexed again if they are older than the time (ms) in this parameter @@ -89,8 +92,11 @@ public class CrawlProfile extends ConcurrentHashMap implements M public CrawlProfile( final String name, final DigestURI startURL, - final String mustmatch, - final String mustnotmatch, + final String urlMustMatch, + final String urlMustNotMatch, + final String ipMustMatch, + final String ipMustNotMatch, + final String countryMustMatch, final int depth, final long recrawlIfOlder /*date*/, final int domMaxPages, @@ -107,14 +113,17 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (name == null || name.isEmpty()) { throw new NullPointerException("name must not be null or empty"); } - final String handle = (startURL == null) + final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) : ASCII.String(startURL.hash()); put(HANDLE, handle); put(NAME, name); put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); - put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch); - put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch); + put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch); + put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch); + put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch); + put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? 
CrawlProfile.MATCH_NEVER : ipMustNotMatch); + put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch); put(DEPTH, depth); put(RECRAWL_IF_OLDER, recrawlIfOlder); put(DOM_MAX_PAGES, domMaxPages); @@ -137,7 +146,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M super(ext == null ? 1 : ext.size()); if (ext != null) putAll(ext); } - + /** * Adds a parameter to CrawlProfile. * @param key name of the parameter @@ -174,7 +183,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M //if (r == null) return null; return r; } - + /** * Gets the name of the CrawlProfile. * @return name of the profile @@ -184,7 +193,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return ""; return r; } - + /** * Gets the root URL of the crawl job. * @return root URL @@ -193,35 +202,35 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String r = get(START_URL); return r; } - + /** * Gets the regex which must be matched by URLs in order to be crawled. * @return regex which must be matched */ public Pattern mustMatchPattern() { if (this.mustmatch == null) { - String r = get(FILTER_MUSTMATCH); + String r = get(FILTER_URL_MUSTMATCH); if (r == null) r = CrawlProfile.MATCH_ALL; this.mustmatch = Pattern.compile(r); } return this.mustmatch; } - + /** * Gets the regex which must not be matched by URLs in order to be crawled. * @return regex which must not be matched */ public Pattern mustNotMatchPattern() { if (this.mustnotmatch == null) { - String r = get(FILTER_MUSTNOTMATCH); + String r = get(FILTER_URL_MUSTNOTMATCH); if (r == null) r = CrawlProfile.MATCH_NEVER; this.mustnotmatch = Pattern.compile(r); } return this.mustnotmatch; } - + /** - * Gets depth of crawl job (or height of the tree which will be + * Gets depth of crawl job (or height of the tree which will be * created by the crawler). * @return depth of crawl job */ @@ -235,7 +244,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M return 0; } } - + public CacheStrategy cacheStrategy() { final String r = get(CACHE_STRAGEGY); if (r == null) return CacheStrategy.IFEXIST; @@ -246,11 +255,11 @@ public class CrawlProfile extends ConcurrentHashMap implements M return CacheStrategy.IFEXIST; } } - + public void setCacheStrategy(final CacheStrategy newStrategy) { put(CACHE_STRAGEGY, newStrategy.toString()); } - + /** * Gets the minimum age that an entry must have to be re-crawled. 
* @return time in ms @@ -268,7 +277,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M return 0L; } } - + public int domMaxPages() { // this is the maximum number of pages that are crawled for a single domain // if -1, this means no limit @@ -283,31 +292,31 @@ public class CrawlProfile extends ConcurrentHashMap implements M return Integer.MAX_VALUE; } } - + public boolean crawlingQ() { final String r = get(CRAWLING_Q); if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + public boolean pushSolr() { final String r = get(PUSH_SOLR); if (r == null) return true; return (r.equals(Boolean.TRUE.toString())); } - + public boolean indexText() { final String r = get(INDEX_TEXT); if (r == null) return true; return (r.equals(Boolean.TRUE.toString())); } - + public boolean indexMedia() { final String r = get(INDEX_MEDIA); if (r == null) return true; return (r.equals(Boolean.TRUE.toString())); } - + public boolean storeHTCache() { final String r = get(STORE_HTCACHE); if (r == null) return false; @@ -318,19 +327,19 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + public boolean excludeStaticStopwords() { final String r = get(XSSTOPW); if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + public boolean excludeDynamicStopwords() { final String r = get(XDSTOPW); if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } - + public boolean excludeParentStopwords() { final String r = get(XPSTOPW); if (r == null) return false; diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index a2f4d2ab7..fe2013edf 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -63,7 +63,8 @@ public final class CrawlSwitchboard { public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L; private final Log log; - private Map> profilesActiveCrawls, profilesPassiveCrawls, profilesInvalidCrawls; + private Map> profilesActiveCrawls; + private final Map> profilesPassiveCrawls, profilesInvalidCrawls; public CrawlProfile defaultProxyProfile; public CrawlProfile defaultRemoteProfile; public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; @@ -91,28 +92,28 @@ public final class CrawlSwitchboard { final File profilesInvalidFile = new File(queuesRoot, DBFILE_INVALID_CRAWL_PROFILES); this.profilesInvalidCrawls = loadFromDB(profilesInvalidFile); - + final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); this.profilesActiveCrawls = loadFromDB(profilesActiveFile); for (final byte[] handle : this.profilesActiveCrawls.keySet()) { final CrawlProfile p; p = new CrawlProfile(this.profilesActiveCrawls.get(handle)); - if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTMATCH))) { - this.removeActive(handle); - this.putInvalid(handle, p); + if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) { + removeActive(handle); + putInvalid(handle, p); Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name() - + " from active crawls since " + CrawlProfile.FILTER_MUSTMATCH - + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTMATCH)); - } else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTNOTMATCH))) { - this.putInvalid(handle, p); - this.removeActive(handle); + + " from active crawls since " + 
CrawlProfile.FILTER_URL_MUSTMATCH + + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH)); + } else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) { + putInvalid(handle, p); + removeActive(handle); Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name() - + " from active crawls since " + CrawlProfile.FILTER_MUSTNOTMATCH - + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTNOTMATCH)); + + " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH + + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH)); } else { Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name()); } - + } initActiveCrawlProfiles(); log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries"); @@ -134,7 +135,7 @@ public final class CrawlSwitchboard { if (m == null) return null; return new CrawlProfile(m); } - + public CrawlProfile getInvalid(final byte[] profileKey) { if (profileKey == null) return null; final Map m = this.profilesInvalidCrawls.get(profileKey); @@ -152,7 +153,7 @@ public final class CrawlSwitchboard { public Set getActive() { return this.profilesActiveCrawls.keySet(); } - + public Set getInvalid() { return this.profilesInvalidCrawls.keySet(); } @@ -165,7 +166,7 @@ public final class CrawlSwitchboard { if (profileKey == null) return; this.profilesActiveCrawls.remove(profileKey); } - + public void removeInvalid(final byte[] profileKey) { if (profileKey == null) return; this.profilesInvalidCrawls.remove(profileKey); @@ -179,7 +180,7 @@ public final class CrawlSwitchboard { public void putActive(final byte[] profileKey, final CrawlProfile profile) { this.profilesActiveCrawls.put(profileKey, profile); } - + public void putInvalid(final byte[] profileKey, final CrawlProfile profile) { this.profilesInvalidCrawls.put(profileKey, profile); } @@ -227,7 +228,10 @@ public final class CrawlSwitchboard { if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling this.defaultProxyProfile = new CrawlProfile( - "proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + "proxy", null, + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + "", 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, @@ -239,38 +243,38 @@ public final class CrawlSwitchboard { } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", CrawlProfile.MATCH_NEVER, 0, -1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.defaultTextSnippetLocalProfile = new 
CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing - this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE); this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); } @@ -324,8 +328,8 @@ public 
final class CrawlSwitchboard { ((MapHeap) this.profilesInvalidCrawls).close(); ((MapHeap) this.profilesPassiveCrawls).close(); } - - + + /** * Loads crawl profiles from a DB file. * @param file DB file
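CrawlProfile keeps its settings in the ConcurrentHashMap it extends, and the existing mustMatchPattern()/mustNotMatchPattern() accessors compile their regex lazily and cache the result. The patch stores the three new filter keys the same way but does not show matching accessors for them; the following sketch outlines, under that assumption, what a lazily compiled accessor for FILTER_IP_MUSTMATCH might look like. The class name and ipMustMatchPattern() are hypothetical, not part of the patch.

import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

// Hypothetical sketch modeled on CrawlProfile's existing accessors: settings live in the
// map, and the compiled Pattern is cached after the first call.
public class ProfileSketch extends ConcurrentHashMap<String, String> {

    public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch";
    public static final String MATCH_ALL = ".*";

    private Pattern ipMustMatch = null;

    // assumed accessor, mirroring mustMatchPattern() for the URL filter
    public Pattern ipMustMatchPattern() {
        if (this.ipMustMatch == null) {
            String r = get(FILTER_IP_MUSTMATCH);
            if (r == null) r = MATCH_ALL;
            this.ipMustMatch = Pattern.compile(r);
        }
        return this.ipMustMatch;
    }

    public static void main(final String[] args) {
        final ProfileSketch p = new ProfileSketch();
        p.put(FILTER_IP_MUSTMATCH, "192\\.168\\..*");
        System.out.println(p.ipMustMatchPattern().matcher("192.168.1.10").matches()); // true
    }
}
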
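Stored as a pair, the IP filters presumably combine the same way the URL filters do: a resolved host IP passes when it matches the must-match pattern and does not match the must-not-match pattern. The patch does not show that evaluation step, so the sketch below is only an illustration of how such a pair composes, using plain java.util.regex and invented example values.

import java.util.regex.Pattern;

// Hypothetical illustration (not YaCy code): combine a must-match and a must-not-match
// pattern the way the URL filters are combined, but applied to a resolved host IP.
public final class IpFilterSketch {

    private final Pattern mustMatch;
    private final Pattern mustNotMatch;

    IpFilterSketch(final String mustMatchRegex, final String mustNotMatchRegex) {
        this.mustMatch = Pattern.compile(mustMatchRegex);
        this.mustNotMatch = Pattern.compile(mustNotMatchRegex);
    }

    boolean accepts(final String ip) {
        // accepted only if the must-match filter hits and the must-not-match filter does not
        return this.mustMatch.matcher(ip).matches() && !this.mustNotMatch.matcher(ip).matches();
    }

    public static void main(final String[] args) {
        // restrict the crawl to 192.168.*.* addresses, except the *.1 gateway addresses
        final IpFilterSketch filter = new IpFilterSketch("192\\.168\\..*", "192\\.168\\.\\d+\\.1");
        System.out.println(filter.accepts("192.168.5.20")); // true
        System.out.println(filter.accepts("192.168.5.1"));  // false (must-not-match hits)
        System.out.println(filter.accepts("10.0.0.7"));     // false (must-match misses)
    }
}
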
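The country filter described in CrawlStartExpert_p.html is deliberately not a regular expression but a comma-separated list of country codes, compared against the country derived from the IP of the hosting server. How YaCy maps an IP to a country code is outside this patch; the sketch below covers only the list parsing and the membership test, with the empty list standing for the form's "no country code restriction" option. All names are illustrative.

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

// Hypothetical illustration (not YaCy code): parse a crawlingCountryMustMatch value such as
// "DE,FR,NL" into a set and test a resolved country code against it.
public final class CountryFilterSketch {

    private final Set<String> allowed = new HashSet<String>();

    CountryFilterSketch(final String countryMustMatch) {
        if (countryMustMatch != null && !countryMustMatch.isEmpty()) {
            for (final String code : countryMustMatch.split(",")) {
                this.allowed.add(code.trim().toUpperCase(Locale.ROOT));
            }
        }
    }

    boolean accepts(final String countryCode) {
        // an empty list means "no country code restriction"
        return this.allowed.isEmpty() || this.allowed.contains(countryCode.toUpperCase(Locale.ROOT));
    }

    public static void main(final String[] args) {
        final CountryFilterSketch filter = new CountryFilterSketch("DE,FR,NL");
        System.out.println(filter.accepts("DE")); // true
        System.out.println(filter.accepts("US")); // false
        System.out.println(new CountryFilterSketch("").accepts("US")); // true: no restriction
    }
}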