added crawl settings for three new filters for each crawl:

must-match for IPs (the IPs are known after DNS resolution of each URL in the crawl queue)
must-not-match for IPs
must-match against a list of country codes (allows loading only from hosts that are hosted in the given countries)

note: the settings and input environment are included with this commit, but the values are not yet evaluated

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7976 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 47a8c69745
commit 5ad7f9612b
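
The note above says the new values are not yet evaluated in this commit. As a rough, hypothetical sketch of how the IP filters could later be applied once the crawler wires them in (all class and method names below are invented for illustration and are not YaCy API):

// Hypothetical evaluation sketch, not part of this commit; names are invented.
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.regex.Pattern;

public class IPFilterSketch {
    private final Pattern ipMustMatch;    // compiled from crawlingIPMustMatch
    private final Pattern ipMustNotMatch; // compiled from crawlingIPMustNotMatch

    public IPFilterSketch(final String mustMatch, final String mustNotMatch) {
        this.ipMustMatch = Pattern.compile(mustMatch);
        this.ipMustNotMatch = Pattern.compile(mustNotMatch);
    }

    // resolve the host of a queued URL and test its IP against both filters
    public boolean ipAllowed(final String host) {
        try {
            final String ip = InetAddress.getByName(host).getHostAddress();
            return this.ipMustMatch.matcher(ip).matches()
                && !this.ipMustNotMatch.matcher(ip).matches();
        } catch (final UnknownHostException e) {
            return false; // an unresolvable host cannot pass an IP filter
        }
    }
}

With the defaults introduced below (must-match ".*", must-not-match empty), such a check would leave the crawl unrestricted.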

@ -557,6 +557,12 @@ xpstopw=true
# Change to false if requesting hits from peers with modified stopwords-file and using the unchanged client-version
filterOutStopwordsFromTopwords=true
# crawling steering: must-match/must-not-match
crawlingIPMustMatch=.*
crawlingIPMustNotMatch=
# the default country codes are all codes for countries in Europe
crawlingCountryMustMatch=AD,AL,AT,BA,BE,BG,BY,CH,CY,CZ,DE,DK,EE,ES,FI,FO,FR,GG,GI,GR,HR,HU,IE,IM,IS,IT,JE,LI,LT,LU,LV,MC,MD,MK,MT,NL,NO,PL,PT,RO,RU,SE,SI,SJ,SK,SM,TR,UA,UK,VA,YU
# performance-settings
# delay-times for permanent loops (milliseconds)
# the idlesleep is the pause that a process sleeps if the last call to the
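
For illustration only, hypothetical override values for the new keys (not the shipped defaults above) could exclude hosts resolving into 10.0.0.0/8 and allow only servers located in Germany, Austria, or Switzerland:

# hypothetical example values, not the shipped defaults
crawlingIPMustMatch=.*
crawlingIPMustNotMatch=10\..*
crawlingCountryMustMatch=DE,AT,CH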

@ -86,8 +86,8 @@ public class CrawlProfileEditor_p {
static {
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@ -159,8 +159,8 @@ public class CrawlProfileEditor_p {
if ((post != null) && (selentry != null)) {
if (post.containsKey("submit")) {
try {
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
final Iterator<eentry> lit = labels.iterator();
eentry tee;
while (lit.hasNext()) {
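
The Pattern.compile calls in the hunk above validate the submitted filter strings before they are stored. A minimal standalone sketch of that validation idea (hypothetical helper, not YaCy code), assuming invalid input should fall back to a safe default:

import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

final class RegexValidationSketch {
    // return the user-supplied regex if it compiles, otherwise a safe default
    static String validRegexOrDefault(final String userRegex, final String fallback) {
        try {
            Pattern.compile(userRegex);
            return userRegex;
        } catch (final PatternSyntaxException e) {
            return fallback; // e.g. ".*" for must-match, "" for must-not-match
        }
    }
}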

@ -136,7 +136,7 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter for URLs</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
@ -151,7 +151,7 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
@ -162,6 +162,37 @@
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustmatch" id="ipMustmatch" type="text" size="60" maxlength="100" value="#[ipMustmatch]#" />
</td>
<td>
Like the Must-Match Filter for URLs, this filter must match, but it is applied to the IP of the host.
YaCy performs a DNS lookup for each host, and this filter restricts the crawl to specific IPs.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="ipMustnotmatch">Must-Not-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="60" maxlength="100" value="#[ipMustnotmatch]#" />
</td>
<td>
This filter must not match on the IP of the crawled host.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input name="crawlingCountryMustMatch" id="crawlingCountryMustMatch" type="text" size="60" maxlength="100" value="#[crawlingCountryMustMatch]#" />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
</td>
<td>
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expression but a comma-separated list of country codes.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Maximum Pages per Domain:</td>
<td>

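The help text above stresses that the country filter is a plain comma-separated list rather than a regular expression. A minimal, hypothetical sketch (not YaCy code) of parsing such a list and checking a country code against it:

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

// Hypothetical helper illustrating the comma-separated country-code list:
// parse the list once, then test the code derived from the host's IP against it.
final class CountryListSketch {
    private final Set<String> codes = new HashSet<String>();

    CountryListSketch(final String commaSeparated) {
        for (final String cc : commaSeparated.split(",")) {
            final String t = cc.trim().toUpperCase(Locale.ROOT);
            if (!t.isEmpty()) this.codes.add(t);
        }
    }

    // an empty list means "no restriction", matching the default in this commit
    boolean allows(final String countryCode) {
        return this.codes.isEmpty() || this.codes.contains(countryCode.toUpperCase(Locale.ROOT));
    }
}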
@ -25,6 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import de.anomic.crawler.CrawlProfile;
import de.anomic.server.serverObjects;
@ -34,7 +35,7 @@ public class CrawlStartExpert_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
//final Switchboard sb = (Switchboard) env;
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
// define visible variables
@ -43,6 +44,9 @@ public class CrawlStartExpert_p {
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER));
prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");

@ -156,6 +156,14 @@ public class Crawler_p {
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER);
if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL;
final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("countryMustMatchList", "") : "";
sb.setConfig("crawlingIPMustMatch", ipMustMatch);
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
// special cases:
if (crawlingStartURL!= null && fullDomain) {
if (crawlingStartURL.isFile()) {
@ -249,7 +257,10 @@ public class Crawler_p {
crawlingStart,
crawlingStartURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -306,6 +317,9 @@ public class Crawler_p {
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -426,6 +440,9 @@ public class Crawler_p {
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -463,6 +480,9 @@ public class Crawler_p {
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
0,
crawlingIfOlder,
crawlingDomMaxPages,
@ -504,6 +524,9 @@ public class Crawler_p {
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,

@ -149,6 +149,9 @@ public class QuickCrawlLink_p {
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingMustMatch,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
"",
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month

@ -48,8 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
@ -63,6 +61,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
public static final String FILTER_URL_MUSTMATCH = "generalFilter"; // for URLs
public static final String FILTER_URL_MUSTNOTMATCH = "nevermatch"; // for URLs
public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch";
public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch";
public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch";
private Pattern mustmatch = null, mustnotmatch = null;
@ -70,8 +73,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* Constructor which creates a CrawlProfile from parameters.
* @param name name of the crawl profile
* @param startURL root URL of the crawl
* @param mustmatch URLs which do not match this regex will be ignored
* @param mustnotmatch URLs which match this regex will be ignored
* @param urlMustMatch URLs which do not match this regex will be ignored
* @param urlMustNotMatch URLs which match this regex will be ignored
* @param depth height of the tree which will be created by the crawler
* @param recrawlIfOlder documents which have been indexed in the past will
* be indexed again if they are older than the time (ms) in this parameter
@ -89,8 +92,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final String urlMustMatch,
final String urlMustNotMatch,
final String ipMustMatch,
final String ipMustNotMatch,
final String countryMustMatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
@ -113,8 +119,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(HANDLE, handle);
put(NAME, name);
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch);
put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch);
put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch);
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
@ -200,7 +209,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
*/
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = get(FILTER_MUSTMATCH);
String r = get(FILTER_URL_MUSTMATCH);
if (r == null) r = CrawlProfile.MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
@ -213,7 +222,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
*/
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = get(FILTER_MUSTNOTMATCH);
String r = get(FILTER_URL_MUSTNOTMATCH);
if (r == null) r = CrawlProfile.MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
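
This commit stores the new keys but adds no accessors for them. By analogy with mustMatchPattern() and mustNotMatchPattern() shown above, a follow-up could add lazily compiled getters to CrawlProfile; the following is a hypothetical sketch, not part of this commit:

// hypothetical follow-up, mirroring the existing lazily compiled URL patterns
private Pattern ipmustmatch = null, ipmustnotmatch = null;

public Pattern ipMustMatchPattern() {
    if (this.ipmustmatch == null) {
        String r = get(FILTER_IP_MUSTMATCH);
        if (r == null) r = CrawlProfile.MATCH_ALL;
        this.ipmustmatch = Pattern.compile(r);
    }
    return this.ipmustmatch;
}

public Pattern ipMustNotMatchPattern() {
    if (this.ipmustnotmatch == null) {
        String r = get(FILTER_IP_MUSTNOTMATCH);
        if (r == null) r = CrawlProfile.MATCH_NEVER;
        this.ipmustnotmatch = Pattern.compile(r);
    }
    return this.ipmustnotmatch;
}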

@ -63,7 +63,8 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
private Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls, profilesInvalidCrawls;
private Map<byte[], Map<String, String>> profilesActiveCrawls;
private final Map<byte[], Map<String, String>> profilesPassiveCrawls, profilesInvalidCrawls;
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
@ -97,18 +98,18 @@ public final class CrawlSwitchboard {
for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
final CrawlProfile p;
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTMATCH))) {
this.removeActive(handle);
this.putInvalid(handle, p);
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
removeActive(handle);
putInvalid(handle, p);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTNOTMATCH))) {
this.putInvalid(handle, p);
this.removeActive(handle);
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
putInvalid(handle, p);
removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTNOTMATCH));
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
} else {
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
@ -227,7 +228,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile(
"proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"proxy", null,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@ -239,38 +243,38 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", CrawlProfile.MATCH_NEVER, 0,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
}
