|
|
@ -217,270 +217,210 @@ public final class CrawlSwitchboard
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private void initActiveCrawlProfiles() {
|
|
|
|
private void initActiveCrawlProfiles() {
|
|
|
|
this.defaultProxyProfile = null;
|
|
|
|
// generate new default entry for proxy crawling
|
|
|
|
this.defaultRemoteProfile = null;
|
|
|
|
this.defaultProxyProfile =
|
|
|
|
this.defaultTextSnippetLocalProfile = null;
|
|
|
|
new CrawlProfile(
|
|
|
|
this.defaultTextSnippetGlobalProfile = null;
|
|
|
|
CRAWL_PROFILE_PROXY,
|
|
|
|
this.defaultMediaSnippetLocalProfile = null;
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
|
|
|
this.defaultMediaSnippetGlobalProfile = null;
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
|
|
|
this.defaultSurrogateProfile = null;
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
|
|
|
CrawlProfile profile;
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
|
|
|
String name;
|
|
|
|
"", //crawlerCountryMustMatch
|
|
|
|
try {
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
|
|
|
for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
|
|
|
profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
|
|
|
name = profile.name();
|
|
|
|
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
|
|
|
|
if ( name.equals(CRAWL_PROFILE_PROXY) ) {
|
|
|
|
true,
|
|
|
|
this.defaultProxyProfile = profile;
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
|
|
|
|
}
|
|
|
|
-1,
|
|
|
|
if ( name.equals(CRAWL_PROFILE_REMOTE) ) {
|
|
|
|
false,
|
|
|
|
this.defaultRemoteProfile = profile;
|
|
|
|
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
|
|
|
|
}
|
|
|
|
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
|
|
|
|
if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
|
|
|
|
true,
|
|
|
|
this.defaultTextSnippetLocalProfile = profile;
|
|
|
|
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
|
|
|
|
}
|
|
|
|
true,
|
|
|
|
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
|
|
|
|
true,
|
|
|
|
this.defaultTextSnippetGlobalProfile = profile;
|
|
|
|
true,
|
|
|
|
}
|
|
|
|
CacheStrategy.IFFRESH,
|
|
|
|
if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
|
|
|
|
"robot_" + CRAWL_PROFILE_PROXY);
|
|
|
|
this.defaultMediaSnippetLocalProfile = profile;
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
}
|
|
|
|
UTF8.getBytes(this.defaultProxyProfile.handle()),
|
|
|
|
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
|
|
|
|
this.defaultProxyProfile);
|
|
|
|
this.defaultMediaSnippetGlobalProfile = profile;
|
|
|
|
// generate new default entry for remote crawling
|
|
|
|
}
|
|
|
|
this.defaultRemoteProfile =
|
|
|
|
if ( name.equals(CRAWL_PROFILE_SURROGATE) ) {
|
|
|
|
new CrawlProfile(
|
|
|
|
this.defaultSurrogateProfile = profile;
|
|
|
|
CRAWL_PROFILE_REMOTE,
|
|
|
|
}
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
|
|
|
}
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
|
|
|
} catch ( final Exception e ) {
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
|
|
|
this.profilesActiveCrawls.clear();
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
|
|
|
this.defaultProxyProfile = null;
|
|
|
|
"", //crawlerCountryMustMatch
|
|
|
|
this.defaultRemoteProfile = null;
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
|
|
|
this.defaultTextSnippetLocalProfile = null;
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
|
|
|
this.defaultTextSnippetGlobalProfile = null;
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
|
|
|
this.defaultMediaSnippetLocalProfile = null;
|
|
|
|
0,
|
|
|
|
this.defaultMediaSnippetGlobalProfile = null;
|
|
|
|
false,
|
|
|
|
this.defaultSurrogateProfile = null;
|
|
|
|
-1,
|
|
|
|
}
|
|
|
|
-1,
|
|
|
|
|
|
|
|
true,
|
|
|
|
if ( this.defaultProxyProfile == null ) {
|
|
|
|
true,
|
|
|
|
// generate new default entry for proxy crawling
|
|
|
|
true,
|
|
|
|
this.defaultProxyProfile =
|
|
|
|
false,
|
|
|
|
new CrawlProfile(
|
|
|
|
false,
|
|
|
|
CRAWL_PROFILE_PROXY,
|
|
|
|
true,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
true,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
false,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CacheStrategy.IFFRESH,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
"robot_" + CRAWL_PROFILE_REMOTE);
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
UTF8.getBytes(this.defaultRemoteProfile.handle()),
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
this.defaultRemoteProfile);
|
|
|
|
"",
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
|
|
|
|
this.defaultTextSnippetLocalProfile =
|
|
|
|
true,
|
|
|
|
new CrawlProfile(
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
|
|
|
|
CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
|
|
|
|
-1,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
|
|
|
false,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
|
|
|
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
|
|
|
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
|
|
|
true,
|
|
|
|
"", //crawlerCountryMustMatch
|
|
|
|
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
|
|
|
true,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
|
|
|
true,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
|
|
|
true,
|
|
|
|
0,
|
|
|
|
CacheStrategy.IFFRESH,
|
|
|
|
false,
|
|
|
|
"robot_" + CRAWL_PROFILE_PROXY);
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
-1,
|
|
|
|
UTF8.getBytes(this.defaultProxyProfile.handle()),
|
|
|
|
true,
|
|
|
|
this.defaultProxyProfile);
|
|
|
|
false,
|
|
|
|
}
|
|
|
|
false,
|
|
|
|
if ( this.defaultRemoteProfile == null ) {
|
|
|
|
true,
|
|
|
|
// generate new default entry for remote crawling
|
|
|
|
false,
|
|
|
|
this.defaultRemoteProfile =
|
|
|
|
true,
|
|
|
|
new CrawlProfile(
|
|
|
|
true,
|
|
|
|
CRAWL_PROFILE_REMOTE,
|
|
|
|
false,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
this.defaultTextSnippetLocalProfile);
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
"",
|
|
|
|
this.defaultTextSnippetGlobalProfile =
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
new CrawlProfile(
|
|
|
|
0,
|
|
|
|
CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
|
|
|
|
false,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
|
|
|
-1,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
|
|
|
-1,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
|
|
|
true,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
|
|
|
true,
|
|
|
|
"", //crawlerCountryMustMatch
|
|
|
|
true,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
|
|
|
false,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
|
|
|
false,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
|
|
|
true,
|
|
|
|
0,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
|
|
|
|
CacheStrategy.IFFRESH,
|
|
|
|
-1,
|
|
|
|
"robot_" + CRAWL_PROFILE_REMOTE);
|
|
|
|
true,
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
true,
|
|
|
|
UTF8.getBytes(this.defaultRemoteProfile.handle()),
|
|
|
|
true,
|
|
|
|
this.defaultRemoteProfile);
|
|
|
|
true,
|
|
|
|
}
|
|
|
|
false,
|
|
|
|
if ( this.defaultTextSnippetLocalProfile == null ) {
|
|
|
|
true,
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
true,
|
|
|
|
this.defaultTextSnippetLocalProfile =
|
|
|
|
false,
|
|
|
|
new CrawlProfile(
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
this.defaultTextSnippetGlobalProfile);
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
|
|
|
|
"",
|
|
|
|
|
|
|
|
0,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
|
|
|
|
|
|
|
|
-1,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
|
|
|
|
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
|
|
|
|
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
|
|
|
|
|
|
|
|
this.defaultTextSnippetLocalProfile);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( this.defaultTextSnippetGlobalProfile == null ) {
|
|
|
|
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
|
|
|
|
this.defaultTextSnippetGlobalProfile =
|
|
|
|
|
|
|
|
new CrawlProfile(
|
|
|
|
|
|
|
|
CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
|
|
|
|
"",
|
|
|
|
|
|
|
|
0,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
|
|
|
|
|
|
|
|
-1,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
true,
|
|
|
|
|
|
|
|
false,
|
|
|
|
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
|
|
|
|
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
|
|
|
|
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
|
|
|
|
|
|
|
|
this.defaultTextSnippetGlobalProfile);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
|
|
|
|
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
|
|
|
|
if ( this.defaultMediaSnippetLocalProfile == null ) {
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
this.defaultMediaSnippetLocalProfile =
|
|
|
|
this.defaultMediaSnippetLocalProfile =
|
|
|
|
new CrawlProfile(
|
|
|
|
new CrawlProfile(
|
|
|
|
CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
|
|
|
|
CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
"", //crawlerCountryMustMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
|
|
|
"",
|
|
|
|
0,
|
|
|
|
0,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
|
|
|
|
-1,
|
|
|
|
-1,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
|
|
|
|
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
|
|
|
|
this.defaultMediaSnippetLocalProfile);
|
|
|
|
this.defaultMediaSnippetLocalProfile);
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
}
|
|
|
|
this.defaultMediaSnippetGlobalProfile =
|
|
|
|
if ( this.defaultMediaSnippetGlobalProfile == null ) {
|
|
|
|
new CrawlProfile(
|
|
|
|
// generate new default entry for snippet fetch and optional crawling
|
|
|
|
CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
|
|
|
|
this.defaultMediaSnippetGlobalProfile =
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
|
|
|
new CrawlProfile(
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
|
|
|
CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
"", //crawlerCountryMustMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
0,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
false,
|
|
|
|
"",
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
|
|
|
|
0,
|
|
|
|
-1,
|
|
|
|
false,
|
|
|
|
true,
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
|
|
|
|
false,
|
|
|
|
-1,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
true,
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
true,
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
|
|
|
|
false,
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
CacheStrategy.IFEXIST,
|
|
|
|
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
|
|
|
|
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
|
|
|
|
this.defaultMediaSnippetGlobalProfile);
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
// generate new default entry for surrogate parsing
|
|
|
|
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
|
|
|
|
this.defaultSurrogateProfile =
|
|
|
|
this.defaultMediaSnippetGlobalProfile);
|
|
|
|
new CrawlProfile(
|
|
|
|
}
|
|
|
|
CRAWL_PROFILE_SURROGATE,
|
|
|
|
if ( this.defaultSurrogateProfile == null ) {
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
|
|
|
|
// generate new default entry for surrogate parsing
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
|
|
|
|
this.defaultSurrogateProfile =
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
|
|
|
|
new CrawlProfile(
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
|
|
|
|
CRAWL_PROFILE_SURROGATE,
|
|
|
|
"", //crawlerCountryMustMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
0,
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
false,
|
|
|
|
CrawlProfile.MATCH_ALL_STRING,
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
|
|
|
|
CrawlProfile.MATCH_NEVER_STRING,
|
|
|
|
-1,
|
|
|
|
"",
|
|
|
|
true,
|
|
|
|
0,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
|
|
|
|
false,
|
|
|
|
-1,
|
|
|
|
false,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
CacheStrategy.NOCACHE,
|
|
|
|
false,
|
|
|
|
"robot_" + CRAWL_PROFILE_SURROGATE);
|
|
|
|
true,
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
true,
|
|
|
|
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
|
|
|
|
false,
|
|
|
|
this.defaultSurrogateProfile);
|
|
|
|
CacheStrategy.NOCACHE,
|
|
|
|
|
|
|
|
"robot_" + CRAWL_PROFILE_SURROGATE);
|
|
|
|
|
|
|
|
this.profilesActiveCrawls.put(
|
|
|
|
|
|
|
|
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
|
|
|
|
|
|
|
|
this.defaultSurrogateProfile);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private void resetProfiles() {
|
|
|
|
private void resetProfiles() {
|
|
|
|