fixes to crawl profiles

pull/1/head
Michael Peter Christen 13 years ago
parent 2f536cb54d
commit 76d218fbef

@ -134,14 +134,14 @@ public class QuickCrawlLink_p {
try { try {
pe = new CrawlProfile( pe = new CrawlProfile(
crawlingStartURL.toNormalform(true, false), crawlingStartURL.toNormalform(true, false),
crawlingMustMatch, crawlingMustMatch, //crawlerUrlMustMatch
CrawlProfile.MATCH_ALL_STRING, crawlingMustNotMatch, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_ALL_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
"", CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
crawlingMustNotMatch, CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlingDepth, CrawlingDepth,
true, true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month

@ -217,65 +217,18 @@ public final class CrawlSwitchboard
} }
private void initActiveCrawlProfiles() { private void initActiveCrawlProfiles() {
this.defaultProxyProfile = null;
this.defaultRemoteProfile = null;
this.defaultTextSnippetLocalProfile = null;
this.defaultTextSnippetGlobalProfile = null;
this.defaultMediaSnippetLocalProfile = null;
this.defaultMediaSnippetGlobalProfile = null;
this.defaultSurrogateProfile = null;
CrawlProfile profile;
String name;
try {
for ( final byte[] handle : this.profilesActiveCrawls.keySet() ) {
profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
name = profile.name();
if ( name.equals(CRAWL_PROFILE_PROXY) ) {
this.defaultProxyProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_REMOTE) ) {
this.defaultRemoteProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ) {
this.defaultTextSnippetLocalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ) {
this.defaultTextSnippetGlobalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
this.defaultMediaSnippetLocalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ) {
this.defaultMediaSnippetGlobalProfile = profile;
}
if ( name.equals(CRAWL_PROFILE_SURROGATE) ) {
this.defaultSurrogateProfile = profile;
}
}
} catch ( final Exception e ) {
this.profilesActiveCrawls.clear();
this.defaultProxyProfile = null;
this.defaultRemoteProfile = null;
this.defaultTextSnippetLocalProfile = null;
this.defaultTextSnippetGlobalProfile = null;
this.defaultMediaSnippetLocalProfile = null;
this.defaultMediaSnippetGlobalProfile = null;
this.defaultSurrogateProfile = null;
}
if ( this.defaultProxyProfile == null ) {
// generate new default entry for proxy crawling // generate new default entry for proxy crawling
this.defaultProxyProfile = this.defaultProxyProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_PROXY, CRAWL_PROFILE_PROXY,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
"", CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@ -293,20 +246,18 @@ public final class CrawlSwitchboard
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()), UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile); this.defaultProxyProfile);
}
if ( this.defaultRemoteProfile == null ) {
// generate new default entry for remote crawling // generate new default entry for remote crawling
this.defaultRemoteProfile = this.defaultRemoteProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_REMOTE, CRAWL_PROFILE_REMOTE,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_ALL_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
"", CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0, 0,
false, false,
-1, -1,
@ -324,20 +275,18 @@ public final class CrawlSwitchboard
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()), UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile); this.defaultRemoteProfile);
}
if ( this.defaultTextSnippetLocalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = this.defaultTextSnippetLocalProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
"", CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@ -355,20 +304,18 @@ public final class CrawlSwitchboard
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile); this.defaultTextSnippetLocalProfile);
}
if ( this.defaultTextSnippetGlobalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = this.defaultTextSnippetGlobalProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
"", CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@ -386,21 +333,19 @@ public final class CrawlSwitchboard
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile); this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if ( this.defaultMediaSnippetLocalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = this.defaultMediaSnippetLocalProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
"", CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@ -418,20 +363,18 @@ public final class CrawlSwitchboard
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile); this.defaultMediaSnippetLocalProfile);
}
if ( this.defaultMediaSnippetGlobalProfile == null ) {
// generate new default entry for snippet fetch and optional crawling // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = this.defaultMediaSnippetGlobalProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
"", CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@ -449,20 +392,18 @@ public final class CrawlSwitchboard
this.profilesActiveCrawls.put( this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile); this.defaultMediaSnippetGlobalProfile);
}
if ( this.defaultSurrogateProfile == null ) {
// generate new default entry for surrogate parsing // generate new default entry for surrogate parsing
this.defaultSurrogateProfile = this.defaultSurrogateProfile =
new CrawlProfile( new CrawlProfile(
CRAWL_PROFILE_SURROGATE, CRAWL_PROFILE_SURROGATE,
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
CrawlProfile.MATCH_NEVER_STRING, "", //crawlerCountryMustMatch
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
"", CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@ -481,7 +422,6 @@ public final class CrawlSwitchboard
UTF8.getBytes(this.defaultSurrogateProfile.handle()), UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile); this.defaultSurrogateProfile);
} }
}
private void resetProfiles() { private void resetProfiles() {
this.profilesActiveCrawlsCache.clear(); this.profilesActiveCrawlsCache.clear();

@ -164,10 +164,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CRAWLER_URL_MUSTNOTMATCH, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch); put(CRAWLER_URL_MUSTNOTMATCH, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CRAWLER_IP_MUSTMATCH, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch); put(CRAWLER_IP_MUSTMATCH, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
put(CRAWLER_IP_MUSTNOTMATCH, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch); put(CRAWLER_IP_MUSTNOTMATCH, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
put(CRAWLER_COUNTRY_MUSTMATCH, (crawlerCountryMustMatch == null) ? "" : crawlerCountryMustMatch); put(CRAWLER_COUNTRY_MUSTMATCH, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? "" : crawlerNoDepthLimitMatch); put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? "" : indexUrlMustMatch); put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? "" : indexUrlMustNotMatch); put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
put(DEPTH, depth); put(DEPTH, depth);
put(DIRECT_DOC_BY_URL, directDocByURL); put(DIRECT_DOC_BY_URL, directDocByURL);
put(RECRAWL_IF_OLDER, recrawlIfOlder); put(RECRAWL_IF_OLDER, recrawlIfOlder);
@ -351,7 +351,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
*/ */
public String[] countryMustMatchList() { public String[] countryMustMatchList() {
String countryMustMatch = get(CRAWLER_COUNTRY_MUSTMATCH); String countryMustMatch = get(CRAWLER_COUNTRY_MUSTMATCH);
if (countryMustMatch == null) countryMustMatch = ""; if (countryMustMatch == null) countryMustMatch = CrawlProfile.MATCH_NEVER_STRING;
if (countryMustMatch.isEmpty()) return new String[0]; if (countryMustMatch.isEmpty()) return new String[0];
String[] list = countryMustMatch.split(","); String[] list = countryMustMatch.split(",");
if (list.length == 1 && list.length == 0) list = new String[0]; if (list.length == 1 && list.length == 0) list = new String[0];

@ -2461,7 +2461,7 @@ public final class Switchboard extends serverSwitch
} }
if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() || if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) { profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url"); if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
return new IndexingQueueEntry(in.queueEntry, in.documents, null); return new IndexingQueueEntry(in.queueEntry, in.documents, null);
} }

Loading…
Cancel
Save