added crawl settings for three new filters for each crawl:

must-match for IPs (the IPs are known after DNS resolution of each URL in the crawl queue)
must-not-match for IPs
must-match against a list of country codes (allows loading only from hosts that are hosted in the given countries)

note: the settings and the input environment are included with this commit, but the values are not yet evaluated (a sketch of how they could be evaluated follows below the commit metadata)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7976 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 47a8c69745
commit 5ad7f9612b
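As a minimal sketch of how the two new IP filters could be evaluated once their values are used (this is not part of the commit; the class and method names are assumptions for illustration only), an address would pass only if it matches the must-match regex and does not match the must-not-match regex, mirroring the URL filter semantics:

import java.net.InetAddress;
import java.util.regex.Pattern;

// Hypothetical helper, not part of this commit: applies the two IP filters
// to a host address that has already been resolved via DNS.
public class IPFilterSketch {

    public static boolean ipAllowed(final InetAddress address,
                                    final String ipMustMatch,
                                    final String ipMustNotMatch) {
        final String ip = address.getHostAddress();
        // the IP must match the must-match pattern (default ".*" accepts everything) ...
        if (!Pattern.compile(ipMustMatch).matcher(ip).matches()) return false;
        // ... and must not match the must-not-match pattern (empty by default)
        return !Pattern.compile(ipMustNotMatch).matcher(ip).matches();
    }

    public static void main(final String[] args) throws Exception {
        final InetAddress address = InetAddress.getByName("192.0.2.10");
        System.out.println(ipAllowed(address, "192\\.0\\.2\\..*", "")); // true
        System.out.println(ipAllowed(address, ".*", "192\\.0\\..*"));   // false
    }
}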

@ -557,6 +557,12 @@ xpstopw=true
# Change to false if requesting hits from peers with modified stopwords-file and using the unchanged client-version
filterOutStopwordsFromTopwords=true
# crawling steering: must-match/must-not-match
crawlingIPMustMatch=.*
crawlingIPMustNotMatch=
# the default country codes are all codes for countries in Europe
crawlingCountryMustMatch=AD,AL,AT,BA,BE,BG,BY,CH,CY,CZ,DE,DK,EE,ES,FI,FO,FR,GG,GI,GR,HR,HU,IE,IM,IS,IT,JE,LI,LT,LU,LV,MC,MD,MK,MT,NL,NO,PL,PT,RO,RU,SE,SI,SJ,SK,SM,TR,UA,UK,VA,YU
# performance-settings
# delay-times for permanent loops (milliseconds)
# the idlesleep is the pause that a process sleeps if the last call to the
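For illustration only (a hypothetical configuration, not part of the commit), a peer that keeps the default IP must-match, excludes private address ranges, and restricts crawling to hosts located in Germany, Austria, or Switzerland would set the three new keys like this:

crawlingIPMustMatch=.*
crawlingIPMustNotMatch=(10\..*|127\..*|192\.168\..*)
crawlingCountryMustMatch=DE,AT,CH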

@ -86,8 +86,8 @@ public class CrawlProfileEditor_p {
static {
labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@ -159,8 +159,8 @@ public class CrawlProfileEditor_p {
if ((post != null) && (selentry != null)) {
if (post.containsKey("submit")) {
try {
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL));
Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
final Iterator<eentry> lit = labels.iterator();
eentry tee;
while (lit.hasNext()) {

@ -136,7 +136,7 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="mustmatch">Must-Match Filter</label>:</td>
<td><label for="mustmatch">Must-Match Filter for URLs</label>:</td>
<td>
<input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
<input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
@ -151,7 +151,7 @@
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
<td><label for="mustnotmatch">Must-Not-Match Filter for URLs</label>:</td>
<td>
<input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
</td>
@ -162,6 +162,37 @@
If you don't know what this means, please leave this field empty.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustmatch" id="ipMustmatch" type="text" size="60" maxlength="100" value="#[ipMustmatch]#" />
</td>
<td>
Like the Must-Match Filter for URLs, this filter must match, but it is applied to the IP of the host.
YaCy performs a DNS lookup for each host, and this filter restricts the crawl to specific IPs.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td><label for="ipMustnotmatch">Must-Not-Match Filter for IPs</label>:</td>
<td>
<input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="60" maxlength="100" value="#[ipMustnotmatch]#" />
</td>
<td>
This filter must not match on the IP of the crawled host.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
<td>
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
<input name="crawlingCountryMustMatch" id="crawlingCountryMustMatch" type="text" size="60" maxlength="100" value="#[crawlingCountryMustMatch]#" />
<input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
</td>
<td>
Crawls can be restricted to specific countries. This uses the country code that can be computed from
the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by commas.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Maximum Pages per Domain:</td>
<td>
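A minimal sketch of how the country-code list described above could be checked once the values are evaluated (not part of this commit; the IP-to-country lookup is only assumed to exist and is represented here by a plain parameter):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Hypothetical helper, not part of this commit: checks a host's country code
// (as derived from its IP by some geolocation lookup) against the
// comma-separated crawlingCountryMustMatch list.
public class CountryFilterSketch {

    public static boolean countryAllowed(final String countryMustMatch, final String hostCountryCode) {
        if (countryMustMatch == null || countryMustMatch.isEmpty()) return true; // no restriction configured
        if (hostCountryCode == null) return false; // unknown country: reject while a restriction is active
        final Set<String> allowed = new HashSet<String>(
                Arrays.asList(countryMustMatch.toUpperCase().split("\\s*,\\s*")));
        return allowed.contains(hostCountryCode.toUpperCase());
    }

    public static void main(final String[] args) {
        final String allowedCountries = "DE,AT,CH";
        System.out.println(countryAllowed(allowedCountries, "DE")); // true
        System.out.println(countryAllowed(allowedCountries, "US")); // false
        System.out.println(countryAllowed("", "US"));               // true: empty list means no restriction
    }
}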

@ -9,7 +9,7 @@
// $LastChangedBy: orbiter $
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -25,32 +25,36 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import de.anomic.crawler.CrawlProfile;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class CrawlStartExpert_p {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
//final Switchboard sb = (Switchboard) env;
final Switchboard sb = (Switchboard) env;
final serverObjects prop = new serverObjects();
// define visible variables
prop.put("starturl", /*(intranet) ? repository :*/ "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER));
prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
prop.put("crawlingIfOlderCheck", "0");
prop.put("crawlingIfOlderUnitYearCheck", "0");
prop.put("crawlingIfOlderUnitMonthCheck", "0");
prop.put("crawlingIfOlderUnitDayCheck", "1");
prop.put("crawlingIfOlderUnitHourCheck", "0");
prop.put("crawlingIfOlderNumber", "7");
final int crawlingDomFilterDepth = env.getConfigInt("crawlingDomFilterDepth", -1);
prop.put("crawlingDomFilterCheck", (crawlingDomFilterDepth == -1) ? "0" : "1");
prop.put("crawlingDomFilterDepth", (crawlingDomFilterDepth == -1) ? 1 : crawlingDomFilterDepth);
@ -62,18 +66,18 @@ public class CrawlStartExpert_p {
prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0");
prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0");
prop.put("crawlOrderChecked", env.getConfigBool("crawlOrder", true) ? "1" : "0");
final long LCbusySleep = env.getConfigLong(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, 100L);
final int LCppm = (LCbusySleep == 0) ? 1000 : (int) (60000L / LCbusySleep);
prop.put("crawlingSpeedMaxChecked", (LCppm >= 1000) ? "1" : "0");
prop.put("crawlingSpeedCustChecked", ((LCppm > 10) && (LCppm < 1000)) ? "1" : "0");
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
prop.put("customPPMdefault", ((LCppm > 10) && (LCppm < 1000)) ? Integer.toString(LCppm) : "");
prop.put("xsstopwChecked", env.getConfigBool("xsstopw", true) ? "1" : "0");
prop.put("xdstopwChecked", env.getConfigBool("xdstopw", true) ? "1" : "0");
prop.put("xpstopwChecked", env.getConfigBool("xpstopw", true) ? "1" : "0");
// return rewrite properties
return prop;
}

@ -156,6 +156,14 @@ public class Crawler_p {
String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all URLs are filtered out if a bad value was submitted
String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL);
final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER);
if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL;
final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("crawlingCountryMustMatch", "") : "";
sb.setConfig("crawlingIPMustMatch", ipMustMatch);
sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
// special cases:
if (crawlingStartURL!= null && fullDomain) {
if (crawlingStartURL.isFile()) {
@ -249,7 +257,10 @@ public class Crawler_p {
crawlingStart,
crawlingStartURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -306,6 +317,9 @@ public class Crawler_p {
crawlingStartURL,
newcrawlingMustMatch,
newcrawlingMustNotMatch,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -426,6 +440,9 @@ public class Crawler_p {
crawlURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
@ -463,6 +480,9 @@ public class Crawler_p {
sitemapURL,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
0,
crawlingIfOlder,
crawlingDomMaxPages,
@ -504,6 +524,9 @@ public class Crawler_p {
sitelistURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
ipMustMatch,
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,

@ -149,6 +149,9 @@ public class QuickCrawlLink_p {
crawlingStartURL.getHost(),
crawlingStartURL,
crawlingMustMatch,
crawlingMustNotMatch,
CrawlProfile.MATCH_ALL,
CrawlProfile.MATCH_NEVER,
"",
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month

@ -48,8 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String HANDLE = "handle";
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String FILTER_MUSTMATCH = "generalFilter";
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
@ -63,6 +61,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
public static final String FILTER_URL_MUSTMATCH = "generalFilter"; // for URLs
public static final String FILTER_URL_MUSTNOTMATCH = "nevermatch"; // for URLs
public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch";
public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch";
public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch";
private Pattern mustmatch = null, mustnotmatch = null;
@ -70,8 +73,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* Constructor which creates CrawlPofile from parameters.
* @param name name of the crawl profile
* @param startURL root URL of the crawl
* @param mustmatch URLs which do not match this regex will be ignored
* @param mustnotmatch URLs which match this regex will be ignored
* @param urlMustMatch URLs which do not match this regex will be ignored
* @param urlMustNotMatch URLs which match this regex will be ignored
* @param depth height of the tree which will be created by the crawler
* @param recrawlIfOlder documents which have been indexed in the past will
* be indexed again if they are older than the time (ms) in this parameter
@ -89,8 +92,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public CrawlProfile(
final String name,
final DigestURI startURL,
final String mustmatch,
final String mustnotmatch,
final String urlMustMatch,
final String urlMustNotMatch,
final String ipMustMatch,
final String ipMustNotMatch,
final String countryMustMatch,
final int depth,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
@ -107,14 +113,17 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty");
}
final String handle = (startURL == null)
final String handle = (startURL == null)
? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength)
: ASCII.String(startURL.hash());
put(HANDLE, handle);
put(NAME, name);
put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch);
put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch);
put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch);
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
@ -137,7 +146,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
}
/**
* Adds a parameter to CrawlProfile.
* @param key name of the parameter
@ -174,7 +183,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
//if (r == null) return null;
return r;
}
/**
* Gets the name of the CrawlProfile.
* @return name of the profile
@ -184,7 +193,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return "";
return r;
}
/**
* Gets the root URL of the crawl job.
* @return root URL
@ -193,35 +202,35 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String r = get(START_URL);
return r;
}
/**
* Gets the regex which must be matched by URLs in order to be crawled.
* @return regex which must be matched
*/
public Pattern mustMatchPattern() {
if (this.mustmatch == null) {
String r = get(FILTER_MUSTMATCH);
String r = get(FILTER_URL_MUSTMATCH);
if (r == null) r = CrawlProfile.MATCH_ALL;
this.mustmatch = Pattern.compile(r);
}
return this.mustmatch;
}
/**
* Gets the regex which must not be matched by URLs in order to be crawled.
* @return regex which must not be matched
*/
public Pattern mustNotMatchPattern() {
if (this.mustnotmatch == null) {
String r = get(FILTER_MUSTNOTMATCH);
String r = get(FILTER_URL_MUSTNOTMATCH);
if (r == null) r = CrawlProfile.MATCH_NEVER;
this.mustnotmatch = Pattern.compile(r);
}
return this.mustnotmatch;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
* @return depth of crawl job
*/
@ -235,7 +244,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0;
}
}
public CacheStrategy cacheStrategy() {
final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFEXIST;
@ -246,11 +255,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return CacheStrategy.IFEXIST;
}
}
public void setCacheStrategy(final CacheStrategy newStrategy) {
put(CACHE_STRAGEGY, newStrategy.toString());
}
/**
* Gets the minimum age that an entry must have to be re-crawled.
* @return time in ms
@ -268,7 +277,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return 0L;
}
}
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
@ -283,31 +292,31 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return Integer.MAX_VALUE;
}
}
public boolean crawlingQ() {
final String r = get(CRAWLING_Q);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean pushSolr() {
final String r = get(PUSH_SOLR);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexText() {
final String r = get(INDEX_TEXT);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexMedia() {
final String r = get(INDEX_MEDIA);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
final String r = get(STORE_HTCACHE);
if (r == null) return false;
@ -318,19 +327,19 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeStaticStopwords() {
final String r = get(XSSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeDynamicStopwords() {
final String r = get(XDSTOPW);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean excludeParentStopwords() {
final String r = get(XPSTOPW);
if (r == null) return false;

@ -63,7 +63,8 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
private Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls, profilesInvalidCrawls;
private Map<byte[], Map<String, String>> profilesActiveCrawls;
private final Map<byte[], Map<String, String>> profilesPassiveCrawls, profilesInvalidCrawls;
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
@ -91,28 +92,28 @@ public final class CrawlSwitchboard {
final File profilesInvalidFile = new File(queuesRoot, DBFILE_INVALID_CRAWL_PROFILES);
this.profilesInvalidCrawls = loadFromDB(profilesInvalidFile);
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
final CrawlProfile p;
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTMATCH))) {
this.removeActive(handle);
this.putInvalid(handle, p);
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
removeActive(handle);
putInvalid(handle, p);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTNOTMATCH))) {
this.putInvalid(handle, p);
this.removeActive(handle);
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
putInvalid(handle, p);
removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTNOTMATCH));
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
} else {
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
}
}
initActiveCrawlProfiles();
log.logInfo("Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries");
@ -134,7 +135,7 @@ public final class CrawlSwitchboard {
if (m == null) return null;
return new CrawlProfile(m);
}
public CrawlProfile getInvalid(final byte[] profileKey) {
if (profileKey == null) return null;
final Map<String, String> m = this.profilesInvalidCrawls.get(profileKey);
@ -152,7 +153,7 @@ public final class CrawlSwitchboard {
public Set<byte[]> getActive() {
return this.profilesActiveCrawls.keySet();
}
public Set<byte[]> getInvalid() {
return this.profilesInvalidCrawls.keySet();
}
@ -165,7 +166,7 @@ public final class CrawlSwitchboard {
if (profileKey == null) return;
this.profilesActiveCrawls.remove(profileKey);
}
public void removeInvalid(final byte[] profileKey) {
if (profileKey == null) return;
this.profilesInvalidCrawls.remove(profileKey);
@ -179,7 +180,7 @@ public final class CrawlSwitchboard {
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile);
}
public void putInvalid(final byte[] profileKey, final CrawlProfile profile) {
this.profilesInvalidCrawls.put(profileKey, profile);
}
@ -227,7 +228,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
this.defaultProxyProfile = new CrawlProfile(
"proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"proxy", null,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@ -239,38 +243,38 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
}
@ -324,8 +328,8 @@ public final class CrawlSwitchboard {
((MapHeap) this.profilesInvalidCrawls).close();
((MapHeap) this.profilesPassiveCrawls).close();
}
/**
* Loads crawl profiles from a DB file.
* @param file DB file
