added crawl settings for three new filters, applied to each crawl:

must-match for IPs (the IPs are known after DNS resolution of each URL in the crawl queue)
must-not-match for IPs
must-match against a list of country codes (allows loading only from hosts that are hosted in the given countries)

note: the settings and the input environment are in place with this commit, but the values are not yet evaluated (a sketch of how such an evaluation could look follows the commit metadata below)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7976 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 47a8c69745
commit 5ad7f9612b
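
Since this commit stores the new filter values but does not yet evaluate them, the following is only a sketch of the intended semantics for the two IP filters, applied after the host of a queued URL has been resolved via DNS. The class and method names are illustrative, not code from this commit:

    import java.net.InetAddress;
    import java.net.UnknownHostException;
    import java.util.regex.Pattern;

    // Illustrative only: evaluates the crawlingIPMustMatch / crawlingIPMustNotMatch
    // regular expressions against the resolved IP of a host.
    public final class IpFilterSketch {

        private final Pattern ipMustMatch;    // e.g. ".*" (the shipped default)
        private final Pattern ipMustNotMatch; // e.g. "" (the shipped default; matches no non-empty IP)

        public IpFilterSketch(final String mustMatch, final String mustNotMatch) {
            this.ipMustMatch = Pattern.compile(mustMatch);
            this.ipMustNotMatch = Pattern.compile(mustNotMatch);
        }

        // true if the resolved IP passes both filters
        public boolean acceptHost(final String host) {
            try {
                final String ip = InetAddress.getByName(host).getHostAddress();
                return this.ipMustMatch.matcher(ip).matches()
                    && !this.ipMustNotMatch.matcher(ip).matches();
            } catch (final UnknownHostException e) {
                return false; // an unresolvable host cannot be checked
            }
        }
    }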

--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -557,6 +557,12 @@ xpstopw=true
 # Change to false if requesting hits from peers with modified stopwords-file and using the unchanged client-version
 filterOutStopwordsFromTopwords=true
 
+# crawling steering: must-match/must-not-match
+crawlingIPMustMatch=.*
+crawlingIPMustNotMatch=
+# the default country codes are all codes for countries in Europe
+crawlingCountryMustMatch=AD,AL,AT,BA,BE,BG,BY,CH,CY,CZ,DE,DK,EE,ES,FI,FO,FR,GG,GI,GR,HR,HU,IE,IM,IS,IT,JE,LI,LT,LU,LV,MC,MD,MK,MT,NL,NO,PL,PT,RO,RU,SE,SI,SJ,SK,SM,TR,UA,UK,VA,YU
+
 # performance-settings
 # delay-times for permanent loops (milliseconds)
 # the idlesleep is the pause that an proces sleeps if the last call to the
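
The country filter, by contrast, is not a regular expression but a comma-separated list. A minimal sketch of parsing and testing such a value, where an empty list is taken to mean no restriction (matching the "no country code restriction" default in the crawl-start form below); the helper class is illustrative, not part of this commit:

    import java.util.HashSet;
    import java.util.Locale;
    import java.util.Set;

    public final class CountryFilterSketch {

        // parse a value such as "AD,AL,AT,..." into a lookup set
        public static Set<String> parseCountryCodes(final String list) {
            final Set<String> codes = new HashSet<String>();
            for (final String code : list.split(",")) {
                final String c = code.trim().toUpperCase(Locale.US);
                if (c.length() > 0) codes.add(c);
            }
            return codes;
        }

        // an empty set means "no restriction"
        public static boolean acceptCountry(final Set<String> codes, final String countryCode) {
            return codes.isEmpty() || codes.contains(countryCode);
        }
    }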

--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -86,8 +86,8 @@ public class CrawlProfileEditor_p {
     static {
         labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
         labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.FILTER_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
-        labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
+        labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING));
+        labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
         labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
         labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
         labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
@@ -159,8 +159,8 @@ public class CrawlProfileEditor_p {
         if ((post != null) && (selentry != null)) {
             if (post.containsKey("submit")) {
                 try {
-                    Pattern.compile(post.get(CrawlProfile.FILTER_MUSTMATCH, CrawlProfile.MATCH_ALL));
-                    Pattern.compile(post.get(CrawlProfile.FILTER_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
+                    Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTMATCH, CrawlProfile.MATCH_ALL));
+                    Pattern.compile(post.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH, CrawlProfile.MATCH_NEVER));
                     final Iterator<eentry> lit = labels.iterator();
                     eentry tee;
                     while (lit.hasNext()) {
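
The servlet validates both URL filters by attempting to compile them before anything is stored. A stand-alone version of that check; the helper class is illustrative (YaCy's own RegexHelper.isValidRegex, visible in the CrawlSwitchboard hunk further down, serves the same purpose):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    public final class RegexCheckSketch {

        // true if the expression compiles as a java.util.regex pattern
        public static boolean isValidRegex(final String expression) {
            if (expression == null) return false;
            try {
                Pattern.compile(expression); // same check as in CrawlProfileEditor_p
                return true;
            } catch (final PatternSyntaxException e) {
                return false;
            }
        }
    }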

--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -136,7 +136,7 @@
         </td>
     </tr>
     <tr valign="top" class="TableCellLight">
-        <td><label for="mustmatch">Must-Match Filter</label>:</td>
+        <td><label for="mustmatch">Must-Match Filter for URLs</label>:</td>
         <td>
             <input type="radio" name="range" id="rangeWide" value="wide" checked="checked" />Use filter&nbsp;&nbsp;
             <input name="mustmatch" id="mustmatch" type="text" size="60" maxlength="100" value="#[mustmatch]#" /><br />
@@ -151,7 +151,7 @@
         </td>
     </tr>
     <tr valign="top" class="TableCellDark">
-        <td><label for="mustnotmatch">Must-Not-Match Filter</label>:</td>
+        <td><label for="mustnotmatch">Must-Not-Match Filter for URLs</label>:</td>
         <td>
             <input name="mustnotmatch" id="mustnotmatch" type="text" size="60" maxlength="100" value="#[mustnotmatch]#" />
         </td>
@@ -162,6 +162,37 @@
             If you don't know what this means, please leave this field empty.
         </td>
     </tr>
+    <tr valign="top" class="TableCellLight">
+        <td><label for="ipMustmatch">Must-Match Filter for IPs</label>:</td>
+        <td>
+            <input name="ipMustmatch" id="ipMustmatch" type="text" size="60" maxlength="100" value="#[ipMustmatch]#" />
+        </td>
+        <td>
+            Like the Must-Match Filter for URLs, this filter must match, but it is applied to the IP of the host.
+            YaCy performs a DNS lookup for each host, and this filter restricts the crawl to specific IPs.
+        </td>
+    </tr>
+    <tr valign="top" class="TableCellDark">
+        <td><label for="ipMustnotmatch">Must-Not-Match Filter for IPs</label>:</td>
+        <td>
+            <input name="ipMustnotmatch" id="ipMustnotmatch" type="text" size="60" maxlength="100" value="#[ipMustnotmatch]#" />
+        </td>
+        <td>
+            This filter must not match the IP of the crawled host.
+        </td>
+    </tr>
+    <tr valign="top" class="TableCellLight">
+        <td><label for="crawlingCountryMustMatch">Must-Match List for Country Codes</label>:</td>
+        <td>
+            <input type="radio" name="countryMustMatchSwitch" id="countryMustMatchSwitch" value="true" />Use filter&nbsp;&nbsp;
+            <input name="crawlingCountryMustMatch" id="crawlingCountryMustMatch" type="text" size="60" maxlength="100" value="#[crawlingCountryMustMatch]#" />
+            <input type="radio" name="countryMustMatchSwitch" value="false" checked="checked" />no country code restriction
+        </td>
+        <td>
+            Crawls can be restricted to specific countries. This uses the country code that can be computed from
+            the IP of the server that hosts the page. The filter is not a regular expression but a comma-separated list of country codes.
+        </td>
+    </tr>
     <tr valign="top" class="TableCellDark">
         <td>Maximum Pages per Domain:</td>
         <td>
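
The description above assumes that a country code can be computed from the IP of the hosting server. That lookup is not part of this commit; the sketch below only shows the shape such a resolver could take, with an entirely hypothetical table mapping exact IPs to country codes (a real implementation would use IP ranges, e.g. a GeoIP-style database):

    import java.net.InetAddress;
    import java.net.UnknownHostException;
    import java.util.Map;

    public final class CountryLookupSketch {

        private final Map<String, String> ipToCountry; // hypothetical exact-IP table

        public CountryLookupSketch(final Map<String, String> ipToCountry) {
            this.ipToCountry = ipToCountry;
        }

        // resolve the host and map its IP to a two-letter country code, or null
        public String countryOfHost(final String host) {
            try {
                final String ip = InetAddress.getByName(host).getHostAddress();
                return this.ipToCountry.get(ip);
            } catch (final UnknownHostException e) {
                return null;
            }
        }
    }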

--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -25,6 +25,7 @@
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 
 import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.server.serverObjects;
@@ -34,7 +35,7 @@ public class CrawlStartExpert_p {
     public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
         // return variable that accumulates replacements
-        //final Switchboard sb = (Switchboard) env;
+        final Switchboard sb = (Switchboard) env;
         final serverObjects prop = new serverObjects();
 
         // define visible variables
@@ -43,6 +44,9 @@ public class CrawlStartExpert_p {
         prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
         prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL);
         prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER);
+        prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL));
+        prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER));
+        prop.put("crawlingCountryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
         prop.put("crawlingIfOlderCheck", "0");
         prop.put("crawlingIfOlderUnitYearCheck", "0");

--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -156,6 +156,14 @@ public class Crawler_p {
                 String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
                 final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
                 if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
+                String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL);
+                final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER);
+                if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL;
+                final String countryMustMatch = post.getBoolean("countryMustMatchSwitch", false) ? post.get("crawlingCountryMustMatch", "") : "";
+                sb.setConfig("crawlingIPMustMatch", ipMustMatch);
+                sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch);
+                if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch);
+
                 // special cases:
                 if (crawlingStartURL != null && fullDomain) {
                     if (crawlingStartURL.isFile()) {
@@ -249,7 +257,10 @@ public class Crawler_p {
                         crawlingStart,
                         crawlingStartURL,
                         newcrawlingMustMatch,
-                        CrawlProfile.MATCH_NEVER,
+                        newcrawlingMustNotMatch,
+                        ipMustMatch,
+                        ipMustNotMatch,
+                        countryMustMatch,
                         newcrawlingdepth,
                         crawlingIfOlder,
                         crawlingDomMaxPages,
@@ -306,6 +317,9 @@ public class Crawler_p {
                         crawlingStartURL,
                         newcrawlingMustMatch,
                         newcrawlingMustNotMatch,
+                        ipMustMatch,
+                        ipMustNotMatch,
+                        countryMustMatch,
                         newcrawlingdepth,
                         crawlingIfOlder,
                         crawlingDomMaxPages,
@@ -426,6 +440,9 @@ public class Crawler_p {
                         crawlURL,
                         newcrawlingMustMatch,
                         CrawlProfile.MATCH_NEVER,
+                        ipMustMatch,
+                        ipMustNotMatch,
+                        countryMustMatch,
                         newcrawlingdepth,
                         crawlingIfOlder,
                         crawlingDomMaxPages,
@@ -463,6 +480,9 @@ public class Crawler_p {
                         sitemapURL,
                         CrawlProfile.MATCH_ALL,
                         CrawlProfile.MATCH_NEVER,
+                        ipMustMatch,
+                        ipMustNotMatch,
+                        countryMustMatch,
                         0,
                         crawlingIfOlder,
                         crawlingDomMaxPages,
@@ -504,6 +524,9 @@ public class Crawler_p {
                         sitelistURL,
                         newcrawlingMustMatch,
                         CrawlProfile.MATCH_NEVER,
+                        ipMustMatch,
+                        ipMustNotMatch,
+                        countryMustMatch,
                         newcrawlingdepth,
                         crawlingIfOlder,
                         crawlingDomMaxPages,
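
Besides passing the new values into every CrawlProfile constructor call, Crawler_p persists them with sb.setConfig(...) so that CrawlStartExpert_p can pre-fill the form with them on the next crawl start (see its sb.getConfig(...) calls above). A self-contained illustration of that round trip, using java.util.Properties as a stand-in for the Switchboard configuration:

    import java.util.Properties;

    public final class ConfigRoundTripSketch {
        public static void main(final String[] args) {
            final Properties config = new Properties(); // stand-in for the Switchboard config

            // Crawler_p side: persist the submitted filters as the new defaults
            config.setProperty("crawlingIPMustMatch", "192\\.168\\..*");
            config.setProperty("crawlingIPMustNotMatch", "");

            // CrawlStartExpert_p side: pre-fill the form on the next crawl start
            System.out.println(config.getProperty("crawlingIPMustMatch", ".*"));
            System.out.println(config.getProperty("crawlingIPMustNotMatch", ""));
        }
    }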

--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -149,6 +149,9 @@ public class QuickCrawlLink_p {
                 crawlingStartURL.getHost(),
                 crawlingStartURL,
                 crawlingMustMatch,
                 crawlingMustNotMatch,
+                CrawlProfile.MATCH_ALL,
+                CrawlProfile.MATCH_NEVER,
+                "",
                 CrawlingDepth,
                 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month

--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -48,8 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String HANDLE = "handle";
     public static final String NAME = "name";
     public static final String START_URL = "startURL";
-    public static final String FILTER_MUSTMATCH = "generalFilter";
-    public static final String FILTER_MUSTNOTMATCH = "nevermatch";
     public static final String DEPTH = "generalDepth";
     public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
     public static final String DOM_MAX_PAGES = "domMaxPages";
@@ -63,6 +61,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String XDSTOPW = "xdstopw";
     public static final String XPSTOPW = "xpstopw";
     public static final String CACHE_STRAGEGY = "cacheStrategy";
+    public static final String FILTER_URL_MUSTMATCH = "generalFilter"; // for URLs
+    public static final String FILTER_URL_MUSTNOTMATCH = "nevermatch"; // for URLs
+    public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch";
+    public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch";
+    public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch";
 
     private Pattern mustmatch = null, mustnotmatch = null;
@@ -70,8 +73,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * Constructor which creates CrawlPofile from parameters.
      * @param name name of the crawl profile
      * @param startURL root URL of the crawl
-     * @param mustmatch URLs which do not match this regex will be ignored
-     * @param mustnotmatch URLs which match this regex will be ignored
+     * @param urlMustMatch URLs which do not match this regex will be ignored
+     * @param urlMustNotMatch URLs which match this regex will be ignored
      * @param depth height of the tree which will be created by the crawler
      * @param recrawlIfOlder documents which have been indexed in the past will
      *        be indexed again if they are older than the time (ms) in this parameter
@@ -89,8 +92,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public CrawlProfile(
                  final String name,
                  final DigestURI startURL,
-                 final String mustmatch,
-                 final String mustnotmatch,
+                 final String urlMustMatch,
+                 final String urlMustNotMatch,
+                 final String ipMustMatch,
+                 final String ipMustNotMatch,
+                 final String countryMustMatch,
                  final int depth,
                  final long recrawlIfOlder /*date*/,
                  final int domMaxPages,
@@ -113,8 +119,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(HANDLE, handle);
         put(NAME, name);
         put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false));
-        put(FILTER_MUSTMATCH, (mustmatch == null) ? CrawlProfile.MATCH_ALL : mustmatch);
-        put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
+        put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL : urlMustMatch);
+        put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : urlMustNotMatch);
+        put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL : ipMustMatch);
+        put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER : ipMustNotMatch);
+        put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
         put(DEPTH, depth);
         put(RECRAWL_IF_OLDER, recrawlIfOlder);
         put(DOM_MAX_PAGES, domMaxPages);
@@ -200,7 +209,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      */
     public Pattern mustMatchPattern() {
         if (this.mustmatch == null) {
-            String r = get(FILTER_MUSTMATCH);
+            String r = get(FILTER_URL_MUSTMATCH);
             if (r == null) r = CrawlProfile.MATCH_ALL;
             this.mustmatch = Pattern.compile(r);
         }
@@ -213,7 +222,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      */
     public Pattern mustNotMatchPattern() {
         if (this.mustnotmatch == null) {
-            String r = get(FILTER_MUSTNOTMATCH);
+            String r = get(FILTER_URL_MUSTNOTMATCH);
             if (r == null) r = CrawlProfile.MATCH_NEVER;
             this.mustnotmatch = Pattern.compile(r);
         }
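
mustMatchPattern() and mustNotMatchPattern() compile their regular expression lazily and cache the resulting Pattern. The commit defines FILTER_IP_MUSTMATCH and FILTER_IP_MUSTNOTMATCH but adds no corresponding accessors yet; a follow-up inside CrawlProfile in the same style could look like this (hypothetical code, not part of this commit):

    private Pattern ipMustmatch = null;

    // lazily compiled pattern for FILTER_IP_MUSTMATCH, in the style of mustMatchPattern()
    public Pattern ipMustMatchPattern() {
        if (this.ipMustmatch == null) {
            String r = get(FILTER_IP_MUSTMATCH);
            if (r == null) r = CrawlProfile.MATCH_ALL;
            this.ipMustmatch = Pattern.compile(r);
        }
        return this.ipMustmatch;
    }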

--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -63,7 +63,8 @@ public final class CrawlSwitchboard {
     public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
 
     private final Log log;
-    private Map<byte[], Map<String, String>> profilesActiveCrawls, profilesPassiveCrawls, profilesInvalidCrawls;
+    private Map<byte[], Map<String, String>> profilesActiveCrawls;
+    private final Map<byte[], Map<String, String>> profilesPassiveCrawls, profilesInvalidCrawls;
     public CrawlProfile defaultProxyProfile;
     public CrawlProfile defaultRemoteProfile;
     public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
@@ -97,18 +98,18 @@ public final class CrawlSwitchboard {
         for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
             final CrawlProfile p;
             p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
-            if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTMATCH))) {
-                this.removeActive(handle);
-                this.putInvalid(handle, p);
+            if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
+                removeActive(handle);
+                putInvalid(handle, p);
                 Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
-                        + " from active crawls since " + CrawlProfile.FILTER_MUSTMATCH
-                        + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTMATCH));
-            } else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_MUSTNOTMATCH))) {
-                this.putInvalid(handle, p);
-                this.removeActive(handle);
+                        + " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
+                        + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
+            } else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
+                putInvalid(handle, p);
+                removeActive(handle);
                 Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
-                        + " from active crawls since " + CrawlProfile.FILTER_MUSTNOTMATCH
-                        + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_MUSTNOTMATCH));
+                        + " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
+                        + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
             } else {
                 Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
             }
@@ -227,7 +228,10 @@ public final class CrawlSwitchboard {
         if (this.defaultProxyProfile == null) {
             // generate new default entry for proxy crawling
             this.defaultProxyProfile = new CrawlProfile(
-                    "proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
+                    "proxy", null,
+                    CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
+                    CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
+                    "",
                     0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
                     true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@@ -239,38 +243,38 @@ public final class CrawlSwitchboard {
         }
         if (this.defaultRemoteProfile == null) {
             // generate new default entry for remote crawling
-            this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
                     -1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
         }
         if (this.defaultTextSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
         }
         if (this.defaultTextSnippetGlobalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
         }
         this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
         if (this.defaultMediaSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
         }
         if (this.defaultMediaSnippetGlobalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
         }
         if (this.defaultSurrogateProfile == null) {
             // generate new default entry for surrogate parsing
-            this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, "", 0,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
         }
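
The loading loop above validates only the two URL filters with RegexHelper.isValidRegex. Since the new IP filters are regular expressions as well, the same guard could later be extended to them; a hypothetical additional branch for that loop, not part of this commit:

    // note: profiles stored before this commit have no FILTER_IP_MUSTMATCH entry,
    // so a missing (null) value is treated as valid, i.e. as MATCH_ALL
    } else if (p.get(CrawlProfile.FILTER_IP_MUSTMATCH) != null
            && !RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_IP_MUSTMATCH))) {
        putInvalid(handle, p);
        removeActive(handle);
        Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
                + " from active crawls since " + CrawlProfile.FILTER_IP_MUSTMATCH
                + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_IP_MUSTMATCH));
    }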
