added directDocByURL attribute in crawl profile

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7985 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent c61e4cfd78
commit cf4fd525ee

@ -516,6 +516,7 @@ proxyURL.rewriteURLs=domainlist
# Be careful with this number. Consider an average branching factor of 20:
# a prefetch depth of 8 would index 25,600,000,000 pages, maybe the whole WWW.
crawlingDepth=3
crawlingDirectDocByURL=true
crawlingIfOlder=-1
crawlingDomFilterDepth=-1
crawlingDomMaxPages=-1
@ -710,10 +711,6 @@ crawler.file.maxFileSize=100000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200
# flag: consider all embedded image/audio/video document links
# from all crawled documents as their own documents
crawler.embedLinksAsDocuments = true
# maximum size of indexing queue
indexer.slots = 100

@ -78,8 +78,11 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="crawlingDepth">Crawling Depth</label>:</td>
<td><input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" /></td>
<td>Crawling Depth:</td>
<td>
<input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;
<input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
</td>
<td>
This defines how often the Crawler will follow links (of links..) embedded in websites.
0 means that only the page you enter under "Starting Point" will be added

@ -42,6 +42,7 @@ public class CrawlStartExpert_p {
prop.put("starturl", /*(intranet) ? repository :*/ "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));

@ -187,6 +187,9 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on"));
env.setConfig("crawlingDirectDocByURL", directDocByURL);
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
@ -262,6 +265,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@ -321,6 +325,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@ -444,6 +449,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
false,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@ -484,6 +490,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
0,
false,
crawlingIfOlder,
crawlingDomMaxPages,
true,
@ -528,6 +535,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,

@ -154,6 +154,7 @@ public class QuickCrawlLink_p {
"",
crawlingMustNotMatch,
CrawlingDepth,
true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,

@ -51,6 +51,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String DEPTH = "generalDepth";
public static final String DIRECT_DOC_BY_URL= "directDocByURL";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
@ -77,7 +78,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* @param startURL root URL of the crawl
* @param urlMustMatch URLs which do not match this regex will be ignored
* @param urlMustNotMatch URLs which match this regex will be ignored
* @param ipMustMatch IPs from URLs which do not match this regex will be ignored
* @param ipMustNotMatch IPs from URLs which match this regex will be ignored
* @param countryMustMatch URLs from a specific country must match
* @param depth height of the tree which will be created by the crawler
* @param directDocByURL if true, then linked documents that cannot be parsed are indexed as documents
* @param recrawlIfOlder documents which have been indexed in the past will
* be indexed again if they are older than the time (ms) in this parameter
* @param domMaxPages maximum number of pages from one domain which will be indexed
@ -100,6 +105,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String ipMustNotMatch,
final String countryMustMatch,
final int depth,
final boolean directDocByURL,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
@ -127,6 +133,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth);
put(DIRECT_DOC_BY_URL, directDocByURL);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
@ -298,6 +305,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
/**
 * Reads the {@code DIRECT_DOC_BY_URL} entry of this profile as a boolean.
 *
 * @return {@code true} only if the stored value is exactly the string form of
 *         {@code Boolean.TRUE}; {@code false} if the entry is missing or holds
 *         any other value
 */
public boolean directDocByURL() {
    final String flag = get(DIRECT_DOC_BY_URL);
    // a missing entry counts as "not set"
    return flag != null && Boolean.TRUE.toString().equals(flag);
}
public CacheStrategy cacheStrategy() {
final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFEXIST;

@ -233,6 +233,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
@ -243,38 +244,38 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0,
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0, true,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
}

@ -875,18 +875,17 @@ public class Domains {
);
}
/**
 * Two-argument form of the local-address check; forwards to
 * {@code isLocal(host, hostaddress, recursive)} with {@code recursive} set to true.
 */
public static boolean isLocal(final String host, final InetAddress hostaddress) {
return isLocal(host, hostaddress, true);
}
/**
* check if the given host is a local address.
* the hostaddress is optional and shall be given if the address is already known.
* This overload forwards to the three-argument variant with recursive = true.
* @param host the host to check
* @param hostaddress may be null if not known yet
* @return true if the given host is local
*/
public static boolean isLocal(final String host, final InetAddress hostaddress) {
return isLocal(host, hostaddress, true);
}
private static boolean isLocal(final String host, InetAddress hostaddress, final boolean recursive) {
if (noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
@ -912,7 +911,7 @@ public class Domains {
return isLocal(hostaddress);
}
public static boolean isLocal(final InetAddress a) {
private static boolean isLocal(final InetAddress a) {
final boolean
localp = noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
a == null ||
@ -928,6 +927,8 @@ public class Domains {
* find the locale for a given host. This feature is only available in full quality,
* if the file InetAddressLocator.jar is placed in the /lib directory (as a plug-in)
* from http://javainetlocator.sourceforge.net/
* If you already know the InetAddress of the host, DO NOT call this method; call the
* other method, which takes the InetAddress, to get better results.
* @param host
* @return the locale for the host
*/
@ -935,6 +936,7 @@ public class Domains {
if (host == null) return null;
final Locale locale = getLocale(dnsResolve(host));
if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale;
final int p = host.lastIndexOf('.');
if (p < 0) return null;
String tld = host.substring(p + 1).toUpperCase();
@ -945,6 +947,8 @@ public class Domains {
/**
* find the locale for a given Address
* This uses the InetAddressLocator.jar library
* TODO: integrate http://www.maxmind.com/app/geolitecountry
* @param address
* @return the locale for the given address
*/

@ -1930,7 +1930,7 @@ public final class Switchboard extends serverSwitch {
assert response.getContent() != null;
try {
// parse the document
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false));
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), response.profile().directDocByURL());
if (documents == null) {
throw new Parser.Failure("Parser returned null.", response.url());
}

Loading…
Cancel
Save