diff --git a/defaults/yacy.init b/defaults/yacy.init
index c7e219a6a..758062e7e 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -516,6 +516,7 @@ proxyURL.rewriteURLs=domainlist
# Be careful with this number. Consider a branching factor of average 20;
# A prefetch-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
crawlingDepth=3
+crawlingDirectDocByURL=true
crawlingIfOlder=-1
crawlingDomFilterDepth=-1
crawlingDomMaxPages=-1
@@ -710,10 +711,6 @@ crawler.file.maxFileSize=100000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200
-# flag: consider all embedded image/audio/video document links
-# from all crawled documents as its own document
-crawler.embedLinksAsDocuments = true
-
# maximum size of indexing queue
indexer.slots = 100
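
The global flag crawler.embedLinksAsDocuments (removed above) is replaced by the per-crawl default crawlingDirectDocByURL. A minimal sketch of how the new default is read, mirroring the getConfigBool call that appears in CrawlStartExpert_p.java below (variable names are illustrative):

    // read the shipped default from yacy.init; true is used when the key is absent
    final boolean directDocByURL = sb.getConfigBool("crawlingDirectDocByURL", true);
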
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index b91ea7b52..ca1b4b8c4 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -78,8 +78,11 @@
            </tr>
            <tr valign="top" class="TableCellSummary">
                <td><label for="crawlingDepth">Crawling Depth</label>:</td>
-                <td><input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" /></td>
+                <td>
+                    <input name="crawlingDepth" id="crawlingDepth" type="text" size="2" maxlength="2" value="#[crawlingDepth]#" />&nbsp;&nbsp;&nbsp;&nbsp;
+                    <input type="checkbox" name="directDocByURL" id="directDocByURL" #(directDocByURLChecked)#::checked="checked"#(/directDocByURLChecked)# />also all linked non-parsable documents
+                </td>
                <td>
This defines how often the Crawler will follow links (of links..) embedded in websites.
0 means that only the page you enter under "Starting Point" will be added
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index 4b1793e68..30adc4c36 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -42,6 +42,7 @@ public class CrawlStartExpert_p {
prop.put("starturl", /*(intranet) ? repository :*/ "http://");
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
+ prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index fbef760bf..8298bcae1 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -187,6 +187,9 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
+ final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on"));
+ env.setConfig("crawlingDirectDocByURL", directDocByURL);
+
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
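
Note the fallback value "on": browsers omit unchecked checkboxes from form posts, so the expression above only yields false when a client explicitly posts a different value for directDocByURL (e.g. via the API). If an absent field should count as unchecked instead, the parse would read as follows (a sketch, not the committed behavior):

    // treat a missing checkbox field as "off"
    final boolean directDocByURL = "on".equals(post.get("directDocByURL", "off"));
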
@@ -262,6 +265,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
+ directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@@ -321,6 +325,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
+ directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@@ -444,6 +449,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
+ false,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
@@ -484,6 +490,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
0,
+ false,
crawlingIfOlder,
crawlingDomMaxPages,
true,
@@ -528,6 +535,7 @@ public class Crawler_p {
ipMustNotMatch,
countryMustMatch,
newcrawlingdepth,
+ directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 7de24d99a..4db98c48d 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -154,6 +154,7 @@ public class QuickCrawlLink_p {
"",
crawlingMustNotMatch,
CrawlingDepth,
+ true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index cce5e2688..53ce0e231 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -51,6 +51,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String NAME = "name";
public static final String START_URL = "startURL";
public static final String DEPTH = "generalDepth";
+ public static final String DIRECT_DOC_BY_URL = "directDocByURL";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
@@ -77,7 +78,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* @param startURL root URL of the crawl
* @param urlMustMatch URLs which do not match this regex will be ignored
* @param urlMustNotMatch URLs which match this regex will be ignored
+ * @param ipMustMatch IPs from URLs which do not match this regex will be ignored
+ * @param ipMustNotMatch IPs from URLs which match this regex will be ignored
+ * @param countryMustMatch list of country codes; URLs are only accepted if the host's IP is located in one of these countries
* @param depth height of the tree which will be created by the crawler
+ * @param directDocByURL if true, linked documents that cannot be parsed are indexed as documents of their own
* @param recrawlIfOlder documents which have been indexed in the past will
* be indexed again if they are older than the time (ms) in this parameter
* @param domMaxPages maximum number from one domain which will be indexed
@@ -100,6 +105,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String ipMustNotMatch,
final String countryMustMatch,
final int depth,
+ final boolean directDocByURL,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
@@ -127,6 +133,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : ipMustNotMatch);
put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
put(DEPTH, depth);
+ put(DIRECT_DOC_BY_URL, directDocByURL);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
@@ -298,6 +305,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
+ public boolean directDocByURL() {
+ final String r = get(DIRECT_DOC_BY_URL);
+ if (r == null) return false;
+ return (r.equals(Boolean.TRUE.toString()));
+ }
+
public CacheStrategy cacheStrategy() {
final String r = get(CACHE_STRAGEGY);
if (r == null) return CacheStrategy.IFEXIST;
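
Since the getter falls back to false when the key is absent, crawl profiles persisted before this change simply keep the old behavior instead of breaking. A hedged usage sketch, mirroring the call site in Switchboard.java at the end of this patch:

    final CrawlProfile profile = response.profile();
    if (profile.directDocByURL()) {
        // the parser also returns linked documents that it cannot parse
        // (images, audio, video) as documents of their own
    }
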
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index ea2fb73d4..ad4752697 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -233,6 +233,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
"",
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
+ true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
@@ -243,38 +244,38 @@ public final class CrawlSwitchboard {
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
- this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0,
+ this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0, true,
-1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+ this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+ this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+ this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
- this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+ this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
- this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+ this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
}
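
All built-in profiles now carry an explicit value: the proxy, remote and the four snippet profiles pass true, while the surrogate profile passes false, because surrogates are imported, already-parsed documents with nothing left to fetch by URL. After initialization the stored defaults therefore differ only for surrogates (a minimal sketch using the fields set above):

    assert this.defaultRemoteProfile.directDocByURL();      // true: media links become own documents
    assert !this.defaultSurrogateProfile.directDocByURL();  // false: nothing left to fetch
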
diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java
index dc5823d2b..b7ab12c93 100644
--- a/source/net/yacy/cora/protocol/Domains.java
+++ b/source/net/yacy/cora/protocol/Domains.java
@@ -875,18 +875,17 @@ public class Domains {
);
}
- public static boolean isLocal(final String host, final InetAddress hostaddress) {
- return isLocal(host, hostaddress, true);
- }
-
/**
* check if the given host is a local address.
* the hostaddress is optional and shall be given if the address is already known
* @param host
* @param hostaddress may be null if not known yet
- * @param recursive
* @return true if the given host is local
*/
+ public static boolean isLocal(final String host, final InetAddress hostaddress) {
+ return isLocal(host, hostaddress, true);
+ }
+
private static boolean isLocal(final String host, InetAddress hostaddress, final boolean recursive) {
if (noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
@@ -912,7 +911,7 @@ public class Domains {
return isLocal(hostaddress);
}
- public static boolean isLocal(final InetAddress a) {
+ private static boolean isLocal(final InetAddress a) {
final boolean
localp = noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
a == null ||
@@ -928,6 +927,8 @@ public class Domains {
* find the locale for a given host. This feature is only available in full quality,
* if the file InetAddressLocator.jar is placed in the /lib directory (as a plug-in)
* from http://javainetlocator.sourceforge.net/
+ * In case you already know the InetAddress of the host, DO NOT call this
+ * method; call the variant that takes the InetAddress to get better results.
* @param host
* @return the locale for the host
*/
@@ -935,6 +936,7 @@ public class Domains {
if (host == null) return null;
final Locale locale = getLocale(dnsResolve(host));
if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale;
+
final int p = host.lastIndexOf('.');
if (p < 0) return null;
String tld = host.substring(p + 1).toUpperCase();
@@ -945,6 +947,8 @@ public class Domains {
/**
* find the locale for a given Address
+ * This uses the InetAddressLocator.jar library
+ * TODO: integrate http://www.maxmind.com/app/geolitecountry
* @param address
* @return
*/
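
Following the advice added to the javadoc, a caller that already holds an InetAddress should prefer the address-based lookup and use the host-string variant only as a fallback, since the latter triggers its own DNS resolution. A sketch (assuming both getLocale overloads are public, as the javadoc's advice implies):

    final InetAddress address = Domains.dnsResolve(host);  // resolve once
    final Locale locale = (address == null)
            ? Domains.getLocale(host)       // falls back to the TLD heuristic
            : Domains.getLocale(address);   // InetAddressLocator-based lookup
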
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index b0c6ac375..72bd822df 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1930,7 +1930,7 @@ public final class Switchboard extends serverSwitch {
assert response.getContent() != null;
try {
// parse the document
- documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false));
+ documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), response.profile().directDocByURL());
if (documents == null) {
throw new Parser.Failure("Parser returned null.", response.url());
}
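
With this change the parser toggle travels with the crawl profile instead of a node-global setting, so two concurrent crawls can behave differently. The final boolean of parseSource is exactly the switch described by the comment removed from yacy.init: when true, embedded image/audio/video links of a crawled document are returned as documents of their own. Condensed, with the argument roles spelled out (roles are descriptive, not the formal parameter names):

    documents = TextParser.parseSource(
            response.url(),                        // document location
            response.getMimeType(),                // declared mime type
            response.getCharacterEncoding(),       // declared charset
            response.getContent(),                 // raw content bytes
            response.profile().directDocByURL());  // per-crawl flag, formerly a global config
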