diff --git a/defaults/yacy.init b/defaults/yacy.init
index c7e219a6a..758062e7e 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -516,6 +516,7 @@ proxyURL.rewriteURLs=domainlist
 # Be careful with this number. Consider a branching factor of average 20;
 # A prefetch-depth of 8 would index 25.600.000.000 pages, maybe the whole WWW.
 crawlingDepth=3
+crawlingDirectDocByURL=true
 crawlingIfOlder=-1
 crawlingDomFilterDepth=-1
 crawlingDomMaxPages=-1
@@ -710,10 +711,6 @@ crawler.file.maxFileSize=100000000
 # maximum number of crawler threads
 crawler.MaxActiveThreads = 200
 
-# flag: consider all embedded image/audio/video document links
-# from all crawled documents as its own document
-crawler.embedLinksAsDocuments = true
-
 # maximum size of indexing queue
 indexer.slots = 100
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index b91ea7b52..ca1b4b8c4 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -78,8 +78,11 @@
-        :
-        
+        Crawling Depth:
+        
+        &nbsp;&nbsp;&nbsp;
+        also all linked non-parsable documents
+        
         This defines how often the Crawler will follow links (of links..) embedded in websites.
         0 means that only the page you enter under "Starting Point" will be added
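The removed global switch crawler.embedLinksAsDocuments is replaced by a
per-crawl option whose default is the new yacy.init entry
crawlingDirectDocByURL=true. The checkbox added to the expert form is named
directDocByURL and is pre-checked through the directDocByURLChecked template
property, as the servlet below shows. A minimal, self-contained sketch of that
wiring (a plain Map stands in for the serverSwitch configuration; the class
name is hypothetical, not YaCy code):

    import java.util.HashMap;
    import java.util.Map;

    /** Sketch: how the expert form arrives at a pre-checked checkbox. */
    public class DirectDocByURLDefault {
        public static void main(String[] args) {
            // stands in for the configuration loaded from defaults/yacy.init
            final Map<String, String> config = new HashMap<>();
            config.put("crawlingDirectDocByURL", "true"); // the new default

            // same effect as sb.getConfigBool("crawlingDirectDocByURL", true)
            final boolean checked = Boolean.parseBoolean(
                    config.getOrDefault("crawlingDirectDocByURL", "true"));

            // "1" selects the checked branch of the directDocByURLChecked switch
            System.out.println(checked ? "1" : "0"); // prints: 1
        }
    }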
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index 4b1793e68..30adc4c36 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -42,6 +42,7 @@ public class CrawlStartExpert_p {
         prop.put("starturl", /*(intranet) ? repository :*/ "http://");
         prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
         prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0)));
+        prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0");
         prop.put("mustmatch", /*(intranet) ? repository + ".*" :*/ CrawlProfile.MATCH_ALL_STRING);
         prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
         prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index fbef760bf..8298bcae1 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -187,6 +187,9 @@ public class Crawler_p {
             env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
             if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
 
+            final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on"));
+            env.setConfig("crawlingDirectDocByURL", directDocByURL);
+
             // recrawl
             final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
             boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
@@ -262,6 +265,7 @@ public class Crawler_p {
                             ipMustNotMatch,
                             countryMustMatch,
                             newcrawlingdepth,
+                            directDocByURL,
                             crawlingIfOlder,
                             crawlingDomMaxPages,
                             crawlingQ,
@@ -321,6 +325,7 @@ public class Crawler_p {
                             ipMustNotMatch,
                             countryMustMatch,
                             newcrawlingdepth,
+                            directDocByURL,
                             crawlingIfOlder,
                             crawlingDomMaxPages,
                             crawlingQ,
@@ -444,6 +449,7 @@ public class Crawler_p {
                             ipMustNotMatch,
                             countryMustMatch,
                             newcrawlingdepth,
+                            false,
                             crawlingIfOlder,
                             crawlingDomMaxPages,
                             crawlingQ,
@@ -484,6 +490,7 @@ public class Crawler_p {
                             ipMustNotMatch,
                             countryMustMatch,
                             0,
+                            false,
                             crawlingIfOlder,
                             crawlingDomMaxPages,
                             true,
@@ -528,6 +535,7 @@ public class Crawler_p {
                             ipMustNotMatch,
                             countryMustMatch,
                             newcrawlingdepth,
+                            directDocByURL,
                             crawlingIfOlder,
                             crawlingDomMaxPages,
                             crawlingQ,
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 7de24d99a..4db98c48d 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -154,6 +154,7 @@ public class QuickCrawlLink_p {
                     "",
                     crawlingMustNotMatch,
                     CrawlingDepth,
+                    true,
                     60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
                     -1, // domMaxPages, if negative: no count restriction
                     crawlDynamic,
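One detail worth noting in Crawler_p.java: post.get("directDocByURL", "on")
falls back to "on" when the parameter is absent, so a request that omits the
field entirely (for example an API call built for the old form) behaves as if
the box were checked; only an explicit value other than "on" disables the
flag. A self-contained sketch of that decoding (the helper mimics
serverObjects.get(key, default); class and method names are hypothetical):

    import java.util.HashMap;
    import java.util.Map;

    /** Sketch: why the checkbox parameter defaults to enabled. */
    public class DirectDocByURLDecode {
        // mimics serverObjects.get(key, dflt): returns dflt when key is absent
        static String get(Map<String, String> post, String key, String dflt) {
            final String v = post.get(key);
            return v == null ? dflt : v;
        }

        public static void main(String[] args) {
            final Map<String, String> post = new HashMap<>(); // field not sent
            // same expression as in Crawler_p.java: absent -> "on" -> true
            final boolean directDocByURL = "on".equals(get(post, "directDocByURL", "on"));
            System.out.println(directDocByURL); // prints: true
        }
    }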
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index cce5e2688..53ce0e231 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -51,6 +51,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String NAME = "name";
     public static final String START_URL = "startURL";
     public static final String DEPTH = "generalDepth";
+    public static final String DIRECT_DOC_BY_URL = "directDocByURL";
     public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
     public static final String DOM_MAX_PAGES = "domMaxPages";
     public static final String CRAWLING_Q = "crawlingQ";
@@ -77,7 +78,11 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * @param startURL root URL of the crawl
      * @param urlMustMatch URLs which do not match this regex will be ignored
      * @param urlMustNotMatch URLs which match this regex will be ignored
+     * @param ipMustMatch IPs from URLs which do not match this regex will be ignored
+     * @param ipMustNotMatch IPs from URLs which match this regex will be ignored
+     * @param countryMustMatch URLs are only accepted if the country of the host matches this list of countries
      * @param depth height of the tree which will be created by the crawler
+     * @param directDocByURL if true, then linked documents that cannot be parsed are indexed as documents
      * @param recrawlIfOlder documents which have been indexed in the past will
      *        be indexed again if they are older than the time (ms) in this parameter
      * @param domMaxPages maximum number from one domain which will be indexed
@@ -100,6 +105,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final String ipMustNotMatch,
                  final String countryMustMatch,
                  final int depth,
+                 final boolean directDocByURL,
                  final long recrawlIfOlder /*date*/,
                  final int domMaxPages,
                  final boolean crawlingQ,
@@ -127,6 +133,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(FILTER_IP_MUSTNOTMATCH, (ipMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : ipMustNotMatch);
         put(FILTER_COUNTRY_MUSTMATCH, (countryMustMatch == null) ? "" : countryMustMatch);
         put(DEPTH, depth);
+        put(DIRECT_DOC_BY_URL, directDocByURL);
         put(RECRAWL_IF_OLDER, recrawlIfOlder);
         put(DOM_MAX_PAGES, domMaxPages);
         put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
@@ -298,6 +305,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         }
     }
 
+    public boolean directDocByURL() {
+        final String r = get(DIRECT_DOC_BY_URL);
+        if (r == null) return false;
+        return (r.equals(Boolean.TRUE.toString()));
+    }
+
     public CacheStrategy cacheStrategy() {
         final String r = get(CACHE_STRAGEGY);
         if (r == null) return CacheStrategy.IFEXIST;
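CrawlProfile keeps all of its values as strings in the ConcurrentHashMap it
extends, so the constructor writes the boolean with put(DIRECT_DOC_BY_URL,
directDocByURL) and directDocByURL() reads it back by comparing against
Boolean.TRUE.toString(). A missing entry (as in a profile persisted before
this change) reads as false. A self-contained sketch of that round trip
(plain map instead of a real CrawlProfile; the class name is hypothetical):

    import java.util.concurrent.ConcurrentHashMap;

    /** Sketch: the string round trip behind CrawlProfile.directDocByURL(). */
    public class ProfileFlagRoundTrip {
        public static void main(String[] args) {
            final ConcurrentHashMap<String, String> profile = new ConcurrentHashMap<>();
            profile.put("directDocByURL", Boolean.toString(true)); // stored as "true"

            // same read-back logic as CrawlProfile.directDocByURL()
            final String r = profile.get("directDocByURL");
            final boolean flag = r != null && r.equals(Boolean.TRUE.toString());
            System.out.println(flag); // prints: true

            // a profile from before this patch has no entry and reads as false
            System.out.println(new ConcurrentHashMap<String, String>()
                    .getOrDefault("directDocByURL", "false")); // prints: false
        }
    }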
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index ea2fb73d4..ad4752697 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -233,6 +233,7 @@ public final class CrawlSwitchboard {
                     CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "",
                     0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
+                    true,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
                     true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
                     true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
@@ -243,38 +244,38 @@ public final class CrawlSwitchboard {
         }
         if (this.defaultRemoteProfile == null) {
             // generate new default entry for remote crawling
-            this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0,
+            this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0, true,
                     -1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
         }
         if (this.defaultTextSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+            this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile);
         }
         if (this.defaultTextSnippetGlobalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+            this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile);
         }
         this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
         if (this.defaultMediaSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+            this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile);
         }
         if (this.defaultMediaSnippetGlobalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+            this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile);
         }
         if (this.defaultSurrogateProfile == null) {
             // generate new default entry for surrogate parsing
-            this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0,
+            this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, false,
                     CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE);
             this.profilesActiveCrawls.put(UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
         }
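Every default profile passes true for the new argument except the surrogate
profile, presumably because surrogates are imported, already-parsed documents
with no unparsable links left to index by URL. For readability, the surrogate
call from the hunk above, re-wrapped with the new argument labelled (the
comments are added here and are not part of the patch):

    this.defaultSurrogateProfile = new CrawlProfile(
            CRAWL_PROFILE_SURROGATE, null,
            CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING,
            CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "",
            0,     // depth: surrogates are not crawled further
            false, // directDocByURL: off for surrogate parsing
            CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
            -1, true, true, false, false, false, true, true, false,
            CacheStrategy.NOCACHE);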
diff --git a/source/net/yacy/cora/protocol/Domains.java b/source/net/yacy/cora/protocol/Domains.java
index dc5823d2b..b7ab12c93 100644
--- a/source/net/yacy/cora/protocol/Domains.java
+++ b/source/net/yacy/cora/protocol/Domains.java
@@ -875,18 +875,17 @@ public class Domains {
         );
     }
 
-    public static boolean isLocal(final String host, final InetAddress hostaddress) {
-        return isLocal(host, hostaddress, true);
-    }
-
     /**
      * check if the given host is a local address.
      * the hostaddress is optional and shall be given if the address is already known
      * @param host
      * @param hostaddress may be null if not known yet
-     * @param recursive
      * @return true if the given host is local
      */
+    public static boolean isLocal(final String host, final InetAddress hostaddress) {
+        return isLocal(host, hostaddress, true);
+    }
+
     private static boolean isLocal(final String host, InetAddress hostaddress, final boolean recursive) {
         if (noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
@@ -912,7 +911,7 @@ public class Domains {
         return isLocal(hostaddress);
     }
 
-    public static boolean isLocal(final InetAddress a) {
+    private static boolean isLocal(final InetAddress a) {
         final boolean localp = noLocalCheck || // DO NOT REMOVE THIS! it is correct to return true if the check is off
             a == null ||
@@ -928,6 +927,8 @@ public class Domains {
      * find the locale for a given host. This feature is only available in full quality,
      * if the file InetAddressLocator.jar is placed in the /lib directory (as a plug-in)
      * from http://javainetlocator.sourceforge.net/
+     * In case that you know the InetAddress of the host, DO NOT call this method but the
+     * other method with the InetAddress first to get better results.
      * @param host
      * @return the locale for the host
      */
@@ -935,6 +936,7 @@ public class Domains {
         if (host == null) return null;
         final Locale locale = getLocale(dnsResolve(host));
         if (locale != null && locale.getCountry() != null && locale.getCountry().length() > 0) return locale;
+
         final int p = host.lastIndexOf('.');
         if (p < 0) return null;
         String tld = host.substring(p + 1).toUpperCase();
@@ -945,6 +947,8 @@ public class Domains {
 
     /**
      * find the locale for a given Address
+     * This uses the InetAddressLocator.jar library
+     * TODO: integrate http://www.maxmind.com/app/geolitecountry
      * @param address
      * @return
      */
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index b0c6ac375..72bd822df 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1930,7 +1930,7 @@ public final class Switchboard extends serverSwitch {
         assert response.getContent() != null;
         try {
             // parse the document
-            documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), getConfigBool("crawler.embedLinksAsDocuments", false));
+            documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent(), response.profile().directDocByURL());
             if (documents == null) {
                 throw new Parser.Failure("Parser returned null.", response.url());
             }
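With the last hunk the flag is read per response from the profile that queued
the URL (response.profile().directDocByURL()) instead of from one global
configuration value, so crawls with different settings can run side by side.
What the boolean passed to TextParser.parseSource controls is not visible in
this diff; the following rough sketch, with hypothetical names throughout,
illustrates the intended effect of indexing a non-parsable linked document
"directly by URL":

    import java.net.MalformedURLException;
    import java.net.URL;

    /** Sketch only: not the real TextParser. */
    public class DirectDocSketch {
        // placeholder capability check; the real parser dispatches on mime type
        static boolean hasParserFor(String mime) {
            return mime.startsWith("text/");
        }

        /** Returns pseudo-document text, or null when the link is dropped. */
        static String documentFor(URL url, String mime, boolean directDocByURL) {
            if (hasParserFor(mime)) return "parsed content of " + url; // normal path
            if (!directDocByURL) return null;      // flag off: skip unparsable link
            // flag on: build a stub document from the URL itself
            return url.getHost() + " " + url.getPath().replace('/', ' ');
        }

        public static void main(String[] args) throws MalformedURLException {
            final URL u = new URL("http://example.net/downloads/report.zip");
            System.out.println(documentFor(u, "application/zip", true));  // stub
            System.out.println(documentFor(u, "application/zip", false)); // null
        }
    }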