diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 2c40befb5..1d4d19c87 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -32,6 +32,7 @@ import java.util.TreeMap; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.SpaceExceededException; +import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.data.CrawlProfile; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; @@ -124,7 +125,7 @@ public class CrawlProfileEditor_p { final Map orderdHandles = new TreeMap(); for (final byte[] h : sb.crawler.getActive()) { selentry = sb.crawler.getActive(h); - if (selentry != null && !CrawlProfile.ignoreNames.contains(selentry.name())) { + if (selentry != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(selentry.name())) { orderdHandles.put(selentry.collectionName(), selentry.handle()); } } diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index e57664428..f0a112eca 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -40,6 +40,7 @@ import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.SpaceExceededException; +import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.ZURL.FailCategory; @@ -532,7 +533,7 @@ public class Crawler_p { String hosts = ""; for (final byte[] h: sb.crawler.getActive()) { profile = sb.crawler.getActive(h); - if (CrawlProfile.ignoreNames.contains(profile.name())) continue; + if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue; profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength); prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 
1 : 0); if (debug) { diff --git a/htroot/IndexCreateQueues_p.java b/htroot/IndexCreateQueues_p.java index fefebcf5a..b1743229b 100644 --- a/htroot/IndexCreateQueues_p.java +++ b/htroot/IndexCreateQueues_p.java @@ -68,14 +68,7 @@ public class IndexCreateQueues_p { for (final byte[] handle: sb.crawler.getActive()) { entry = sb.crawler.getActive(handle); final String name = entry.name(); - if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || - name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) - continue; + if (CrawlSwitchboard.DEFAULT_PROFILES.contains(name)) continue; if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes()); } } else { diff --git a/htroot/WatchWebStructure_p.java b/htroot/WatchWebStructure_p.java index 83fe658ae..a376c7b09 100644 --- a/htroot/WatchWebStructure_p.java +++ b/htroot/WatchWebStructure_p.java @@ -56,14 +56,7 @@ public class WatchWebStructure_p { CrawlProfile e; for (final byte[] handle: sb.crawler.getActive()) { e = sb.crawler.getActive(handle); - if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) || - e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) || - e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) || - e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) || - e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) || - e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) || - e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)) - continue; + if (CrawlSwitchboard.DEFAULT_PROFILES.contains(e.name())) continue; host = e.name(); break; // take the first one } diff --git 
a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index aea158e1c..b56bece92 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -60,16 +60,30 @@ public final class CrawlSwitchboard { public static final String CRAWL_PROFILE_REMOTE = "remote"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText"; + public static final String CRAWL_PROFILE_GREEDY_LEARNING_TEXT = "snippetGreedyLearningText"; public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia"; public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia"; public static final String CRAWL_PROFILE_SURROGATE = "surrogates"; + public static final Set DEFAULT_PROFILES = new HashSet(); + static { + DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY); + DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE); + DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT); + DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT); + DEFAULT_PROFILES.add(CRAWL_PROFILE_GREEDY_LEARNING_TEXT); + DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA); + DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); + DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE); + } + public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap"; public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap"; public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L; public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; + public static final long CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long
CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L; public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L; @@ -82,6 +96,7 @@ public final class CrawlSwitchboard { public CrawlProfile defaultProxyProfile; public CrawlProfile defaultRemoteProfile; public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile; + public CrawlProfile defaultTextGreedyLearningProfile; public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile; public CrawlProfile defaultSurrogateProfile; private final File queuesRoot; @@ -344,6 +359,34 @@ public final class CrawlSwitchboard { UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); + // generate new default entry for greedy learning + this.defaultTextGreedyLearningProfile = + new CrawlProfile( + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, + CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch + CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + 0, + false, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE), + -1, + true, + false, + false, + true, + false, + CacheStrategy.IFEXIST, + "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT); + this.profilesActiveCrawls.put( + UTF8.getBytes(this.defaultTextGreedyLearningProfile.handle()), + this.defaultTextGreedyLearningProfile); // generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile( @@ -465,13 +508,7 @@ public final class CrawlSwitchboard { } catch ( final SpaceExceededException e ) { continue; } - if ( !((entry.name().equals(CRAWL_PROFILE_PROXY)) - || (entry.name().equals(CRAWL_PROFILE_REMOTE)) - || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) - || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) - || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) - || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name() - .equals(CRAWL_PROFILE_SURROGATE))) ) { + if (!DEFAULT_PROFILES.contains(entry.name())) { final CrawlProfile p = new CrawlProfile(entry); this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p); this.profilesActiveCrawls.remove(handle); @@ -494,13 +531,7 @@ public final class CrawlSwitchboard { for (final byte[] handle: this.getActive()) { CrawlProfile entry; entry = new CrawlProfile(this.getActive(handle)); - if (!((entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY)) - || (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE)) - || (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) - || (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) - || (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) - || (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) - || (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)))) { + if (!CrawlSwitchboard.DEFAULT_PROFILES.contains(entry.name())) { deletionCandidate.add(ASCII.String(handle)); } } diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 82da4cbfd..3d6619306 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -26,7 +26,6 @@ package net.yacy.crawler.data; import java.text.DateFormat; -import java.util.HashSet; import java.util.Iterator; import java.util.Map; 
import java.util.Set; @@ -546,19 +545,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString(); } - public static final Set ignoreNames = new HashSet(); - static { - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_REMOTE); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT); - ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE); - ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES); - ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES); - } - public void putProfileEntry( final String CRAWL_PROFILE_PREFIX, final serverObjects prop, @@ -569,7 +555,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0"); prop.put(CRAWL_PROFILE_PREFIX + count + "_name", this.collectionName()); - prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", (!active || ignoreNames.contains(this.name())) ? "0" : "1"); + prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", (!active || CrawlSwitchboard.DEFAULT_PROFILES.contains(this.name())) ? "0" : "1"); prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", (active) ? 
"0" : "1"); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle()); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 161e5d428..3241b54d9 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2076,6 +2076,10 @@ public final class Switchboard extends serverSwitch { selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE))); insert = true; } + if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) { + selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE))); + insert = true; + } if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) { selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE))); insert = true; @@ -2484,7 +2488,7 @@ public final class Switchboard extends serverSwitch { ) { // get the hyperlinks final Map hl = Document.getHyperlinks(documents); - + // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links if (response.profile().directDocByURL()) { hl.putAll(Document.getImagelinks(documents));