diff --git a/defaults/yacy.init b/defaults/yacy.init index d52d4a7f6..7e72ae9e6 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1156,3 +1156,14 @@ interaction.autocrawler.categoryfilter = .* # host browser settings browser.autoload = false browser.load4everyone = false + + +# greedy learning: fast information acquisition heuristic for new peers +# to make greedy learning work, it must be enabled in the network definition +# the user may switch it off at any time, but if the automatic learning limit is reached +# then the active flag is set to false automatically and this will switch to that state +# automatically by the cleanup process each time if the user switches it on again. +# While the switch is on, it will cause that the user-submitted search will be done along +# with some heuristics like: loading linked documents and adding a twitter search. +# When the learning mode is finished, the user may switch on individual heuristics by himself. +greedylearning.active = true \ No newline at end of file diff --git a/defaults/yacy.network.freeworld.unit b/defaults/yacy.network.freeworld.unit index 088d5f77b..5872fc4ff 100644 --- a/defaults/yacy.network.freeworld.unit +++ b/defaults/yacy.network.freeworld.unit @@ -73,11 +73,13 @@ network.unit.remotecrawl.speed = 300 # addresses of seed-list bootstrap locations network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt -network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt -network.unit.bootstrap.seedlist3 = http://www.lulabad.de/seed.txt -network.unit.bootstrap.seedlist4 = http://sixcooler.de/yacy/seed.txt -network.unit.bootstrap.seedlist5 = http://headrift.dyndns.org/yacy/seed.txt -network.unit.bootstrap.seedlist6 = http://dk5ras.dyndns.org/seed.txt +network.unit.bootstrap.seedlist2 = http://www.lulabad.de/seed.txt +network.unit.bootstrap.seedlist3 = http://sixcooler.de/yacy/seed.txt 
+network.unit.bootstrap.seedlist4 = http://img.homepage.bluewin.ch/352348/seed.txt +network.unit.bootstrap.seedlist5 = https://esbek.iv.net.pl/yacy/seed.txt +network.unit.bootstrap.seedlist6 = http://yacy.seed.mylookr.com/seed.txt +network.unit.bootstrap.seedlist7 = http://mary.dyndns.biz/yacy/seed.txt + # each network may use different yacy distributions. # the auto-updater can access network-specific update locations @@ -94,3 +96,7 @@ network.unit.protocol.control = uncontrolled # white/blacklists network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost network.unit.access.blacklist = + +# greedy learning: fast information acquisition heuristic for new peers +greedylearning.enabled = true +greedylearning.limit.doccount = 15000 diff --git a/defaults/yacy.network.intranet.unit b/defaults/yacy.network.intranet.unit index 7eafceac6..7a27510dc 100644 --- a/defaults/yacy.network.intranet.unit +++ b/defaults/yacy.network.intranet.unit @@ -31,4 +31,8 @@ network.unit.protocol.control = uncontrolled # white/blacklists network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost -network.unit.access.blacklist = \ No newline at end of file +network.unit.access.blacklist = + +# greedy learning: fast information acquisition heuristic for new peers +greedylearning.enabled = false +greedylearning.limit.doccount = 15000 \ No newline at end of file diff --git a/defaults/yacy.network.metager.unit b/defaults/yacy.network.metager.unit index 3773bbd63..5a1a7b56d 100644 --- a/defaults/yacy.network.metager.unit +++ b/defaults/yacy.network.metager.unit @@ -90,4 +90,8 @@ network.unit.protocol.control = uncontrolled # white/blacklists network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,213.183.195.83,130.75.2.35,85.31.186.137,localhost -network.unit.access.blacklist = \ No newline at end of file +network.unit.access.blacklist = 
+ +# greedy learning: fast information acquisition heuristic for new peers +greedylearning.enabled = false +greedylearning.limit.doccount = 15000 \ No newline at end of file diff --git a/defaults/yacy.network.webportal.unit b/defaults/yacy.network.webportal.unit index b87782cd3..a440e900a 100644 --- a/defaults/yacy.network.webportal.unit +++ b/defaults/yacy.network.webportal.unit @@ -28,4 +28,8 @@ network.unit.protocol.control = uncontrolled # white/blacklists network.unit.access.whitelist = 10\..*,127\..*,172\.(1[6-9]|2[0-9]|3[0-1])\..*,169\.254\..*,192\.168\..*,localhost -network.unit.access.blacklist = \ No newline at end of file +network.unit.access.blacklist = + +# greedy learning: fast information acquisition heuristic for new peers +greedylearning.enabled = false +greedylearning.limit.doccount = 15000 \ No newline at end of file diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index 3b65c8f2e..c96c005ce 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -55,25 +55,25 @@ public class ConfigHeuristics_p { // store this call as api call sb.tables.recordAPICall(post, "ConfigHeuristics.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "heuristic settings"); - if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true); - if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false); - if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true); - if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false); - if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true); - if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false); - if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true); - if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false); - if (post.containsKey("twitter_on")) 
sb.setConfig("heuristic.twitter", true); - if (post.containsKey("twitter_off")) sb.setConfig("heuristic.twitter", false); + if (post.containsKey("site_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, true); + if (post.containsKey("site_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false); + if (post.containsKey("searchresult_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, true); + if (post.containsKey("searchresult_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false); + if (post.containsKey("searchresultglobal_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, true); + if (post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false); + if (post.containsKey("blekko_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, true); + if (post.containsKey("blekko_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false); + if (post.containsKey("twitter_on")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, true); + if (post.containsKey("twitter_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false); if (post.containsKey("opensearch_on")) { - sb.setConfig("heuristic.opensearch", true); + sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true); // re-read config (and create work table) OpenSearchConnector os = new OpenSearchConnector(sb, true); if (os.getSize() == 0) { osderrmsg = "no active search targets are configured"; } } - if (post.containsKey("opensearch_off")) sb.setConfig("heuristic.opensearch", false); + if (post.containsKey("opensearch_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, false); if (post.containsKey("discoverosd")) { final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && 
sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name())); @@ -155,12 +155,12 @@ public class ConfigHeuristics_p { && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name())) && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false); if (!showmetafieldbutton) prop.put("osdsolrfieldswitch",1); - prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0); - prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0); - prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0); - prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0); - prop.put("twitter.checked", sb.getConfigBool("heuristic.twitter", false) ? 1 : 0); - prop.put("opensearch.checked", sb.getConfigBool("heuristic.opensearch", false) ? 1 : 0); + prop.put("site.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) ? 1 : 0); + prop.put("searchresult.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) ? 1 : 0); + prop.put("searchresultglobal.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false) ? 1 : 0); + prop.put("blekko.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false) ? 1 : 0); + prop.put("twitter.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false) ? 1 : 0); + prop.put("opensearch.checked", sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) ? 
1 : 0); // display config file content final File f = new File (sb.getDataPath(),"DATA/SETTINGS/heuristicopensearch.conf"); @@ -238,7 +238,7 @@ public class ConfigHeuristics_p { } // re-read config (and create/update work table) - if (sb.getConfigBool("heuristic.opensearch", true)) { + if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) { OpenSearchConnector os = new OpenSearchConnector(sb, true); } } diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java index 1df009045..731c19457 100644 --- a/htroot/ConfigNetwork_p.java +++ b/htroot/ConfigNetwork_p.java @@ -91,9 +91,9 @@ public class ConfigNetwork_p boolean indexReceive = "on".equals(post.get("indexReceive", "")); if ( !indexReceive ) { // remove heuristics - sb.setConfig("heuristic.site", false); - sb.setConfig("heuristic.blekko", false); - sb.setConfig("heuristic.twitter", false); + sb.setConfig(SwitchboardConstants.HEURISTIC_SITE, false); + sb.setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false); + sb.setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false); } final boolean robinsonmode = "robinson".equals(post.get("network", "")); if ( robinsonmode ) { diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index e999d739a..c8d3191c2 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -158,10 +158,13 @@ public class yacysearch { sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true) || sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true) || clustersearch; - boolean global = post == null || (post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0 && indexReceiveGranted); - prop.put("topmenu_resource-select", (sb.peers == null || sb.peers.sizeConnected() == 0 || !indexReceiveGranted) ? 0 : global ? 
1 : 2); + boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted; + boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode); + boolean stealthmode = p2pmode && !global; + prop.put("topmenu_resource-select", stealthmode ? 2 : global ? 1 : 0); if ( post == null || indexSegment == null || env == null || !searchAllowed ) { + if (indexSegment == null) Log.logInfo("yacysearch", "indexSegment == null"); // we create empty entries for template strings prop.put("searchagain", "0"); prop.put("former", ""); @@ -483,7 +486,7 @@ public class yacysearch { } final int heuristicTwitter = querystring.indexOf("/heuristic/twitter", 0); - if ( heuristicBlekko >= 0 ) { + if ( heuristicTwitter >= 0 ) { querystring = querystring.replace("/heuristic/twitter", ""); modifier.add("/heuristic/twitter"); } @@ -723,16 +726,16 @@ public class yacysearch { (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); if ( startRecord == 0 ) { - if ( modifier.sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated ) { + if ( modifier.sitehost != null && sb.getConfigBool(SwitchboardConstants.HEURISTIC_SITE, false) && authenticated && !stealthmode) { sb.heuristicSite(theSearch, modifier.sitehost); } - if ( (heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated ) { + if ( (heuristicBlekko >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_BLEKKO, false)) && authenticated && !stealthmode ) { sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko"); } - if ( (heuristicTwitter >= 0 || sb.getConfigBool("heuristic.twitter", false)) && authenticated ) { + if ( (heuristicTwitter >= 0 || sb.getConfigBool(SwitchboardConstants.HEURISTIC_TWITTER, false)) && authenticated && !stealthmode ) { sb.heuristicRSS("http://search.twitter.com/search.rss?rpp=50&q=$", theSearch, "twitter"); } - if (sb.getConfigBool("heuristic.opensearch", false) && authenticated) { + if 
(sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) { OpenSearchConnector.query(sb, theSearch); } } diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index d0bd59d2f..275d2e99e 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -255,7 +255,17 @@ public class yacysearchitem { prop.put("content_loc_lat", result.lat()); prop.put("content_loc_lon", result.lon()); } - if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring); + final boolean clustersearch = sb.isRobinsonMode() && sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "").equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER); + final boolean indexReceiveGranted = + sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true) + || sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true) + || clustersearch; + boolean p2pmode = sb.peers != null && sb.peers.sizeConnected() > 0 && indexReceiveGranted; + boolean global = post == null || (post.get("resource", "local").equals("global") && p2pmode); + boolean stealthmode = p2pmode && !global; + if ((sb.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS, false) || + (sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) && sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ENABLED, false))) && + !stealthmode) sb.heuristicSearchResults(resultUrlstring); theSearch.query.transmitcount = item + 1; return prop; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 08e3745ca..e0eb5d88a 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1303,9 +1303,9 @@ public final class Switchboard extends serverSwitch { ResultURLs.clearStacks(); // remove heuristics - setConfig("heuristic.site", false); - setConfig("heuristic.blekko", false); - setConfig("heuristic.twitter", false); + 
setConfig(SwitchboardConstants.HEURISTIC_SITE, false); + setConfig(SwitchboardConstants.HEURISTIC_BLEKKO, false); + setConfig(SwitchboardConstants.HEURISTIC_TWITTER, false); // relocate this.peers.relocate( @@ -2041,6 +2041,15 @@ public final class Switchboard extends serverSwitch { setConfig("adminAccount", ""); } + // stop greedylearning if limit is reached + if (getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false)) { + long cs = this.index.fulltext().collectionSize(); + if (cs > getConfigInt(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, 0)) { + setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false); + log.logInfo("finishing greedy learning phase, size=" +cs); + } + } + // refresh recrawl dates try { CrawlProfile selentry; @@ -2265,6 +2274,7 @@ public final class Switchboard extends serverSwitch { // if no crawl is running and processing is activated: // execute the (post-) processing steps for all entries that have a process tag assigned if (this.crawlQueues.coreCrawlJobSize() == 0) { + if (this.crawlQueues.noticeURL.isEmpty()) this.crawlQueues.noticeURL.clear(); // flushes more caches index.fulltext().getDefaultConfiguration().postprocessing(index); index.fulltext().getWebgraphConfiguration().postprocessing(index); } @@ -3371,7 +3381,7 @@ public final class Switchboard extends serverSwitch { }.start(); } - public final void heuristicSearchResults(final String host) { + public final void heuristicSearchResults(final String url) { new Thread() { @Override @@ -3380,7 +3390,7 @@ public final class Switchboard extends serverSwitch { // get the links for a specific site final DigestURI startUrl; try { - startUrl = new DigestURI(host); + startUrl = new DigestURI(url); } catch (final MalformedURLException e) { Log.logException(e); return; @@ -3393,7 +3403,7 @@ public final class Switchboard extends serverSwitch { if (links != null) { if (links.size() < 1000) { // limit to 1000 to skip large index pages final Iterator i = 
links.keySet().iterator(); - final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal",false); + final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false); Collection urls = new ArrayList(); while (i.hasNext()) { url = i.next(); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 36868777c..0f2e251a8 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -498,8 +498,25 @@ public final class SwitchboardConstants { /** * system tray */ - public static final String TRAY_ICON_ENABLED = "tray.icon.enabled"; - public static final String TRAY_ICON_FORCED = "tray.icon.force"; - public static final String TRAY_ICON_LABEL = "tray.icon.label"; - public static final String TRAY_MENU_ENABLED = "tray.menu.enabled"; + public static final String TRAY_ICON_ENABLED = "tray.icon.enabled"; + public static final String TRAY_ICON_FORCED = "tray.icon.force"; + public static final String TRAY_ICON_LABEL = "tray.icon.label"; + public static final String TRAY_MENU_ENABLED = "tray.menu.enabled"; + + /* + * search heuristics + */ + public static final String HEURISTIC_SITE = "heuristic.site"; + public static final String HEURISTIC_SEARCHRESULTS = "heuristic.searchresults"; + public static final String HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL = "heuristic.searchresults.crawlglobal"; + public static final String HEURISTIC_BLEKKO = "heuristic.blekko"; + public static final String HEURISTIC_TWITTER = "heuristic.twitter"; + public static final String HEURISTIC_OPENSEARCH = "heuristic.opensearch"; + + /* + * automatic learning heuristic + */ + public static final String GREEDYLEARNING_ENABLED = "greedylearning.enabled"; + public static final String GREEDYLEARNING_LIMIT_DOCCOUNT = "greedylearning.limit.doccount"; + public static final String GREEDYLEARNING_ACTIVE = 
"greedylearning.active"; }