diff --git a/defaults/yacy.init b/defaults/yacy.init index 86ac2d0a9..138a65991 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -941,3 +941,8 @@ segment.process.default_tmp = default # this is only shown, if the about.body is filled about.headline = about.body = + +# search heuristics +heuristic.site = false +heuristic.scroogle = false + diff --git a/htroot/ConfigHeuristics_p.html b/htroot/ConfigHeuristics_p.html new file mode 100644 index 000000000..3a42dc786 --- /dev/null +++ b/htroot/ConfigHeuristics_p.html @@ -0,0 +1,72 @@ + + + + YaCy '#[clientname]#': Network Configuration + #%env/templates/metas.template%# + + + #%env/templates/header.template%# + #%env/templates/submenuConfig.template%# +

Heuristics Configuration

+

+ A heuristic is an 'experience-based technique that helps in problem solving, learning and discovery' (wikipedia). The search heuristics that can be switched on here are techniques that help the discovery of possible search results based on link guessing, in-search crawling and requests to other search engines. + When a search heuristic is used, the resulting links are not used directly as search results but the loaded pages are indexed and stored like other content. This ensures that blacklists can be used and that the searched word actually appears on the page that was discovered by the heuristic.

+

+

+ The success of heuristics is marked with an image (heuristic:<name> (redundant)/heuristic:<name> (new link)) below the favicon to the left of the search result entry:
+
+ heuristic:<name> (redundant) +
+
+ The search result was discovered by a heuristic, but the link was already known by YaCy +
+
+ heuristic:<name> (new link) +
+
+ The search result was discovered by a heuristic, not previously known by YaCy +
+
+

+
+
+ + + + +

+ When a search is made using a 'site'-operator (like: 'download site:yacy.net') then the host of the site-operator is instantly crawled with a host-restricted depth-1 crawl. + That means: right after the search request, the portal page of the host is loaded, together with every page linked from that page that points to a page on the same host. + Because this 'instant crawl' must obey the robots.txt and a minimum access time for two consecutive pages, this heuristic is rather slow, but may discover all wanted search results using a second search (after a small pause of some seconds).

+ #(site.checked)# + + :: + + #(/site.checked)# +
+
+
+
+ + + + +

+ When using this heuristic, every search request line is used for a call to scroogle. + 20 results are taken from scroogle and loaded simultaneously, parsed and indexed immediately.

+ #(scroogle.checked)# + + :: + + #(/scroogle.checked)# +
+
+ + + #%env/templates/footer.template%# + + \ No newline at end of file diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java new file mode 100644 index 000000000..a2358c3e4 --- /dev/null +++ b/htroot/ConfigHeuristics_p.java @@ -0,0 +1,57 @@ +// ConfigHeuristics_p.java +// -------------------- +// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 26.06.2010 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2010-02-09 18:14:16 +0100 (Di, 09 Feb 2010) $ +// $LastChangedRevision: 6658 $ +// $LastChangedBy: lotus $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import de.anomic.data.WorkTables; +import de.anomic.http.server.RequestHeader; +import de.anomic.search.Switchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +public class ConfigHeuristics_p { + + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + + final Switchboard sb = (Switchboard) env; + final serverObjects prop = new serverObjects(); + + if (post != null) { + + // store this call as api call + sb.tables.recordAPICall(post, "ConfigHeuristics.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "heuristic settings"); + + if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true); + if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false); + if (post.containsKey("scroogle_on")) sb.setConfig("heuristic.scroogle", true); + if (post.containsKey("scroogle_off")) sb.setConfig("heuristic.scroogle", false); + } + + prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0); + prop.put("scroogle.checked", sb.getConfigBool("heuristic.scroogle", false) ? 1 : 0); + + return prop; + } +} diff --git a/htroot/env/templates/submenuConfig.template b/htroot/env/templates/submenuConfig.template index 384d13024..5fe7a00f9 100644 --- a/htroot/env/templates/submenuConfig.template +++ b/htroot/env/templates/submenuConfig.template @@ -5,6 +5,7 @@
  • Basic Configuration
  • Accounts
  • Network Configuration
  • +
  • Heuristics
  • Dictionary Loader
  • System Update
  • diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 4eb3309ac..d3e2a13c3 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -509,8 +509,8 @@ public class yacysearch { final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader); try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search - if (sitehost != null && authenticated) sb.heuristicSite(theSearch, sitehost); - if (heuristic >= 0 && authenticated) sb.heuristicScroogle(theSearch); + if (sitehost != null && sb.getConfigBool("heuristic.site", false)) sb.heuristicSite(theSearch, sitehost); + if ((heuristic >= 0 && authenticated) || sb.getConfigBool("heuristic.scroogle", false)) sb.heuristicScroogle(theSearch); // generate result object //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms"); diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html index 173c6f0d7..7d5eb0403 100644 --- a/htroot/yacysearchitem.html +++ b/htroot/yacysearchitem.html @@ -5,8 +5,8 @@ #[title]#
    #(heuristic)#:: - heuristic#[name]# (redundant):: - heuristic#[name]# (new link) + heuristic:#[name]# (redundant):: + heuristic:#[name]# (new link) #(/heuristic)# #(authorized)#:: bookmark diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index a7c606499..035f8d47b 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -117,17 +117,6 @@ public class yacysearchitem { prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*"); prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*"); prop.put("content_authorized_urlhash", new String(result.hash())); - SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash()); - if (heuristic == null) { - prop.put("content_heuristic", 0); - } else { - if (heuristic.redundant) { - prop.put("content_heuristic", 1); - } else { - prop.put("content_heuristic", 2); - } - prop.put("content_heuristic_name", heuristic.heuristicName); - } String resulthashString = new String(result.hash()); prop.putHTML("content_title", result.title()); prop.putXML("content_title-xml", result.title()); @@ -160,6 +149,17 @@ public class yacysearchitem { prop.put("content_description", desc); prop.putXML("content_description-xml", desc); prop.putJSON("content_description-json", desc); + SearchEvent.HeuristicResult 
heuristic = theSearch.getHeuristic(result.hash()); + if (heuristic == null) { + prop.put("content_heuristic", 0); + } else { + if (heuristic.redundant) { + prop.put("content_heuristic", 1); + } else { + prop.put("content_heuristic", 2); + } + prop.put("content_heuristic_name", heuristic.heuristicName); + } EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.FINALIZATION + "-" + item, 0, 0), false, 30000, ProfilingGraph.maxTime); return prop; diff --git a/source/de/anomic/search/ReferenceOrder.java b/source/de/anomic/search/ReferenceOrder.java index 16b791144..ef9ea2f25 100644 --- a/source/de/anomic/search/ReferenceOrder.java +++ b/source/de/anomic/search/ReferenceOrder.java @@ -117,7 +117,6 @@ public class ReferenceOrder { private final BlockingQueue decodedEntries; public NormalizeWorker(final BlockingQueue out, Semaphore termination) { - // normalize ranking: find minimum and maximum of separate ranking criteria this.out = out; this.termination = termination; this.decodedEntries = new LinkedBlockingQueue(); @@ -131,38 +130,8 @@ public class ReferenceOrder { } public void run() { - - Map doms0 = new HashMap(); - Integer int1 = 1; - - WordReferenceVars iEntry; - String dom; - Integer count; try { - // calculate min and max for normalization - while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) { - out.put(iEntry); - // find min/max - if (min == null) min = iEntry.clone(); else min.min(iEntry); - if (max == null) max = iEntry.clone(); else max.max(iEntry); - // update domcount - dom = new String(iEntry.metadataHash()).substring(6); - count = doms0.get(dom); - if (count == null) { - doms0.put(dom, int1); - } else { - doms0.put(dom, Integer.valueOf(count.intValue() + 1)); - } - } - - // update domain score - Map.Entry entry; - final Iterator> di = doms0.entrySet().iterator(); - while (di.hasNext()) { - entry = di.next(); - doms.addScore(entry.getKey(), (entry.getValue()).intValue()); - } - if (!doms.isEmpty()) 
maxdomcount = doms.getMaxScore(); + addNormalizer(decodedEntries, out); } catch (InterruptedException e) { Log.logException(e); } catch (Exception e) { @@ -177,6 +146,57 @@ public class ReferenceOrder { } } + /** + * normalize ranking: find minimum and maximum of separate ranking criteria + * @param decodedEntries + * @param out + * @throws InterruptedException + */ + public void addNormalizer(BlockingQueue decodedEntries, final BlockingQueue out) throws InterruptedException { + WordReferenceVars iEntry; + Map doms0 = new HashMap(); + String dom; + Integer count; + final Integer int1 = 1; + while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) { + out.put(iEntry); + // find min/max + if (min == null) min = iEntry.clone(); else min.min(iEntry); + if (max == null) max = iEntry.clone(); else max.max(iEntry); + // update domcount + dom = new String(iEntry.metadataHash()).substring(6); + count = doms0.get(dom); + if (count == null) { + doms0.put(dom, int1); + } else { + doms0.put(dom, Integer.valueOf(count.intValue() + 1)); + } + } + + // update domain score + Map.Entry entry; + final Iterator> di = doms0.entrySet().iterator(); + while (di.hasNext()) { + entry = di.next(); + doms.addScore(entry.getKey(), (entry.getValue()).intValue()); + } + if (!doms.isEmpty()) this.maxdomcount = doms.getMaxScore(); + } + + public void addNormalizer(WordReferenceVars iEntry, final BlockingQueue out) throws InterruptedException { + out.put(iEntry); + + // find min/max + if (min == null) min = iEntry.clone(); else min.min(iEntry); + if (max == null) max = iEntry.clone(); else max.max(iEntry); + + // update domcount + String dom = new String(iEntry.metadataHash()).substring(6); + doms.addScore(dom, 1); + + if (!doms.isEmpty()) this.maxdomcount = doms.getMaxScore(); + } + public int authority(final byte[] urlHash) { return (doms.getScore(new String(urlHash, 6, 6)) << 8) / (1 + this.maxdomcount); } diff --git a/source/de/anomic/search/SearchEvent.java 
b/source/de/anomic/search/SearchEvent.java index d58b3f26e..7816611ae 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -319,7 +319,7 @@ public final class SearchEvent { return this.rankedCache.getAuthorNavigator(maxentries); } - public void addHeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) { + public void addHeuristic(byte[] urlhash, String heuristicName, boolean redundant) { synchronized (this.heuristics) { this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant)); } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 26c4b15ee..9383c0b1e 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1930,7 +1930,7 @@ public final class Switchboard extends serverSwitch { public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, ParserException { final Segments.Process process = Segments.Process.LOCALCRAWLING; if (indexSegments.segment(process).urlMetadata.exists(url.hash())) { - searchEvent.addHeuristicResult(url.hash(), heuristicName, true); + searchEvent.addHeuristic(url.hash(), heuristicName, true); return; // don't do double-work } final Request request = loader.request(url, true, true); @@ -1939,9 +1939,9 @@ public final class Switchboard extends serverSwitch { log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError); return; } + searchEvent.addHeuristic(url.hash(), heuristicName, false); new Thread() {public void run() { try { - searchEvent.addHeuristicResult(url.hash(), heuristicName, false); Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE); if (response == null) throw new IOException("response == null"); if (response.getContent() == null) throw new IOException("content == null");