Added a configuration page for search heuristics. Currently you can switch on there:

- a site-operator heuristic that loads all direct links from a portal page if the site-operator is used
- a direct crawl for search results from scroogle for the given search terms
The configuration page can be found directly beside the network configuration page.


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6951 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 5d00888c95
commit 11b7853940

@ -941,3 +941,8 @@ segment.process.default_tmp = default
# this is only shown, if the about.body is filled
about.headline =
about.body =
# search heuristics
heuristic.site = false
heuristic.scroogle = false

@ -0,0 +1,72 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Heuristics Configuration</title>
#%env/templates/metas.template%#
</head>
<body id="ConfigNetwork">
#%env/templates/header.template%#
#%env/templates/submenuConfig.template%#
<h2>Heuristics Configuration</h2>
<p>
A <a href="http://en.wikipedia.org/wiki/Heuristic">heuristic</a> is an 'experience-based technique that helps in problem solving, learning and discovery' (wikipedia). The search heuristics that can be switched on here are techniques that help the discovery of possible search results based on link guessing, in-search crawling and requests to other search engines.
When a search heuristic is used, the resulting links are not used directly as search result but the loaded pages are indexed and stored like other content. This ensures that blacklists can be used and that the searched word actually appears on the page that was discovered by the heuristic.
</p>
<p>
<form><fieldset>
The success of a heuristic is marked with an image (<img width="16" height="9" src="/env/grafics/heuristic_redundant.gif" title="heuristic:&lt;name&gt; (redundant)" style="width:16px; height:9px;" alt="heuristic:&lt;name&gt; (redundant)"/>/<img width="16" height="9" src="/env/grafics/heuristic_new.gif" title="heuristic:&lt;name&gt; (new link)" style="width:16px; height:9px;" alt="heuristic:&lt;name&gt; (new link)"/>) below the favicon to the left of the search result entry:
<dl>
<dt>
<img width="16" height="9" src="/env/grafics/heuristic_redundant.gif" title="heuristic:&lt;name&gt; (redundant)" style="width:16px; height:9px;" alt="heuristic:&lt;name&gt; (redundant)"/>
</dt>
<dd>
The search result was discovered by a heuristic, but the link was already known by YaCy
</dd>
<dt>
<img width="16" height="9" src="/env/grafics/heuristic_new.gif" title="heuristic:&lt;name&gt; (new link)" style="width:16px; height:9px;" alt="heuristic:&lt;name&gt; (new link)"/>
</dt>
<dd>
The search result was discovered by a heuristic, not previously known by YaCy
</dd>
</dl></fieldset></form>
</p>
<form name="HeuristicForm" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
<input type="checkbox" name="site_check" id="site" onclick="window.location.href='ConfigHeuristics_p.html?#(site.checked)#site_on=::site_off=#(/site.checked)#'" value="site"#(site.checked)#:: checked="checked"#(/site.checked)# />
<label for="site">'site'-operator: instant shallow crawl</label>
</legend>
<p>
When a search is made using a 'site'-operator (like: 'download site:yacy.net') then the host of the site-operator is instantly crawled with a host-restricted depth-1 crawl.
That means: right after the search request the portal page of the host is loaded and every page that is linked on this page that points to a page on the same host.
Because this 'instant crawl' must obey the robots.txt and a minimum access time for two consecutive pages, this heuristic is rather slow, but may discover all wanted search results using a second search (after a small pause of some seconds).
</p>
#(site.checked)#
<input type="submit" name="site_on" value="Switch site-heuristic on" />
::
<input type="submit" name="site_off" value="Switch site-heuristic off" />
#(/site.checked)#
</fieldset>
</form>
<form name="HeuristicForm" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
<input type="checkbox" name="scroogle_check" id="scroogle" onclick="window.location.href='ConfigHeuristics_p.html?#(scroogle.checked)#scroogle_on=::scroogle_off=#(/scroogle.checked)#'" value="scroogle"#(scroogle.checked)#:: checked="checked"#(/scroogle.checked)# />
<label for="scroogle">scroogle: load external search result list</label>
</legend>
<p>
When this heuristic is used, every search request line is used for a call to scroogle.
20 results are taken from scroogle and loaded simultaneously, parsed and indexed immediately.
</p>
#(scroogle.checked)#
<input type="submit" name="scroogle_on" value="Switch scroogle-heuristic on" />
::
<input type="submit" name="scroogle_off" value="Switch scroogle-heuristic off" />
#(/scroogle.checked)#
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -0,0 +1,57 @@
// ConfigHeuristics_p.java
// --------------------
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.06.2010 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2010-02-09 18:14:16 +0100 (Di, 09 Feb 2010) $
// $LastChangedRevision: 6658 $
// $LastChangedBy: lotus $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import de.anomic.data.WorkTables;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet class behind the ConfigHeuristics_p.html page.
 *
 * It switches the two search heuristics ("site" and "scroogle") on or off
 * according to the submitted form fields and reports the current state of
 * both settings back to the page template.
 */
public class ConfigHeuristics_p {

    /**
     * Handles one request to the heuristics configuration page.
     *
     * @param header the request header (not evaluated here)
     * @param post   the submitted parameters, or {@code null} if the page was
     *               requested without any; the keys {@code site_on},
     *               {@code site_off}, {@code scroogle_on} and
     *               {@code scroogle_off} trigger the corresponding change
     * @param env    the server environment; it is cast to {@link Switchboard}
     * @return the template properties {@code site.checked} and
     *         {@code scroogle.checked}, each 1 if the heuristic is enabled
     *         and 0 otherwise
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final Switchboard sb = (Switchboard) env;
        final serverObjects prop = new serverObjects();

        if (post != null) {
            // store this call as api call; the recorded page name has to be the
            // name this servlet is actually served under ("ConfigHeuristics_p.html"),
            // the former value "ConfigHeuristics.html" named a page that does not exist
            sb.tables.recordAPICall(post, "ConfigHeuristics_p.html", WorkTables.TABLE_API_TYPE_CONFIGURATION, "heuristic settings");

            // the "off" keys are evaluated after the "on" keys, so "off" wins
            // if a request carries both keys for the same heuristic
            if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
            if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
            if (post.containsKey("scroogle_on")) sb.setConfig("heuristic.scroogle", true);
            if (post.containsKey("scroogle_off")) sb.setConfig("heuristic.scroogle", false);
        }

        // report the (possibly just updated) settings; both default to "off"
        prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
        prop.put("scroogle.checked", sb.getConfigBool("heuristic.scroogle", false) ? 1 : 0);

        return prop;
    }
}

@ -5,6 +5,7 @@
<li><a href="/ConfigBasic.html" class="MenuItemLink lock">Basic Configuration</a></li>
<li><a href="/ConfigAccounts_p.html" class="MenuItemLink lock">Accounts</a></li>
<li><a href="/ConfigNetwork_p.html" class="MenuItemLink lock">Network Configuration</a></li>
<li><a href="/ConfigHeuristics_p.html" class="MenuItemLink lock">Heuristics</a></li>
<li><a href="/DictionaryLoader_p.html" class="MenuItemLink lock">Dictionary Loader</a></li>
<li><a href="/ConfigUpdate_p.html" class="MenuItemLink lock">System Update</a></li>
</ul>

@ -509,8 +509,8 @@ public class yacysearch {
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
if (sitehost != null && authenticated) sb.heuristicSite(theSearch, sitehost);
if (heuristic >= 0 && authenticated) sb.heuristicScroogle(theSearch);
if (sitehost != null && sb.getConfigBool("heuristic.site", false)) sb.heuristicSite(theSearch, sitehost);
if ((heuristic >= 0 && authenticated) || sb.getConfigBool("heuristic.scroogle", false)) sb.heuristicScroogle(theSearch);
// generate result object
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");

@ -5,8 +5,8 @@
<a href="#[link]#">#[title]#</a></h4>
<div class="urlactions">
#(heuristic)#::
<img width="16" height="9" src="/env/grafics/heuristic_redundant.gif" title="heuristic:#[name]# (redundant)" style="width:16px; height:9px;" alt="heuristic#[name]# (redundant)"/>::
<img width="16" height="9" src="/env/grafics/heuristic_new.gif" title="heuristic:#[name]# (new link)" style="width:16px; height:9px;" alt="heuristic#[name]# (new link)"/>
<img width="16" height="9" src="/env/grafics/heuristic_redundant.gif" title="heuristic:#[name]# (redundant)" style="width:16px; height:9px;" alt="heuristic:#[name]# (redundant)"/>::
<img width="16" height="9" src="/env/grafics/heuristic_new.gif" title="heuristic:#[name]# (new link)" style="width:16px; height:9px;" alt="heuristic:#[name]# (new link)"/>
#(/heuristic)#
#(authorized)#::
<a href="/Bookmarks.html?edit=#[urlhash]#" class="bookmarklink" title="bookmark"><img width="11" height="11" src="/env/grafics/empty.gif" title="bookmark" alt="bookmark" class="recommendIcon" /></a>

@ -117,17 +117,6 @@ public class yacysearchitem {
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", new String(result.hash()));
SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash());
if (heuristic == null) {
prop.put("content_heuristic", 0);
} else {
if (heuristic.redundant) {
prop.put("content_heuristic", 1);
} else {
prop.put("content_heuristic", 2);
}
prop.put("content_heuristic_name", heuristic.heuristicName);
}
String resulthashString = new String(result.hash());
prop.putHTML("content_title", result.title());
prop.putXML("content_title-xml", result.title());
@ -160,6 +149,17 @@ public class yacysearchitem {
prop.put("content_description", desc);
prop.putXML("content_description-xml", desc);
prop.putJSON("content_description-json", desc);
SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash());
if (heuristic == null) {
prop.put("content_heuristic", 0);
} else {
if (heuristic.redundant) {
prop.put("content_heuristic", 1);
} else {
prop.put("content_heuristic", 2);
}
prop.put("content_heuristic_name", heuristic.heuristicName);
}
EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(theQuery.id(true), SearchEvent.FINALIZATION + "-" + item, 0, 0), false, 30000, ProfilingGraph.maxTime);
return prop;

@ -117,7 +117,6 @@ public class ReferenceOrder {
private final BlockingQueue<WordReferenceVars> decodedEntries;
public NormalizeWorker(final BlockingQueue<WordReferenceVars> out, Semaphore termination) {
// normalize ranking: find minimum and maximum of separate ranking criteria
this.out = out;
this.termination = termination;
this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
@ -131,15 +130,34 @@ public class ReferenceOrder {
}
/**
 * Worker body: passes the worker's decodedEntries queue and its out queue to
 * addNormalizer and afterwards takes part in the termination handshake.
 * Any exception raised by addNormalizer is logged and not propagated.
 */
public void run() {
try {
addNormalizer(decodedEntries, out);
} catch (InterruptedException e) {
Log.logException(e);
} catch (Exception e) {
Log.logException(e);
} finally {
// insert poison to signal the termination to next queue
// this block runs on success as well as after a logged exception
try {
// take one permit from the termination semaphore; the poison entry is
// only put into the out queue when no permit is left after that
this.termination.acquire();
if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison);
// an interrupt while acquiring or putting is ignored silently;
// no poison is written by this worker in that case
} catch (InterruptedException e) {}
}
}
}
Map<String, Integer> doms0 = new HashMap<String, Integer>();
Integer int1 = 1;
/**
* normalize ranking: find minimum and maximum of separate ranking criteria
* @param decodedEntries
* @param out
* @throws InterruptedException
*/
public void addNormalizer(BlockingQueue<WordReferenceVars> decodedEntries, final BlockingQueue<WordReferenceVars> out) throws InterruptedException {
WordReferenceVars iEntry;
Map<String, Integer> doms0 = new HashMap<String, Integer>();
String dom;
Integer count;
try {
// calculate min and max for normalization
final Integer int1 = 1;
while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) {
out.put(iEntry);
// find min/max
@ -162,19 +180,21 @@ public class ReferenceOrder {
entry = di.next();
doms.addScore(entry.getKey(), (entry.getValue()).intValue());
}
if (!doms.isEmpty()) maxdomcount = doms.getMaxScore();
} catch (InterruptedException e) {
Log.logException(e);
} catch (Exception e) {
Log.logException(e);
} finally {
// insert poison to signal the termination to next queue
try {
this.termination.acquire();
if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison);
} catch (InterruptedException e) {}
}
if (!doms.isEmpty()) this.maxdomcount = doms.getMaxScore();
}
/**
 * Forwards a single entry to the output queue and folds it into the
 * normalization statistics (minimum, maximum and domain counts).
 *
 * @param iEntry the entry to forward and to account for
 * @param out    the queue that receives the entry
 * @throws InterruptedException if putting the entry into the queue is interrupted
 */
public void addNormalizer(WordReferenceVars iEntry, final BlockingQueue<WordReferenceVars> out) throws InterruptedException {
    // hand the entry on before any statistics are touched
    out.put(iEntry);

    // running minimum: the first entry is cloned, later ones are merged in
    if (min != null) {
        min.min(iEntry);
    } else {
        min = iEntry.clone();
    }

    // running maximum, handled the same way
    if (max != null) {
        max.max(iEntry);
    } else {
        max = iEntry.clone();
    }

    // count the entry under the part of its metadata hash that starts at index 6
    // (presumably the host part of the url hash - confirm against the hash layout)
    final String hashText = new String(iEntry.metadataHash());
    final String domainKey = hashText.substring(6);
    doms.addScore(domainKey, 1);

    // keep the cached maximum domain count in sync
    if (!doms.isEmpty()) {
        this.maxdomcount = doms.getMaxScore();
    }
}
public int authority(final byte[] urlHash) {

@ -319,7 +319,7 @@ public final class SearchEvent {
return this.rankedCache.getAuthorNavigator(maxentries);
}
public void addHeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) {
public void addHeuristic(byte[] urlhash, String heuristicName, boolean redundant) {
synchronized (this.heuristics) {
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
}

@ -1930,7 +1930,7 @@ public final class Switchboard extends serverSwitch {
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, ParserException {
final Segments.Process process = Segments.Process.LOCALCRAWLING;
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) {
searchEvent.addHeuristicResult(url.hash(), heuristicName, true);
searchEvent.addHeuristic(url.hash(), heuristicName, true);
return; // don't do double-work
}
final Request request = loader.request(url, true, true);
@ -1939,9 +1939,9 @@ public final class Switchboard extends serverSwitch {
log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
return;
}
searchEvent.addHeuristic(url.hash(), heuristicName, false);
new Thread() {public void run() {
try {
searchEvent.addHeuristicResult(url.hash(), heuristicName, false);
Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
if (response == null) throw new IOException("response == null");
if (response.getContent() == null) throw new IOException("content == null");

Loading…
Cancel
Save