Merge branch 'master' of git://gitorious.org/~reger/yacy/bbyacy-rc1

pull/1/head
Michael Peter Christen 13 years ago
commit c18fa9fa75

@ -1015,6 +1015,8 @@ about.body =
# search heuristics
heuristic.site = false
heuristic.blekko = false
heuristic.searchresults = false
heuristic.searchresults.crawlglobal = false
# colours for generic design
color_background = #FFFFFF

@ -43,6 +43,34 @@
</p>
</fieldset>
</form>
<!-- search-result heuristic: clicking a checkbox reloads this page with the matching _on / _off parameter, which the servlet stores as configuration -->
<form id="HeuristicFormSearchResult" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<table>
<tr>
<td>
<legend>
<input type="checkbox" name="searchresult_check" id="searchresult" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresult.checked)#searchresult_on=::searchresult_off=#(/searchresult.checked)#'" value="searchresult"#(searchresult.checked)#:: checked="checked"#(/searchresult.checked)# />
<label for="searchresult">search-result: shallow crawl on all displayed search results</label>
</legend>
</td>
<td>
<legend>
<input type="checkbox" name="searchresultglobal_check" id="searchresultglobal" onclick="window.location.href='ConfigHeuristics_p.html?#(searchresultglobal.checked)#searchresultglobal_on=::searchresultglobal_off=#(/searchresultglobal.checked)#'" value="searchresultglobal"#(searchresultglobal.checked)#:: checked="checked"#(/searchresultglobal.checked)# />
<label for="searchresultglobal">add as global crawl job</label>
</legend>
</td>
</tr>
</table>
<p>
When a search is made then all displayed result links are crawled with a depth-1 crawl.
This means: right after the search request every page is loaded and every page that is linked on this page.
If you check 'add as global crawl job' the pages to be crawled are added to the global crawl queue (remote peers can pickup pages to be crawled).
Default is to add the links to the local crawl queue (your peer crawls the linked pages).
</p>
</fieldset>
</form>
<form id="HeuristicFormBlekko" method="post" action="ConfigHeuristics_p.html" enctype="multipart/form-data" accept-charset="UTF-8">
<fieldset>
<legend>
@ -56,7 +84,6 @@
</fieldset>
</form>
#%env/templates/footer.template%#
</body>
</html>

@ -45,11 +45,17 @@ public class ConfigHeuristics_p {
if (post.containsKey("site_on")) sb.setConfig("heuristic.site", true);
if (post.containsKey("site_off")) sb.setConfig("heuristic.site", false);
if (post.containsKey("searchresult_on")) sb.setConfig("heuristic.searchresults", true);
if (post.containsKey("searchresult_off")) sb.setConfig("heuristic.searchresults", false);
if (post.containsKey("searchresultglobal_on")) sb.setConfig("heuristic.searchresults.crawlglobal", true);
if (post.containsKey("searchresultglobal_off")) sb.setConfig("heuristic.searchresults.crawlglobal", false);
if (post.containsKey("blekko_on")) sb.setConfig("heuristic.blekko", true);
if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false);
}
prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0);
prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0);
prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0);
prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0);
return prop;

@ -248,6 +248,7 @@ public class yacysearchitem {
prop.put("content_loc_lat", result.lat());
prop.put("content_loc_lon", result.lon());
}
if (sb.getConfigBool("heuristic.searchresults",false)) sb.heuristicSearchResults(resultUrlstring);
theQuery.transmitcount = item + 1;
return prop;
}

@ -160,6 +160,7 @@ import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.CrawlStacker;
import de.anomic.crawler.CrawlSwitchboard;
import de.anomic.crawler.NoticedURL;
import de.anomic.crawler.NoticedURL.StackType;
import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
@ -2791,6 +2792,44 @@ public final class Switchboard extends serverSwitch
}.start();
}
/**
 * Enqueue a url on one of the crawler's notice stacks instead of indexing it directly.
 * This is an alternative to "addToIndex": the url passes the crawl stacker's acceptance
 * check and is then pushed to a crawl queue, but it does not deliver results to an
 * ongoing search.
 * <p>
 * Nothing happens when the index already holds an entry for the url hash. A rejection
 * by the acceptance check or by the queue push is only reported in the info log.
 *
 * @param url the url that shall be indexed
 * @param asglobal true pushes the url to the GLOBAL stack (for remote crawling),
 *        false pushes it to the LOCAL stack
 */
public void addToCrawler(final DigestURI url, final boolean asglobal) {
    // already known to the index: avoid double work
    if (this.index.exists(url.hash())) return;

    final Request crawlRequest = this.loader.request(url, true, true);
    final CrawlProfile crawlProfile = sb.crawler.getActive(ASCII.getBytes(crawlRequest.profileHandle()));

    // a non-null answer is the reason why the url is not accepted
    final String rejectReason = this.crawlStacker.checkAcceptance(url, crawlProfile, 0);
    if (rejectReason != null) {
        this.log.logInfo("addToCrawler: cannot load " + url.toNormalform(false, false) + ": " + rejectReason);
        return;
    }

    final StackType targetStack = asglobal ? StackType.GLOBAL : StackType.LOCAL;
    final String pushError = sb.crawlQueues.noticeURL.push(targetStack, crawlRequest);
    if (pushError == null) return;

    this.log.logInfo("addToCrawler: failed to add " + url.toNormalform(false, false) + ": " + pushError);
}
public class receiptSending implements Runnable
{
private final Seed initiatorPeer;
@ -3155,6 +3194,46 @@ public final class Switchboard extends serverSwitch
}.start();
}
/**
 * Search-result heuristic: load the links of one result page and hand every link that
 * points to a different host to the crawler.
 * <p>
 * The work runs in a newly started thread, so the call returns immediately. Pages with
 * 1000 or more links are ignored to skip large index pages. Whether links go to the
 * global or the local crawl queue is read from the configuration key
 * "heuristic.searchresults.crawlglobal". Failures are logged and not propagated.
 *
 * @param host the complete url string of the search result (despite the name, it is
 *        parsed as a url, not as a bare host name)
 */
public final void heuristicSearchResults(final String host) {
    new Thread() {
        @Override
        public void run() {
            // the result page whose links shall be followed
            final DigestURI startUrl;
            try {
                startUrl = new DigestURI(host);
            } catch (final MalformedURLException e) {
                Log.logException(e);
                return;
            }
            try {
                final Map<MultiProtocolURI, String> links =
                    Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH);
                // limit to 1000 to skip large index pages
                if (links == null || links.size() >= 1000) {
                    return;
                }
                final boolean globalcrawljob = sb.getConfigBool("heuristic.searchresults.crawlglobal", false);
                final String startHost = startUrl.getHost();
                for (final MultiProtocolURI link : links.keySet()) {
                    final DigestURI url = new DigestURI(link);
                    final String linkHost = url.getHost();
                    // null-safe comparison: a url without a host must not abort
                    // the processing of all remaining links with an exception
                    final boolean islocal = (startHost == null)
                        ? linkHost == null
                        : startHost.equals(linkHost);
                    // only links that leave the host of the result page are crawled
                    if (!islocal) {
                        addToCrawler(url, globalcrawljob);
                    }
                }
            } catch (final Throwable e) {
                // best-effort heuristic: report the problem and give up on this page
                Log.logException(e);
            }
        }
    }.start();
}
// blekko pattern: http://blekko.com/ws/$+/rss
public final void heuristicRSS(
final String urlpattern,

Loading…
Cancel
Save