Added a 'transition feature' that shall lower the barrier for moving from g**gle to YaCy (yes!):

This introduces a new concept called 'search heuristics'. In IT, a heuristic is a kind of 'shortcut' to good results, here to good search results. In this case it is used to provide a very transparent way to compare the search results that YaCy is able to produce with the search results that g**gle produces. Here is what you can do now:
- add the phrase 'heuristic:scroogle' to your search query, like 'oil spill heuristic:scroogle'; a call to Scroogle is then made to fetch anonymous search results from g**gle.
- these results are _not_ used as meta-search results; instead they instantly feed a crawling and indexing process. This happens very fast: 20 results from Scroogle are taken, loaded all simultaneously, parsed and indexed immediately, and the parsed content is fed into the search result alongside the normal p2p search.
- when new results from such a heuristic (more to come) become part of the search results, it is verified whether they are redundant to existing results (i.e. they would have been part of the normal YaCy search result anyway) or completely new to YaCy.
- in the result page, new search results from heuristics are marked with 'H ++', and heuristic results that had already been found by YaCy are marked with 'H ='. That means:
- you now see YaCy and Scroogle search results on one result page, and you also see that you would not have 'missed' the g**gle results if you had used only YaCy.

- to make it short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing. (A minimal sketch of the query handling follows below.)
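
As an illustration, here is a minimal, self-contained sketch of the query-side handling: the 'heuristic:scroogle' modifier is detected and stripped from the query string, and its presence decides whether the Scroogle heuristic is triggered. The detection mirrors the yacysearch.java hunk in this commit; the demo class, its input string and the printed output are illustrative only and not part of the commit.

// Minimal sketch (illustrative only): detect and strip the 'heuristic:scroogle'
// modifier from a query string, as done in yacysearch.java below.
public class HeuristicModifierDemo {
    public static void main(String[] args) {
        String querystring = "oil spill heuristic:scroogle";

        // remember whether the modifier was present, then remove it so that
        // 'heuristic:scroogle' does not become a search term itself
        int heuristic = querystring.indexOf("heuristic:scroogle");
        if (heuristic >= 0) {
            querystring = querystring.replace("heuristic:scroogle", "");
        }

        System.out.println("query to search for: '" + querystring.trim() + "'");
        System.out.println("scroogle heuristic requested: " + (heuristic >= 0));

        // in YaCy itself this flag (together with authentication) decides whether
        // sb.heuristicScroogle(theSearch) is called after the p2p search has been
        // started; that method loads the Scroogle result page, extracts its links
        // and feeds them into the local indexing process (see the Switchboard.java
        // hunks in this commit).
    }
}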

To come: a configuration page that lets you configure the usage of heuristics and enable this feature by default.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent d5d48b8dc7
commit dcd01698b4

@ -376,7 +376,6 @@ public class Crawler_p {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
if (nexturl == null) continue;
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(new Request(

@ -490,9 +490,9 @@ a:hover.MenuItemLink {
div.urlactions a {
display:block;
width: 12px;
height: 12px;
margin: 2px 0px;
width: 11px;
height: 11px;
margin: 0px 0px 0px 3px;
}
a.bookmarklink:hover, div.searchresults:hover a.bookmarklink, div.searchresults.hover a.bookmarklink {

Binary file not shown (new image, 127 B).

Binary file not shown (new image, 126 B).

@ -332,6 +332,12 @@ public class yacysearch {
while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1);
sitehash = DigestURI.domhash(sitehost);
}
int heuristic = querystring.indexOf("heuristic:scroogle");
if (heuristic >= 0) {
querystring = querystring.replace("heuristic:scroogle", "");
}
int authori = querystring.indexOf("author:");
String authorhash = null;
if (authori >= 0) {
@ -503,7 +509,9 @@ public class yacysearch {
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch);
if (sitehost != null && authenticated) sb.heuristicSite(theSearch, sitehost);
if (heuristic >= 0 && authenticated) sb.heuristicScroogle(theSearch);
// generate result object
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms");
//serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms");

@ -1,20 +1,24 @@
#(content)#::
<div class="searchresults">
<h4 class="linktitle">
<img src="ViewImage.png?width=16&amp;height=16&amp;code=#[faviconCode]#" id="f#[urlhash]#" class="favicon" style="width:16px; height:16px;" alt="" />
<img width="16" height="16" src="ViewImage.png?width=16&amp;height=16&amp;code=#[faviconCode]#" id="f#[urlhash]#" class="favicon" style="width:16px; height:16px;" alt="" />
<a href="#[link]#">#[title]#</a></h4>
#(authorized)#::
<div class="urlactions">
<a href="/Bookmarks.html?edit=#[urlhash]#" class="bookmarklink" title="bookmark"><img src="/env/grafics/empty.gif" title="bookmark" alt="bookmark" class="bookmarkIcon" /></a>
<div class="urlactions">
#(heuristic)#::
<img width="16" height="9" src="/env/grafics/heuristic_redundant.gif" title="heuristic:#[name]# (redundant)" style="width:16px; height:9px;" alt="heuristic#[name]# (redundant)"/>::
<img width="16" height="9" src="/env/grafics/heuristic_new.gif" title="heuristic:#[name]# (new link)" style="width:16px; height:9px;" alt="heuristic#[name]# (new link)"/>
#(/heuristic)#
#(authorized)#::
<a href="/Bookmarks.html?edit=#[urlhash]#" class="bookmarklink" title="bookmark"><img width="11" height="11" src="/env/grafics/empty.gif" title="bookmark" alt="bookmark" class="recommendIcon" /></a>
#(recommend)#
<img src="/env/grafics/empty.gif" title="" alt="recommend" class="recommendIcon" />
<img src="/env/grafics/empty.gif" title="" alt="delete" class="deleteIcon" />
<img width="11" height="11" src="/env/grafics/empty.gif" title="" alt="recommend" class="recommendIcon" />
<img width="11" height="11" src="/env/grafics/empty.gif" title="" alt="delete" class="deleteIcon" />
::
<a href="#[recommendlink]#" class="recommendlink" title="recommend"><img src="/env/grafics/empty.gif" title="recommend" alt="recommend" class="recommendIcon" /></a>
<a href="#[deletelink]#" title="delete" class="deletelink" ><img src="/env/grafics/empty.gif" title="delete" alt="delete" class="deleteIcon" /></a>
<a href="#[recommendlink]#" class="recommendlink" title="recommend"><img width="11" height="11" src="/env/grafics/empty.gif" title="recommend" alt="recommend" class="recommendIcon" /></a>
<a href="#[deletelink]#" title="delete" class="deletelink" ><img width="11" height="11" src="/env/grafics/empty.gif" title="delete" alt="delete" class="deleteIcon" /></a>
#(/recommend)#
</div>
#(/authorized)#
#(/authorized)#
</div>
<p class="snippet"><span class="snippetLoaded" id="h#[urlhash]#">#[description]#</span></p>
<p class="url"><a href="#[link]#" id="url#[urlhash]#">#[urlname]#</a></p>
<p class="urlinfo">#[date]# | #[sizename]# | <a href="api/yacydoc.html?urlhash=#[urlhash]#" onclick="return hs.htmlExpand(this, { objectType: 'ajax'} )">Metadata</a> | <a href="ViewFile.html?urlHash=#[urlhash]#&amp;words=#[words]#&amp;display=#[display]#">Parser</a> | <a href="yacysearch.html?cat=image&amp;url=#[link]#&amp;query=#[former]#&amp;display=#[display]#">Pictures</a></p>

@ -117,6 +117,17 @@ public class yacysearchitem {
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", new String(result.hash()));
SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash());
if (heuristic == null) {
prop.put("content_heuristic", 0);
} else {
if (heuristic.redundant) {
prop.put("content_heuristic", 1);
} else {
prop.put("content_heuristic", 2);
}
prop.put("content_heuristic_name", heuristic.heuristicName);
}
String resulthashString = new String(result.hash());
prop.putHTML("content_title", result.title());
prop.putXML("content_title-xml", result.title());

@ -49,13 +49,13 @@ public final class CrawlStacker {
private final Log log = new Log("STACKCRAWL");
private final WorkflowProcessor<Request> fastQueue, slowQueue;
//private long dnsHit;
private long dnsMiss;
private final CrawlQueues nextQueue;
private final CrawlSwitchboard crawler;
private final Segment indexSegment;
private final yacySeedDB peers;
private final boolean acceptLocalURLs, acceptGlobalURLs;
//private long dnsHit;
private long dnsMiss;
private final CrawlQueues nextQueue;
private final CrawlSwitchboard crawler;
private final Segment indexSegment;
private final yacySeedDB peers;
private final boolean acceptLocalURLs, acceptGlobalURLs;
// this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
@ -178,96 +178,142 @@ public final class CrawlStacker {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle());
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
log.logWarning(error);
return error;
}
error = checkAcceptance(entry.url(), profile, entry.depth());
if (error != null) return error;
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
final long startTime = System.currentTimeMillis();
// add domain to profile domain list
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
}
// store information
final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), peers.mySeed().hash.getBytes());
final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || new String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle());
final boolean global =
(profile.remoteIndexing()) /* granted */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
(peers.mySeed().isSenior()) ||
(peers.mySeed().isPrincipal())
) /* qualified */;
if (!local && !global && !remote && !proxy) {
error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle();
this.log.logSevere(error);
return error;
}
if (global) {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
//this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT));
} else if (local) {
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
//this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
} else if (proxy) {
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
//this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
} else if (remote) {
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
//this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
}
return null;
}
public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) {
// check if the protocol is supported
final String urlProtocol = entry.url().getProtocol();
final String urlProtocol = url.getProtocol();
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'.");
return "unsupported protocol";
}
// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(entry.url());
final String urlRejectReason = urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}
// check blacklist
if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist.");
return "url in blacklist";
}
final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle());
if (profile == null) {
final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
log.logWarning(errorMsg);
return errorMsg;
}
// filter with must-match
if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'.");
return "url does not match must-match filter";
}
// filter with must-not-match
if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'.");
return "url matches must-not-match filter";
}
// deny cgi
if (entry.url().isIndividual()) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (url.isIndividual()) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL.");
return "cgi url not allowed";
}
// deny post properties
if (entry.url().isPOST() && !(profile.crawlingQ())) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (url.isPOST() && !(profile.crawlingQ())) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL.");
return "post url not allowed";
}
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
}
// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (!(profile.grantedDomAppearance(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains.");
return "url does not match domain filter";
}
// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(entry.url().getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (!(profile.grantedDomCount(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");
return "domain counter exceeded";
}
// check if the url is double registered
final String dbocc = nextQueue.urlExists(entry.url().hash()); // returns the name of the queue if entry exists
URIMetadataRow oldEntry = indexSegment.urlMetadata().load(entry.url().hash(), null, 0);
final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash(), null, 0);
if (oldEntry == null) {
if (dbocc != null) {
// do double-check
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
if (dbocc.equals("errors")) {
ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash());
ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
} else {
return "double in: " + dbocc;
@ -277,15 +323,15 @@ public final class CrawlStacker {
final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime();
if (recrawl) {
if (this.log.isFine())
this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
this.log.logFine("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
} else {
if (dbocc == null) {
return "double in: LURL-DB";
} else {
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
if (dbocc.equals("errors")) {
ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash());
ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
} else {
return "double in: " + dbocc;
@ -293,57 +339,11 @@ public final class CrawlStacker {
}
}
}
// store information
final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), peers.mySeed().hash.getBytes());
final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || new String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle());
final boolean global =
(profile.remoteIndexing()) /* granted */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
(peers.mySeed().isSenior()) ||
(peers.mySeed().isPrincipal())
) /* qualified */;
if (!local && !global && !remote && !proxy) {
String error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle();
this.log.logSevere(error);
return error;
}
if (global) {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT);
//this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT));
} else if (local) {
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
//this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
} else if (proxy) {
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle());
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
//this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE));
} else if (remote) {
//int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
//assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE);
//this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE));
}
return null;
}
/**
* Test a url if it can be used for crawling/indexing
* This mainly checks if the url is in the declared domain (local/global)

@ -973,7 +973,6 @@ public final class HTTPDFileHandler {
if ((ranges.length == 1)&&(ranges[0].endsWith("-"))) {
rangeStartOffset = Integer.parseInt(ranges[0].substring(0,ranges[0].length()-1));
statusCode = 206;
if (header == null) header = new ResponseHeader();
header.put(HeaderFramework.CONTENT_RANGE, "bytes " + rangeStartOffset + "-" + (targetFile.length()-1) + "/" + targetFile.length());
}
}

@ -80,12 +80,13 @@ public final class SearchEvent {
private final TreeMap<byte[], String> preselectedPeerHashes;
private final ResultURLs crawlResults;
private final Thread localSearchThread;
private final TreeMap<byte[], String> IAResults;
private final TreeMap<byte[], Integer> IACount;
private final TreeMap<byte[], String> IAResults;
private final TreeMap<byte[], HeuristicResult> heuristics;
private byte[] IAmaxcounthash, IAneardhthash;
private final ReferenceOrder order;
@SuppressWarnings("unchecked") SearchEvent(final QueryParams query,
public SearchEvent(final QueryParams query,
final yacySeedDB peers,
final ResultURLs crawlResults,
final TreeMap<byte[], String> preselectedPeerHashes,
@ -102,6 +103,7 @@ public final class SearchEvent {
this.preselectedPeerHashes = preselectedPeerHashes;
this.IAResults = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
this.IACount = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
this.heuristics = new TreeMap<byte[], HeuristicResult>(Base64Order.enhancedCoder);
this.IAmaxcounthash = null;
this.IAneardhthash = null;
this.localSearchThread = null;
@ -169,7 +171,7 @@ public final class SearchEvent {
assert this.rankedCache.searchContainerMap() != null;
for (Map.Entry<byte[], ReferenceContainer<WordReference>> entry : this.rankedCache.searchContainerMap().entrySet()) {
wordhash = entry.getKey();
final ReferenceContainer container = entry.getValue();
final ReferenceContainer<WordReference> container = entry.getValue();
assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + new String(container.getTermHash()) + ", wordhash = " + new String(wordhash);
if (container.size() > maxcount) {
IAmaxcounthash = wordhash;
@ -317,6 +319,18 @@ public final class SearchEvent {
return this.rankedCache.getAuthorNavigator(maxentries);
}
public void addHeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) {
synchronized (this.heuristics) {
this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant));
}
}
public HeuristicResult getHeuristic(byte[] urlhash) {
synchronized (this.heuristics) {
return this.heuristics.get(urlhash);
}
}
public ResultEntry oneResult(final int item) {
if ((query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ||
(query.domType == QueryParams.SEARCHDOM_CLUSTERALL)) {
@ -333,6 +347,22 @@ public final class SearchEvent {
boolean secondarySearchStartet = false;
public static class HeuristicResult /*implements Comparable<HeuristicResult>*/ {
public final byte[] urlhash; public final String heuristicName; public final boolean redundant;
public HeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) {
this.urlhash = urlhash; this.heuristicName = heuristicName; this.redundant = redundant;
}/*
public int compareTo(HeuristicResult o) {
return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash);
}
public int hashCode() {
return (int) Base64Order.enhancedCoder.cardinal(this.urlhash);
}
public boolean equals(Object o) {
return Base64Order.enhancedCoder.equal(this.urlhash, ((HeuristicResult) o).urlhash);
}*/
}
public class SecondarySearchSuperviser extends Thread {
// cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation

@ -1894,6 +1894,31 @@ public final class Switchboard extends serverSwitch {
}
}
public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent, final String heuristicName) {
// add the landing page to the index. should not load that again since it should be in the cache
if (url != null) try {
this.addToIndex(url, searchEvent, heuristicName);
} catch (IOException e) {} catch (ParserException e) {}
// check if some of the links match with the query
Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);
// take the matcher and load them all
for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
try {
this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
} catch (IOException e) {} catch (ParserException e) {}
}
// take then the no-matcher and load them also
for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
try {
this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
} catch (IOException e) {} catch (ParserException e) {}
}
}
/**
* load the content of a URL, parse the content and add the content to the index
* This process is started concurrently. The method returns immediately after the call.
@ -1902,12 +1927,21 @@ public final class Switchboard extends serverSwitch {
* @throws IOException
* @throws ParserException
*/
public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException {
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, ParserException {
final Segments.Process process = Segments.Process.LOCALCRAWLING;
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) {
searchEvent.addHeuristicResult(url.hash(), heuristicName, true);
return; // don't do double-work
}
final Request request = loader.request(url, true, true);
String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0);
if (acceptedError != null) {
log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError);
return;
}
new Thread() {public void run() {
try {
Segments.Process process = Segments.Process.LOCALCRAWLING;
if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work
Request request = loader.request(url, true, true);
searchEvent.addHeuristicResult(url.hash(), heuristicName, false);
Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
if (response == null) throw new IOException("response == null");
if (response.getContent() == null) throw new IOException("content == null");
@ -1918,42 +1952,15 @@ public final class Switchboard extends serverSwitch {
ResultImages.registerImages(document, true);
webStructure.generateCitationReference(document, condenser, response.lastModified());
storeDocumentIndex(process, response, document, condenser, searchEvent);
log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished");
log.logInfo("heuristic fill of url " + url.toNormalform(true, true) + " finished");
} catch (IOException e) {
Log.logException(e);
//Log.logException(e);
} catch (ParserException e) {
Log.logException(e);
//Log.logException(e);
}
}}.start();
}
public final void addAllToIndex(final DigestURI url, final Map<MultiProtocolURI, String> links, final SearchEvent searchEvent) {
// add the landing page to the index. should not load that again since it should be in the cache
try {
this.addToIndex(url, searchEvent);
} catch (IOException e) {} catch (ParserException e) {}
// check if some of the links match with the query
Map<MultiProtocolURI, String> matcher = searchEvent.getQuery().separateMatches(links);
// take the matcher and load them all
for (Map.Entry<MultiProtocolURI, String> entry: matcher.entrySet()) {
try {
this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
} catch (IOException e) {} catch (ParserException e) {}
}
// take then the no-matcher and load them also
for (Map.Entry<MultiProtocolURI, String> entry: links.entrySet()) {
try {
this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent);
} catch (IOException e) {} catch (ParserException e) {}
}
}
public class receiptSending implements Runnable {
yacySeed initiatorPeer;
URIMetadataRow reference;
@ -2165,7 +2172,7 @@ public final class Switchboard extends serverSwitch {
crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason);
}
public final void quickFillSite(final String host, final SearchEvent searchEvent) {
public final void heuristicSite(final SearchEvent searchEvent, final String host) {
new Thread() {public void run() {
String r = host;
if (r.indexOf("//") < 0) r = "http://" + r;
@ -2194,7 +2201,42 @@ public final class Switchboard extends serverSwitch {
}
// add all pages to the index
addAllToIndex(url, links, searchEvent);
addAllToIndex(url, links, searchEvent, "site");
}}.start();
}
public final void heuristicScroogle(final SearchEvent searchEvent) {
new Thread() {public void run() {
String query = searchEvent.getQuery().queryString(true);
int meta = query.indexOf("heuristic:");
if (meta >= 0) {
int q = query.indexOf(' ', meta);
if (q >= 0) query = query.substring(0, meta) + query.substring(q + 1); else query = query.substring(0, meta);
}
final String urlString = "http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=" + query.trim().replaceAll(" ", "+") + "&n=2";
DigestURI url;
try {
url = new DigestURI(MultiProtocolURI.unescape(urlString), null);
} catch (MalformedURLException e1) {
return;
}
Map<MultiProtocolURI, String> links = null;
try {
links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE);
} catch (IOException e) {
Log.logException(e);
return;
}
Iterator<MultiProtocolURI> i = links.keySet().iterator();
MultiProtocolURI u;
while (i.hasNext()) {
u = i.next();
if (u.toNormalform(false, false).indexOf("scroogle") >= 0) i.remove();
}
log.logInfo("Heuristic: adding " + links.size() + " links from scroogle");
// add all pages to the index
addAllToIndex(null, links, searchEvent, "scroogle");
}}.start();
}

@ -33,7 +33,6 @@ import java.util.concurrent.ConcurrentHashMap;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
@ -239,15 +238,15 @@ public class Dispatcher {
// check all entries and split them to the partitions
ReferenceContainer<WordReference>[] partitionBuffer = new ReferenceContainer[partitionCount];
WordReferenceRow re;
for (ReferenceContainer container: containers) {
WordReference re;
for (ReferenceContainer<WordReference> container: containers) {
// init the new partitions
for (int j = 0; j < partitionBuffer.length; j++) {
partitionBuffer[j] = new ReferenceContainer(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount);
partitionBuffer[j] = new ReferenceContainer<WordReference>(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount);
}
// split the container
Iterator<WordReferenceRow> i = container.entries();
Iterator<WordReference> i = container.entries();
while (i.hasNext()) {
re = i.next();
if (re == null) continue;

@ -91,7 +91,6 @@ public class OSMTile {
return null;
}
tileb = entry.getContent();
if (entry == null) return null;
}
try {
ImageIO.setUseCache(false); // do not write a cache to disc; keep in RAM
