From dcd01698b43db029b314a8501832e13354dc11d0 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 25 Jun 2010 16:44:57 +0000
Subject: [PATCH] added a 'transition feature' that shall lower the barrier to
 move from g**gle to yacy (yes!)

This patch introduces a new concept called 'search heuristics'. A heuristic is a kind of shortcut to good results; here it is a shortcut to good search results. In this case it provides a very transparent way to compare what YaCy produces as a search result with what g**gle produces. Here is what you can do now:
- add the phrase 'heuristic:scroogle' to your search query, for example 'oil spill heuristic:scroogle'; a call to scroogle is then made to fetch anonymous search results from g**gle.
- these results are _not_ used as meta-search results; instead they instantly feed a crawling and indexing process. This happens very fast: 20 results from scroogle are loaded simultaneously, parsed and indexed immediately, and the parsed content feeds the search result alongside the normal p2p search.
- when new results from this heuristic (more heuristics will follow) become part of the search results, it is verified whether they are redundant to existing results (they would have been part of the normal YaCy search result anyway) or completely new to YaCy.
- in the search results, new results found by a heuristic are marked with 'H ++', and heuristic results that YaCy had already found are marked with 'H ='.
That means:
- you can now see YaCy and Scroogle search results on one result page, and you also see that you would not have 'missed' the g**gle results if you had used only YaCy.
- in short: YaCy now subsumes g**gle results. If you use only YaCy, you miss nothing.
To come: a configuration page that lets you configure the usage of heuristics and enable this feature by default.
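For illustration, here is a minimal, self-contained sketch of the two ideas described above: stripping the 'heuristic:scroogle' flag from the query string, and marking each heuristic result as redundant ('H =') or new ('H ++'). This is not the patch code itself; the class and variable names below are invented for the example, and the real logic lives in yacysearch.java, SearchEvent.java and Switchboard.java in the diff that follows.

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only; not part of this patch.
public class HeuristicSketch {

    // mirrors the idea of SearchEvent.HeuristicResult: one entry per heuristic hit
    static class HeuristicResult {
        final String heuristicName;
        final boolean redundant; // true = YaCy would have found this URL anyway
        HeuristicResult(String heuristicName, boolean redundant) {
            this.heuristicName = heuristicName;
            this.redundant = redundant;
        }
    }

    public static void main(String[] args) {
        String querystring = "oil spill heuristic:scroogle";

        // detect the flag and strip it so it is not treated as a search term
        boolean useScroogle = querystring.indexOf("heuristic:scroogle") >= 0;
        if (useScroogle) querystring = querystring.replace("heuristic:scroogle", "").trim();
        System.out.println("query = '" + querystring + "', scroogle heuristic = " + useScroogle);

        // mark results: 'H =' for redundant hits, 'H ++' for results new to YaCy
        Map<String, HeuristicResult> heuristics = new HashMap<String, HeuristicResult>();
        heuristics.put("urlhash-A", new HeuristicResult("scroogle", true));  // already indexed
        heuristics.put("urlhash-B", new HeuristicResult("scroogle", false)); // new to YaCy
        for (Map.Entry<String, HeuristicResult> e : heuristics.entrySet()) {
            System.out.println(e.getKey() + " -> H " + (e.getValue().redundant ? "=" : "++"));
        }
    }
}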
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6944 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Crawler_p.java | 1 - htroot/env/base.css | 6 +- htroot/env/grafics/heuristic_new.gif | Bin 0 -> 127 bytes htroot/env/grafics/heuristic_redundant.gif | Bin 0 -> 126 bytes htroot/yacysearch.java | 10 +- htroot/yacysearchitem.html | 24 +- htroot/yacysearchitem.java | 11 + source/de/anomic/crawler/CrawlStacker.java | 206 +++++++++--------- .../anomic/http/server/HTTPDFileHandler.java | 1 - source/de/anomic/search/SearchEvent.java | 36 ++- source/de/anomic/search/Switchboard.java | 114 +++++++--- source/de/anomic/yacy/dht/Dispatcher.java | 9 +- source/de/anomic/yacy/graphics/OSMTile.java | 1 - 13 files changed, 255 insertions(+), 164 deletions(-) create mode 100644 htroot/env/grafics/heuristic_new.gif create mode 100644 htroot/env/grafics/heuristic_redundant.gif diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 9a0e5119f..6d3d0e090 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -376,7 +376,6 @@ public class Crawler_p { final Map.Entry e = linkiterator.next(); if (e.getKey() == null) continue; nexturl = new DigestURI(e.getKey()); - if (nexturl == null) continue; // enqueuing the url for crawling sb.crawlStacker.enqueueEntry(new Request( diff --git a/htroot/env/base.css b/htroot/env/base.css index 2da09b500..045e7650f 100644 --- a/htroot/env/base.css +++ b/htroot/env/base.css @@ -490,9 +490,9 @@ a:hover.MenuItemLink { div.urlactions a { display:block; - width: 12px; - height: 12px; - margin: 2px 0px; + width: 11px; + height: 11px; + margin: 0px 0px 0px 3px; } a.bookmarklink:hover, div.searchresults:hover a.bookmarklink, div.searchresults.hover a.bookmarklink { diff --git a/htroot/env/grafics/heuristic_new.gif b/htroot/env/grafics/heuristic_new.gif new file mode 100644 index 0000000000000000000000000000000000000000..6c5acf0526f02a092251c29c5d6bb54dd98b9f5d GIT binary patch literal 127 zcmZ?wbhEHb6ky1UbSk~%$YMgIy%bB%hS@*LPJAc zTwIKejg^#?1O){d7#I|PvM>TE1|1LqGJ}CdN1(xB_0l4?gS(H<-oSI%V^)mAwC6i# etp2@D*=PF~o0es<%cS}P8Yaf{F?zBvSOWkd4>9)u literal 0 HcmV?d00001 diff --git a/htroot/env/grafics/heuristic_redundant.gif b/htroot/env/grafics/heuristic_redundant.gif new file mode 100644 index 0000000000000000000000000000000000000000..b4669b863fa34fac51c721927aa538419ae42e9d GIT binary patch literal 126 zcmZ?wbhEHb6ky1UbSk~%$YMgIy%bB%hS@*LPJAc zTwIKejg^#?1O){d7#I|PvM>TE1|1LqGJ}CdTcE*V_0l4?gS(H<-oSI%V_r literal 0 HcmV?d00001 diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 058eca2d3..2ce96b27f 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -332,6 +332,12 @@ public class yacysearch { while (sitehost.endsWith(".")) sitehost = sitehost.substring(0, sitehost.length() - 1); sitehash = DigestURI.domhash(sitehost); } + + int heuristic = querystring.indexOf("heuristic:scroogle"); + if (heuristic >= 0) { + querystring = querystring.replace("heuristic:scroogle", ""); + } + int authori = querystring.indexOf("author:"); String authorhash = null; if (authori >= 0) { @@ -503,7 +509,9 @@ public class yacysearch { final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.crawlResults, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader); try {Thread.sleep(global ? 
100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search - if (sitehost != null && authenticated) sb.quickFillSite(sitehost, theSearch); + if (sitehost != null && authenticated) sb.heuristicSite(theSearch, sitehost); + if (heuristic >= 0 && authenticated) sb.heuristicScroogle(theSearch); + // generate result object //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + (System.currentTimeMillis() - timestamp) + " ms"); //serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + (System.currentTimeMillis() - timestamp) + " ms"); diff --git a/htroot/yacysearchitem.html b/htroot/yacysearchitem.html index 04eb2177b..173c6f0d7 100644 --- a/htroot/yacysearchitem.html +++ b/htroot/yacysearchitem.html @@ -1,20 +1,24 @@ #(content)#::

- + #[title]#

- #(authorized)#:: -
- bookmark +
+ #(heuristic)#:: + heuristic#[name]# (redundant):: + heuristic#[name]# (new link) + #(/heuristic)# + #(authorized)#:: + bookmark #(recommend)# - recommend - delete + recommend + delete :: - recommend - delete + recommend + delete #(/recommend)# -
- #(/authorized)# + #(/authorized)# +

#[description]#

#[urlname]#

#[date]# | #[sizename]# | Metadata | Parser | Pictures

diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 758643f52..a7c606499 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -117,6 +117,17 @@ public class yacysearchitem { prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + new String(result.hash()) + "&urlmaskfilter=.*"); prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + new String(result.hash()) + "&urlmaskfilter=.*"); prop.put("content_authorized_urlhash", new String(result.hash())); + SearchEvent.HeuristicResult heuristic = theSearch.getHeuristic(result.hash()); + if (heuristic == null) { + prop.put("content_heuristic", 0); + } else { + if (heuristic.redundant) { + prop.put("content_heuristic", 1); + } else { + prop.put("content_heuristic", 2); + } + prop.put("content_heuristic_name", heuristic.heuristicName); + } String resulthashString = new String(result.hash()); prop.putHTML("content_title", result.title()); prop.putXML("content_title-xml", result.title()); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 82b5f0a56..15565bb08 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -49,13 +49,13 @@ public final class CrawlStacker { private final Log log = new Log("STACKCRAWL"); private final WorkflowProcessor fastQueue, slowQueue; - //private long dnsHit; - private long dnsMiss; - private final CrawlQueues nextQueue; - private final CrawlSwitchboard crawler; - private final Segment indexSegment; - private final yacySeedDB peers; - private final boolean acceptLocalURLs, acceptGlobalURLs; + //private long dnsHit; + private long dnsMiss; + private final CrawlQueues nextQueue; + private final CrawlSwitchboard crawler; + private final Segment indexSegment; + private final yacySeedDB peers; + private final boolean acceptLocalURLs, acceptGlobalURLs; // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt @@ -178,96 +178,142 @@ public final class CrawlStacker { // stacks a crawl item. The position can also be remote // returns null if successful, a reason string if not successful //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); + + final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle()); + String error; + if (profile == null) { + error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); + log.logWarning(error); + return error; + } + + error = checkAcceptance(entry.url(), profile, entry.depth()); + if (error != null) return error; + + final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? 
null : nextQueue.getURL(entry.referrerhash()); - final long startTime = System.currentTimeMillis(); + // add domain to profile domain list + if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { + profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth()); + } + // store information + final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), peers.mySeed().hash.getBytes()); + final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || new String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle()); + final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle()); + final boolean global = + (profile.remoteIndexing()) /* granted */ && + (entry.depth() == profile.depth()) /* leaf node */ && + //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ && + ( + (peers.mySeed().isSenior()) || + (peers.mySeed().isPrincipal()) + ) /* qualified */; + + if (!local && !global && !remote && !proxy) { + error = "URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle(); + this.log.logSevere(error); + return error; + } + + if (global) { + // it may be possible that global == true and local == true, so do not check an error case against it + if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); + if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); + //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry); + //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); + //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT)); + } else if (local) { + if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); + if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); + //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); + //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); + //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); + } else if (proxy) { + if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); + //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); + //assert b < 
nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); + //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); + } else if (remote) { + //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); + nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry); + //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); + //this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE)); + } + + return null; + } + + public String checkAcceptance(final DigestURI url, final CrawlProfile.entry profile, int depth) { + // check if the protocol is supported - final String urlProtocol = entry.url().getProtocol(); + final String urlProtocol = url.getProtocol(); if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) { - this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + this.log.logSevere("Unsupported protocol in URL '" + url.toString() + "'."); return "unsupported protocol"; } // check if ip is local ip address - final String urlRejectReason = urlInAcceptedDomain(entry.url()); + final String urlRejectReason = urlInAcceptedDomain(url); if (urlRejectReason != null) { - if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ")"); return "denied_(" + urlRejectReason + ")"; } // check blacklist - if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, url)) { + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is in blacklist."); return "url in blacklist"; } - final CrawlProfile.entry profile = crawler.profilesActiveCrawls.getEntry(entry.profileHandle()); - if (profile == null) { - final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url(); - log.logWarning(errorMsg); - return errorMsg; - } - // filter with must-match - if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if ((depth > 0) && !profile.mustMatchPattern().matcher(url.toString()).matches()) { + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'."); return "url does not match must-match filter"; } // filter with must-not-match - if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. 
" + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if ((depth > 0) && profile.mustNotMatchPattern().matcher(url.toString()).matches()) { + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'."); return "url matches must-not-match filter"; } // deny cgi - if (entry.url().isIndividual()) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (url.isIndividual()) { + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is CGI URL."); return "cgi url not allowed"; } // deny post properties - if (entry.url().isPOST() && !(profile.crawlingQ())) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (url.isPOST() && !(profile.crawlingQ())) { + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is post URL."); return "post url not allowed"; } - final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash()); - - // add domain to profile domain list - if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { - profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth()); - } - // deny urls that do not match with the profile domain list - if (!(profile.grantedDomAppearance(entry.url().getHost()))) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (!(profile.grantedDomAppearance(url.getHost()))) { + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains."); return "url does not match domain filter"; } // deny urls that exceed allowed number of occurrences - if (!(profile.grantedDomCount(entry.url().getHost()))) { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " + - "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (!(profile.grantedDomCount(url.getHost()))) { + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed."); return "domain counter exceeded"; } // check if the url is double registered - final String dbocc = nextQueue.urlExists(entry.url().hash()); // returns the name of the queue if entry exists - URIMetadataRow oldEntry = indexSegment.urlMetadata().load(entry.url().hash(), null, 0); + final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists + URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash(), null, 0); if (oldEntry == null) { if (dbocc != null) { // do double-check - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. 
" + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'."); if (dbocc.equals("errors")) { - ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash()); + ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; } else { return "double in: " + dbocc; @@ -277,15 +323,15 @@ public final class CrawlStacker { final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime(); if (recrawl) { if (this.log.isFine()) - this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " + + this.log.logFine("RE-CRAWL of URL '" + url.toString() + "': this url was crawled " + ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); } else { if (dbocc == null) { return "double in: LURL-DB"; } else { - if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); + if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:"); if (dbocc.equals("errors")) { - ZURL.Entry errorEntry = nextQueue.errorURL.get(entry.url().hash()); + ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; } else { return "double in: " + dbocc; @@ -293,57 +339,11 @@ public final class CrawlStacker { } } } - - // store information - final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), peers.mySeed().hash.getBytes()); - final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || new String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle()); - final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle()); - final boolean global = - (profile.remoteIndexing()) /* granted */ && - (entry.depth() == profile.depth()) /* leaf node */ && - //(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ && - ( - (peers.mySeed().isSenior()) || - (peers.mySeed().isPrincipal()) - ) /* qualified */; - - if (!local && !global && !remote && !proxy) { - String error = "URL '" + entry.url().toString() + "' cannot be crawled. 
initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle(); - this.log.logSevere(error); - return error; - } - if (global) { - // it may be possible that global == true and local == true, so do not check an error case against it - if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_LIMIT, entry); - //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT); - //this.log.logInfo("stacked/global: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_LIMIT)); - } else if (local) { - if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); - //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - //this.log.logInfo("stacked/local: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); - } else if (proxy) { - if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + new String(entry.initiator()) + ", profile.handle = " + profile.handle()); - //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_CORE, entry); - //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); - //this.log.logInfo("stacked/proxy: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE)); - } else if (remote) { - //int b = nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); - nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry); - //assert b < nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE); - //this.log.logInfo("stacked/remote: " + entry.url().toString() + ", stacksize = " + nextQueue.noticeURL.stackSize(NoticedURL.STACK_TYPE_REMOTE)); - } - return null; } - + + /** * Test a url if it can be used for crawling/indexing * This mainly checks if the url is in the declared domain (local/global) diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index bb18d4882..8e06c86ef 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -973,7 +973,6 @@ public final class HTTPDFileHandler { if ((ranges.length == 1)&&(ranges[0].endsWith("-"))) { rangeStartOffset = Integer.parseInt(ranges[0].substring(0,ranges[0].length()-1)); statusCode = 206; - if (header == 
null) header = new ResponseHeader(); header.put(HeaderFramework.CONTENT_RANGE, "bytes " + rangeStartOffset + "-" + (targetFile.length()-1) + "/" + targetFile.length()); } } diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index 62fee8b59..d58b3f26e 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -80,12 +80,13 @@ public final class SearchEvent { private final TreeMap preselectedPeerHashes; private final ResultURLs crawlResults; private final Thread localSearchThread; - private final TreeMap IAResults; private final TreeMap IACount; + private final TreeMap IAResults; + private final TreeMap heuristics; private byte[] IAmaxcounthash, IAneardhthash; private final ReferenceOrder order; - @SuppressWarnings("unchecked") SearchEvent(final QueryParams query, + public SearchEvent(final QueryParams query, final yacySeedDB peers, final ResultURLs crawlResults, final TreeMap preselectedPeerHashes, @@ -102,6 +103,7 @@ public final class SearchEvent { this.preselectedPeerHashes = preselectedPeerHashes; this.IAResults = new TreeMap(Base64Order.enhancedCoder); this.IACount = new TreeMap(Base64Order.enhancedCoder); + this.heuristics = new TreeMap(Base64Order.enhancedCoder); this.IAmaxcounthash = null; this.IAneardhthash = null; this.localSearchThread = null; @@ -169,7 +171,7 @@ public final class SearchEvent { assert this.rankedCache.searchContainerMap() != null; for (Map.Entry> entry : this.rankedCache.searchContainerMap().entrySet()) { wordhash = entry.getKey(); - final ReferenceContainer container = entry.getValue(); + final ReferenceContainer container = entry.getValue(); assert (Base64Order.enhancedCoder.equal(container.getTermHash(), wordhash)) : "container.getTermHash() = " + new String(container.getTermHash()) + ", wordhash = " + new String(wordhash); if (container.size() > maxcount) { IAmaxcounthash = wordhash; @@ -317,6 +319,18 @@ public final class SearchEvent { return this.rankedCache.getAuthorNavigator(maxentries); } + public void addHeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) { + synchronized (this.heuristics) { + this.heuristics.put(urlhash, new HeuristicResult(urlhash, heuristicName, redundant)); + } + } + + public HeuristicResult getHeuristic(byte[] urlhash) { + synchronized (this.heuristics) { + return this.heuristics.get(urlhash); + } + } + public ResultEntry oneResult(final int item) { if ((query.domType == QueryParams.SEARCHDOM_GLOBALDHT) || (query.domType == QueryParams.SEARCHDOM_CLUSTERALL)) { @@ -333,6 +347,22 @@ public final class SearchEvent { boolean secondarySearchStartet = false; + public static class HeuristicResult /*implements Comparable*/ { + public final byte[] urlhash; public final String heuristicName; public final boolean redundant; + public HeuristicResult(byte[] urlhash, String heuristicName, boolean redundant) { + this.urlhash = urlhash; this.heuristicName = heuristicName; this.redundant = redundant; + }/* + public int compareTo(HeuristicResult o) { + return Base64Order.enhancedCoder.compare(this.urlhash, o.urlhash); + } + public int hashCode() { + return (int) Base64Order.enhancedCoder.cardinal(this.urlhash); + } + public boolean equals(Object o) { + return Base64Order.enhancedCoder.equal(this.urlhash, ((HeuristicResult) o).urlhash); + }*/ + } + public class SecondarySearchSuperviser extends Thread { // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation diff --git 
a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 7fc4e143e..c6a04d837 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1894,6 +1894,31 @@ public final class Switchboard extends serverSwitch { } } + public final void addAllToIndex(final DigestURI url, final Map links, final SearchEvent searchEvent, final String heuristicName) { + + // add the landing page to the index. should not load that again since it should be in the cache + if (url != null) try { + this.addToIndex(url, searchEvent, heuristicName); + } catch (IOException e) {} catch (ParserException e) {} + + // check if some of the links match with the query + Map matcher = searchEvent.getQuery().separateMatches(links); + + // take the matcher and load them all + for (Map.Entry entry: matcher.entrySet()) { + try { + this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); + } catch (IOException e) {} catch (ParserException e) {} + } + + // take then the no-matcher and load them also + for (Map.Entry entry: links.entrySet()) { + try { + this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); + } catch (IOException e) {} catch (ParserException e) {} + } + } + /** * load the content of a URL, parse the content and add the content to the index * This process is started concurrently. The method returns immediately after the call. @@ -1902,12 +1927,21 @@ public final class Switchboard extends serverSwitch { * @throws IOException * @throws ParserException */ - public void addToIndex(final DigestURI url, final SearchEvent searchEvent) throws IOException, ParserException { + public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, ParserException { + final Segments.Process process = Segments.Process.LOCALCRAWLING; + if (indexSegments.segment(process).urlMetadata.exists(url.hash())) { + searchEvent.addHeuristicResult(url.hash(), heuristicName, true); + return; // don't do double-work + } + final Request request = loader.request(url, true, true); + String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0); + if (acceptedError != null) { + log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError); + return; + } new Thread() {public void run() { try { - Segments.Process process = Segments.Process.LOCALCRAWLING; - if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work - Request request = loader.request(url, true, true); + searchEvent.addHeuristicResult(url.hash(), heuristicName, false); Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE); if (response == null) throw new IOException("response == null"); if (response.getContent() == null) throw new IOException("content == null"); @@ -1918,42 +1952,15 @@ public final class Switchboard extends serverSwitch { ResultImages.registerImages(document, true); webStructure.generateCitationReference(document, condenser, response.lastModified()); storeDocumentIndex(process, response, document, condenser, searchEvent); - log.logInfo("QuickFill of url " + url.toNormalform(true, true) + " finished"); + log.logInfo("heuristic fill of url " + url.toNormalform(true, true) + " finished"); } catch (IOException e) { - Log.logException(e); + //Log.logException(e); } catch (ParserException e) { - 
Log.logException(e); + //Log.logException(e); } }}.start(); } - public final void addAllToIndex(final DigestURI url, final Map links, final SearchEvent searchEvent) { - - // add the landing page to the index. should not load that again since it should be in the cache - try { - this.addToIndex(url, searchEvent); - } catch (IOException e) {} catch (ParserException e) {} - - // check if some of the links match with the query - Map matcher = searchEvent.getQuery().separateMatches(links); - - // take the matcher and load them all - for (Map.Entry entry: matcher.entrySet()) { - try { - this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent); - } catch (IOException e) {} catch (ParserException e) {} - } - - // take then the no-matcher and load them also - for (Map.Entry entry: links.entrySet()) { - try { - this.addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent); - } catch (IOException e) {} catch (ParserException e) {} - } - } - - - public class receiptSending implements Runnable { yacySeed initiatorPeer; URIMetadataRow reference; @@ -2165,7 +2172,7 @@ public final class Switchboard extends serverSwitch { crawlQueues.errorURL.push(bentry, initiator, new Date(), 0, failreason); } - public final void quickFillSite(final String host, final SearchEvent searchEvent) { + public final void heuristicSite(final SearchEvent searchEvent, final String host) { new Thread() {public void run() { String r = host; if (r.indexOf("//") < 0) r = "http://" + r; @@ -2194,7 +2201,42 @@ public final class Switchboard extends serverSwitch { } // add all pages to the index - addAllToIndex(url, links, searchEvent); + addAllToIndex(url, links, searchEvent, "site"); + }}.start(); + } + + public final void heuristicScroogle(final SearchEvent searchEvent) { + new Thread() {public void run() { + String query = searchEvent.getQuery().queryString(true); + int meta = query.indexOf("heuristic:"); + if (meta >= 0) { + int q = query.indexOf(' ', meta); + if (q >= 0) query = query.substring(0, meta) + query.substring(q + 1); else query = query.substring(0, meta); + } + final String urlString = "http://www.scroogle.org/cgi-bin/nbbw.cgi?Gw=" + query.trim().replaceAll(" ", "+") + "&n=2"; + DigestURI url; + try { + url = new DigestURI(MultiProtocolURI.unescape(urlString), null); + } catch (MalformedURLException e1) { + return; + } + + Map links = null; + try { + links = loader.loadLinks(url, CrawlProfile.CacheStrategy.NOCACHE); + } catch (IOException e) { + Log.logException(e); + return; + } + Iterator i = links.keySet().iterator(); + MultiProtocolURI u; + while (i.hasNext()) { + u = i.next(); + if (u.toNormalform(false, false).indexOf("scroogle") >= 0) i.remove(); + } + log.logInfo("Heuristic: adding " + links.size() + " links from scroogle"); + // add all pages to the index + addAllToIndex(null, links, searchEvent, "scroogle"); }}.start(); } diff --git a/source/de/anomic/yacy/dht/Dispatcher.java b/source/de/anomic/yacy/dht/Dispatcher.java index 10c63343c..d9cce6257 100755 --- a/source/de/anomic/yacy/dht/Dispatcher.java +++ b/source/de/anomic/yacy/dht/Dispatcher.java @@ -33,7 +33,6 @@ import java.util.concurrent.ConcurrentHashMap; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.data.word.WordReference; -import net.yacy.kelondro.data.word.WordReferenceRow; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; @@ -239,15 +238,15 @@ public class Dispatcher { // check all entries and split them 
to the partitions ReferenceContainer[] partitionBuffer = new ReferenceContainer[partitionCount]; - WordReferenceRow re; - for (ReferenceContainer container: containers) { + WordReference re; + for (ReferenceContainer container: containers) { // init the new partitions for (int j = 0; j < partitionBuffer.length; j++) { - partitionBuffer[j] = new ReferenceContainer(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount); + partitionBuffer[j] = new ReferenceContainer(Segment.wordReferenceFactory, container.getTermHash(), container.size() / partitionCount); } // split the container - Iterator i = container.entries(); + Iterator i = container.entries(); while (i.hasNext()) { re = i.next(); if (re == null) continue; diff --git a/source/de/anomic/yacy/graphics/OSMTile.java b/source/de/anomic/yacy/graphics/OSMTile.java index 8cc95fe44..59a8c0744 100644 --- a/source/de/anomic/yacy/graphics/OSMTile.java +++ b/source/de/anomic/yacy/graphics/OSMTile.java @@ -91,7 +91,6 @@ public class OSMTile { return null; } tileb = entry.getContent(); - if (entry == null) return null; } try { ImageIO.setUseCache(false); // do not write a cache to disc; keep in RAM