From c5279691857b45747f3babcb265f057d559de068 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 16 Nov 2007 14:48:09 +0000 Subject: [PATCH] - enhanced monitoring of ranking parameters for details, please try http://localhost:8080/IndexControlRWIs_p.html - fixed computation of ranking ordering in some cases git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4220 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 2 +- htroot/CrawlResults.java | 2 +- htroot/IndexControlRWIs_p.html | 61 +++++-- htroot/IndexControlRWIs_p.java | 72 +++++--- htroot/IndexControlURLs_p.java | 8 +- htroot/Ranking_p.java | 5 + htroot/ViewFile.java | 2 +- htroot/yacy/crawlOrder.java | 2 +- htroot/yacy/search.java | 4 +- htroot/yacysearch.java | 7 +- source/de/anomic/data/SitemapParser.java | 2 +- source/de/anomic/http/httpc.java | 4 +- .../de/anomic/index/indexRWIEntryOrder.java | 19 +- source/de/anomic/index/indexURLEntry.java | 12 +- .../kelondro/kelondroRowCollection.java | 2 + .../plasma/dbImport/plasmaDbImporter.java | 2 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 8 +- .../de/anomic/plasma/plasmaCrawlStacker.java | 2 +- source/de/anomic/plasma/plasmaDHTChunk.java | 4 +- .../de/anomic/plasma/plasmaSearchEvent.java | 21 +-- .../de/anomic/plasma/plasmaSearchQuery.java | 65 +++++-- .../plasma/plasmaSearchRankingProcess.java | 62 ++++++- .../plasma/plasmaSearchRankingProfile.java | 3 + .../de/anomic/plasma/plasmaSwitchboard.java | 4 +- .../anomic/plasma/plasmaSwitchboardQueue.java | 2 +- source/de/anomic/plasma/plasmaWordIndex.java | 169 +++++++++--------- source/de/anomic/yacy/yacyURL.java | 5 +- source/yacy.java | 2 +- 28 files changed, 344 insertions(+), 209 deletions(-) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 615363589..e97cc9ce2 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -169,7 +169,7 @@ public class Bookmarks { bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null); + indexURLEntry urlentry = switchboard.wordIndex.loadedURL.load(urlHash, null, 0); plasmaParserDocument document = null; if (urlentry != null) { indexURLEntry.Components comp = urlentry.comp(); diff --git a/htroot/CrawlResults.java b/htroot/CrawlResults.java index b1c4c1797..53eab501a 100644 --- a/htroot/CrawlResults.java +++ b/htroot/CrawlResults.java @@ -167,7 +167,7 @@ public class CrawlResults { urlHash = sb.wordIndex.loadedURL.getUrlHash(tabletype, i); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); try { - urle = sb.wordIndex.loadedURL.load(urlHash, null); + urle = sb.wordIndex.loadedURL.load(urlHash, null, 0); indexURLEntry.Components comp = urle.comp(); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); diff --git a/htroot/IndexControlRWIs_p.html b/htroot/IndexControlRWIs_p.html index 1600bcd31..cf4b5b49a 100644 --- a/htroot/IndexControlRWIs_p.html +++ b/htroot/IndexControlRWIs_p.html @@ -92,9 +92,9 @@ 1000  
Ordering of list:
-
by URL   +
by Ranking   + by URL   by URL Hash   -
@@ -134,30 +134,61 @@

- + + + + + + + + + + + + + + + + + + + + + + + + + + + #{urlList}# #(urlExists)# - - - - - - - - + :: - - - - + + + + + + + + + + + + + + + #(/urlExists)# #{/urlList}# diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index b07e4fdc2..6eb51e443 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -30,6 +30,7 @@ import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.net.MalformedURLException; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -43,10 +44,13 @@ import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaSearchEvent; +import de.anomic.plasma.plasmaSearchQuery; +import de.anomic.plasma.plasmaSearchRankingProcess; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.abstractURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern; +import de.anomic.server.serverDate; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyClient; @@ -198,7 +202,7 @@ public class IndexControlRWIs_p { indexURLEntry lurl; while (urlIter.hasNext()) { iEntry = (indexRWIEntry) urlIter.next(); - lurl = sb.wordIndex.loadedURL.load(iEntry.urlHash(), null); + lurl = sb.wordIndex.loadedURL.load(iEntry.urlHash(), null, 0); if (lurl == null) { unknownURLEntries.add(iEntry.urlHash()); urlIter.remove(); @@ -255,7 +259,7 @@ public class IndexControlRWIs_p { yacyURL url; for (int i=0; i 60) ? (us.substring(0, 60) + "...") : us); - prop.put("genUrlList_urlList_"+i+"_urlExists_pos", entry.index().posintext()); - prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase()); - prop.put("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.index().urlcomps()); - prop.put("genUrlList_urlList_"+i+"_urlExists_urllength", entry.index().urllength()); + prop.put("genUrlList_urlList_"+i+"_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); + prop.putNum("genUrlList_urlList_"+i+"_urlExists_ranking", (entry.ranking() - rn)); + prop.put("genUrlList_urlList_"+i+"_urlExists_domlength", yacyURL.domLengthEstimation(entry.hash())); + prop.put("genUrlList_urlList_"+i+"_urlExists_ybr", plasmaSearchRankingProcess.ybr(entry.hash())); + prop.put("genUrlList_urlList_"+i+"_urlExists_date", serverDate.shortDayTime(new Date(entry.word().lastModified()))); + prop.put("genUrlList_urlList_"+i+"_urlExists_wordsintitle", entry.word().wordsintitle()); + prop.put("genUrlList_urlList_"+i+"_urlExists_wordsintext", entry.word().wordsintext()); + prop.put("genUrlList_urlList_"+i+"_urlExists_phrasesintext", entry.word().phrasesintext()); + prop.put("genUrlList_urlList_"+i+"_urlExists_llocal", entry.word().llocal()); + prop.put("genUrlList_urlList_"+i+"_urlExists_lother", entry.word().lother()); + prop.put("genUrlList_urlList_"+i+"_urlExists_hitcount", entry.word().hitcount()); + prop.put("genUrlList_urlList_"+i+"_urlExists_worddistance", entry.word().worddistance()); + prop.put("genUrlList_urlList_"+i+"_urlExists_pos", entry.word().posintext()); + prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase()); + prop.put("genUrlList_urlList_"+i+"_urlExists_posinphrase", entry.word().posinphrase()); + prop.put("genUrlList_urlList_"+i+"_urlExists_urlcomps", entry.word().urlcomps()); + prop.put("genUrlList_urlList_"+i+"_urlExists_urllength", entry.word().urllength()); prop.put("genUrlList_urlList_"+i+"_urlExists_props", - ((entry.index().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") + - ((entry.index().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") + - ((entry.index().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") + - ((entry.index().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") + - ((entry.index().flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") + - ((entry.index().flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") + - ((entry.index().flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") + - ((entry.index().flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") + - ((entry.index().flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") + - ((entry.index().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized" : "") + ((entry.word().flags().get(plasmaCondenser.flag_cat_indexof)) ? "appears on index page, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasimage)) ? "contains images, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasaudio)) ? "contains audio, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasvideo)) ? "contains video, " : "") + + ((entry.word().flags().get(plasmaCondenser.flag_cat_hasapp)) ? "contains applications, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_url)) ? "appears in url, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_descr)) ? "appears in description, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_author)) ? "appears in author, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_tags)) ? "appears in tags, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_reference)) ? "appears in reference, " : "") + + ((entry.word().flags().get(indexRWIEntry.flag_app_emphasized)) ? "appears emphasized, " : "") + + ((yacyURL.probablyRootURL(entry.word().urlHash())) ? "probably root url" : "") ); - prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase()); - prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.index().posofphrase()); + prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase()); + prop.put("genUrlList_urlList_"+i+"_urlExists_phrase", entry.word().posofphrase()); try { url = new yacyURL(us, null); } catch (MalformedURLException e) { diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index dfc46c03f..d92076d38 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -76,7 +76,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashdelete")) { - indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null); + indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } else { @@ -106,7 +106,7 @@ public class IndexControlURLs_p { yacyURL url = new yacyURL(urlstring, null); urlhash = url.hash(); prop.put("urlhash", urlhash); - indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null); + indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0); if (entry == null) { prop.putHTML("urlstring", "unknown url: " + urlstring); prop.put("urlhash", ""); @@ -120,7 +120,7 @@ public class IndexControlURLs_p { } if (post.containsKey("urlhashsearch")) { - indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null); + indexURLEntry entry = sb.wordIndex.loadedURL.load(urlhash, null, 0); if (entry == null) { prop.put("result", "No Entry for URL hash " + urlhash); } else { @@ -172,7 +172,7 @@ public class IndexControlURLs_p { } indexURLEntry.Components comp = entry.comp(); String referrer = null; - indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null); + indexURLEntry le = (entry.referrerHash() == null) ? null : switchboard.wordIndex.loadedURL.load(entry.referrerHash(), null, 0); if (le == null) { referrer = ""; } else { diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java index 1923cd43d..69b6b83b3 100644 --- a/htroot/Ranking_p.java +++ b/htroot/Ranking_p.java @@ -30,6 +30,7 @@ import java.util.Iterator; import java.util.Map; import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSwitchboard; @@ -63,6 +64,7 @@ public class Ranking_p { rankingParameters.put(plasmaSearchRankingProfile.PHRASESINTEXT, "Phrases In Text"); rankingParameters.put(plasmaSearchRankingProfile.POSINTEXT, "Position In Text"); rankingParameters.put(plasmaSearchRankingProfile.POSOFPHRASE, "Position Of Phrase"); + rankingParameters.put(plasmaSearchRankingProfile.POSINPHRASE, "Position In Phrase"); rankingParameters.put(plasmaSearchRankingProfile.PREFER, "Application Of Prefer Pattern"); rankingParameters.put(plasmaSearchRankingProfile.URLCOMPINTOPLIST, "URL Component Appears In Toplist"); rankingParameters.put(plasmaSearchRankingProfile.URLCOMPS, "URL Components"); @@ -127,6 +129,9 @@ public class Ranking_p { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { final plasmaSwitchboard sb = (plasmaSwitchboard) env; + // clean up all search events + plasmaSearchEvent.cleanupEvents(true); + // case if no values are requested if ((post == null) || (env == null)) { // we create empty entries for template strings diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index eb7a4fc48..15c8876ee 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -109,7 +109,7 @@ public class ViewFile { if (urlHash.length() > 0) { // getting the urlEntry that belongs to the url hash indexURLEntry urlEntry = null; - urlEntry = sb.wordIndex.loadedURL.load(urlHash, null); + urlEntry = sb.wordIndex.loadedURL.load(urlHash, null, 0); if (urlEntry == null) { prop.put("error", "2"); prop.put("viewMode",VIEW_MODE_NO_TEXT); diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index 6719ea853..5016d0694 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -261,7 +261,7 @@ public final class crawlOrder { reason = reasonString; // send lurl-Entry as response indexURLEntry entry; - entry = switchboard.wordIndex.loadedURL.load(url.hash(), null); + entry = switchboard.wordIndex.loadedURL.load(url.hash(), null, 0); if (entry == null) { response = "rejected"; lurl = ""; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 6b0d17965..0ddf35061 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -133,7 +133,7 @@ public final class search { long urlRetrievalAllTime = 0, snippetComputationAllTime = 0; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts - theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint); + theQuery = new plasmaSearchQuery(null, abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, plasmaSearchQuery.catchall_constraint, false); theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); @@ -162,7 +162,7 @@ public final class search { } else { // retrieve index containers from search request - theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint); + theQuery = new plasmaSearchQuery(null, queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), false, count, 0, duetime, filter, plasmaSearchQuery.SEARCHDOM_LOCAL, null, -1, constraint, false); theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index a97cbe1c1..a78d1cdd0 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -169,7 +169,7 @@ public class yacysearch { kelondroBitfield constraint = post.containsKey("constraint") ? new kelondroBitfield(4, post.get("constraint", "______")) : plasmaSearchQuery.catchall_constraint; if (indexof) { - constraint = new kelondroBitfield(); + constraint = new kelondroBitfield(4); constraint.set(plasmaCondenser.flag_cat_indexof, true); } @@ -225,7 +225,7 @@ public class yacysearch { return prop; } final String recommendHash = post.get("recommendref", ""); // urlhash - indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null); + indexURLEntry urlentry = sb.wordIndex.loadedURL.load(recommendHash, null, 0); if (urlentry != null) { indexURLEntry.Components comp = urlentry.comp(); plasmaParserDocument document; @@ -266,7 +266,8 @@ public class yacysearch { ((globalsearch) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL), "", 20, - constraint); + constraint, + false); plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * theQuery.maximumTime / 10, theQuery.displayResults()); String client = (String) header.get("CLIENTIP"); // the search client who initiated the search diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 851b840b8..ac154eb11 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -286,7 +286,7 @@ public class SitemapParser extends DefaultHandler { String dbocc = this.switchboard.urlExists(nexturlhash); if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) { // the url was already loaded. we need to check the date - indexURLEntry oldEntry = this.switchboard.wordIndex.loadedURL.load(nexturlhash, null); + indexURLEntry oldEntry = this.switchboard.wordIndex.loadedURL.load(nexturlhash, null, 0); if (oldEntry != null) { Date modDate = oldEntry.moddate(); // check if modDate is null diff --git a/source/de/anomic/http/httpc.java b/source/de/anomic/http/httpc.java index 9aac710c5..04fc115f1 100644 --- a/source/de/anomic/http/httpc.java +++ b/source/de/anomic/http/httpc.java @@ -234,6 +234,7 @@ public final class httpc { // do NOT remove this check; in case that everything works fine this call does nothing // but if in any arror case connections stay open, this will ensure that the peer keeps running and the host server is not blocked from working checkIdleConnections(); + assert timeout != 0; // register new connection this.hashIndex = objCounter; @@ -401,9 +402,10 @@ public final class httpc { this.initTime = System.currentTimeMillis(); this.lastIO = System.currentTimeMillis(); this.socket.setKeepAlive(false); - this.socket.connect(address, timeout); // setting socket timeout and keep alive behaviour this.socket.setSoTimeout(timeout); // waiting time for read + // get the connection + this.socket.connect(address, timeout); if (incomingByteCountAccounting != null) { this.clientInputByteCount = new httpdByteCountInputStream(this.socket.getInputStream(),incomingByteCountAccounting); diff --git a/source/de/anomic/index/indexRWIEntryOrder.java b/source/de/anomic/index/indexRWIEntryOrder.java index d676ef125..3325e0845 100644 --- a/source/de/anomic/index/indexRWIEntryOrder.java +++ b/source/de/anomic/index/indexRWIEntryOrder.java @@ -87,21 +87,22 @@ public class indexRWIEntryOrder extends kelondroAbstractOrder implements kelondr //return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords); // the normalizedEntry must be a normalized indexEntry kelondroBitfield flags = t.flags(); - long r = ((255 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength) - + ((255 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4 )) << ranking.coeff_ybr) - + ((255 - (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) )) << ranking.coeff_date) + long r = + ((256 - yacyURL.domLengthNormalized(t.urlHash())) << ranking.coeff_domlength) + + ((256 - (plasmaSearchRankingProcess.ybr(t.urlHash()) << 4)) << ranking.coeff_ybr) + + ((t.urlcomps() == 0) ? 0 : ((256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)) + + ((t.urllength() == 0) ? 0 : ((256 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength)) + + ((t.posintext() == 0) ? 0 : ((256 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext)) + + ((t.posofphrase() == 0) ? 0 : ((256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)) + + ((t.posinphrase() == 0) ? 0 : ((256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)) + + ((256 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance) + + ( (((t.virtualAge() - min.virtualAge() ) << 8) / (1 + max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date) + ( (((t.wordsintitle() - min.wordsintitle() ) << 8) / (1 + max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle) + ( (((t.wordsintext() - min.wordsintext() ) << 8) / (1 + max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext) + ( (((t.phrasesintext()- min.phrasesintext()) << 8) / (1 + max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext) + ( (((t.llocal() - min.llocal() ) << 8) / (1 + max.llocal() - min.llocal()) ) << ranking.coeff_llocal) + ( (((t.lother() - min.lother() ) << 8) / (1 + max.lother() - min.lother()) ) << ranking.coeff_lother) + ( (((t.hitcount() - min.hitcount() ) << 8) / (1 + max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount) - + ((255 - (((t.urllength() - min.urllength() ) << 8) / (1 + max.urllength() - min.urllength()) )) << ranking.coeff_urllength) - + ((255 - (((t.urlcomps() - min.urlcomps() ) << 8) / (1 + max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps) - + ((255 - (((t.posintext() - min.posintext() ) << 8) / (1 + max.posintext() - min.posintext()) )) << ranking.coeff_posintext) - + ((255 - (((t.posofphrase() - min.posofphrase() ) << 8) / (1 + max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase) - + ((255 - (((t.posinphrase() - min.posinphrase() ) << 8) / (1 + max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase) - + ((255 - (((t.worddistance() - min.worddistance() ) << 8) / (1 + max.worddistance() - min.worddistance()) )) << ranking.coeff_worddistance) + (((flags.get(indexRWIEntry.flag_app_url)) ? 255 << ranking.coeff_appurl : 0)) + (((flags.get(indexRWIEntry.flag_app_descr)) ? 255 << ranking.coeff_appdescr : 0)) + (((flags.get(indexRWIEntry.flag_app_author)) ? 255 << ranking.coeff_appauthor : 0)) diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index 73de100a5..2d487f392 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -117,7 +117,8 @@ public class indexURLEntry { private kelondroRow.Entry entry; private String snippet; private indexRWIEntry word; // this is only used if the url is transported via remote search requests - + private long ranking; // during generation of a search result this value is set + public indexURLEntry( yacyURL url, String descr, @@ -163,6 +164,7 @@ public class indexURLEntry { //System.out.println("===DEBUG=== " + load.toString() + ", " + decodeDate(col_load).toString()); this.snippet = null; this.word = null; + this.ranking = 0; } private void encodeDate(int col, Date d) { @@ -184,10 +186,11 @@ public class indexURLEntry { return s.toString().getBytes(); } - public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord) { + public indexURLEntry(kelondroRow.Entry entry, indexRWIEntry searchedWord, long ranking) { this.entry = entry; this.snippet = null; this.word = searchedWord; + this.ranking = ranking; } public indexURLEntry(Properties prop){ @@ -243,6 +246,7 @@ public class indexURLEntry { if (prop.containsKey("wi")) { this.word = new indexRWIRowEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))); } + this.ranking = 0; } private StringBuffer corePropList() { @@ -301,6 +305,10 @@ public class indexURLEntry { return this.entry.getColString(col_hash, null); } + public long ranking() { + return this.ranking; + } + public indexURLEntry.Components comp() { ArrayList cl = nxTools.strings(this.entry.getCol("comp", null), "UTF-8"); return new indexURLEntry.Components( diff --git a/source/de/anomic/kelondro/kelondroRowCollection.java b/source/de/anomic/kelondro/kelondroRowCollection.java index c6c2795ca..bdb746cd6 100644 --- a/source/de/anomic/kelondro/kelondroRowCollection.java +++ b/source/de/anomic/kelondro/kelondroRowCollection.java @@ -413,6 +413,7 @@ public class kelondroRowCollection { if (this.chunkcount < isortlimit) { isort(0, this.chunkcount, new byte[this.rowdef.objectsize]); this.sortBound = this.chunkcount; + assert this.isSorted(); return; } byte[] swapspace = new byte[this.rowdef.objectsize]; @@ -555,6 +556,7 @@ public class kelondroRowCollection { public synchronized boolean isSorted() { assert (this.rowdef.objectOrder != null); if (chunkcount <= 1) return true; + if (chunkcount != this.sortBound) return false; for (int i = 0; i < chunkcount - 1; i++) { //System.out.println("*" + new String(get(i).getColBytes(0))); if (compare(i, i + 1) > 0) { diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index a05a3f79a..6d2913273 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -193,7 +193,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { // we need to import the url // getting the url entry - indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null); + indexURLEntry urlEntry = this.importWordIndex.loadedURL.load(urlHash, null, 0); if (urlEntry != null) { /* write it into the home url db */ diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index fca8a0b0f..60e016552 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -153,7 +153,7 @@ public final class plasmaCrawlLURL { return 0; } - public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord) { + public synchronized indexURLEntry load(String urlHash, indexRWIEntry searchedWord, long ranking) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -165,7 +165,7 @@ public final class plasmaCrawlLURL { try { kelondroRow.Entry entry = urlIndexFile.get(urlHash.getBytes()); if (entry == null) return null; - return new indexURLEntry(entry, searchedWord); + return new indexURLEntry(entry, searchedWord, ranking); } catch (IOException e) { return null; } @@ -176,7 +176,7 @@ public final class plasmaCrawlLURL { indexURLEntry oldEntry; try { if (exists(entry.hash())) { - oldEntry = load(entry.hash(), null); + oldEntry = load(entry.hash(), null, 0); } else { oldEntry = null; } @@ -342,7 +342,7 @@ public final class plasmaCrawlLURL { if (this.iter == null) { return null; } if (this.iter.hasNext()) { e = (kelondroRow.Entry) this.iter.next(); } if (e == null) { return null; } - return new indexURLEntry(e, null); + return new indexURLEntry(e, null, 0); } public final void remove() { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 9f86eac0b..2eaf51365 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -455,7 +455,7 @@ public final class plasmaCrawlStacker extends Thread { // check if the url is double registered String dbocc = sb.crawlQueues.urlExists(entry.url().hash()); - indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null); + indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null, 0); boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder()); // apply recrawl rule if ((dbocc != null) && (!(recrawl))) { diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index ff1ad7ed7..2e2abe194 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -8,7 +8,7 @@ // // $LastChangedDate$ // $LastChangedRevision$ -// $LastChangedBy: $ +// $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by @@ -247,7 +247,7 @@ public class plasmaDHTChunk { urlIter.remove(); continue; } - lurl = wordIndex.loadedURL.load(iEntry.urlHash(), iEntry); + lurl = wordIndex.loadedURL.load(iEntry.urlHash(), iEntry, 0); if ((lurl == null) || (lurl.comp() == null) || (lurl.comp().url() == null)) { //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); notBoundCounter++; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 306ca4276..a05ed6383 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -205,22 +205,15 @@ public final class plasmaSearchEvent { } else { // prepare result vector directly without worker threads process.startTimer(); - indexRWIEntry entry; - indexURLEntry page; + indexURLEntry uentry; ResultEntry resultEntry; synchronized (rankedCache) { - Iterator indexRWIEntryIterator = rankedCache.entries(); - while ((indexRWIEntryIterator.hasNext()) && (resultList.size() < (query.neededResults()))) { + Iterator urlIterator = rankedCache.entries(wordIndex, true); + while ((urlIterator.hasNext()) && (resultList.size() < (query.neededResults()))) { // fetch next entry - entry = (indexRWIEntry) indexRWIEntryIterator.next(); - page = wordIndex.loadedURL.load(entry.urlHash(), entry); + uentry = (indexURLEntry) urlIterator.next(); - if (page == null) { - registerFailure(entry.urlHash(), "url does not exist in lurl-db"); - continue; - } - - resultEntry = obtainResultEntry(page, (snippetComputationAllTime < 300) ? 1 : 0); + resultEntry = obtainResultEntry(uentry, (snippetComputationAllTime < 300) ? 1 : 0); if (resultEntry == null) continue; // the entry had some problems, cannot be used urlRetrievalAllTime += resultEntry.dbRetrievalTime; snippetComputationAllTime += resultEntry.snippetComputationTime; @@ -581,7 +574,7 @@ public final class plasmaSearchEvent { } } - indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry); + indexURLEntry page = wordIndex.loadedURL.load(entry.urlHash(), entry, 0); if (page == null) { registerFailure(entry.urlHash(), "url does not exist in lurl-db"); continue; @@ -609,7 +602,7 @@ public final class plasmaSearchEvent { private indexRWIEntry nextOrder() { synchronized (rankedCache) { - Iterator i = rankedCache.entries(); + Iterator i = rankedCache.entries(null, false); indexRWIEntry entry; String urlhash; while (i.hasNext()) { diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 53a607978..a19fbc8a0 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -85,28 +85,57 @@ public final class plasmaSearchQuery { public int domMaxTargets; public int maxDistance; public kelondroBitfield constraint; + public boolean allofconstraint; public boolean onlineSnippetFetch; - public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom, - boolean onlineSnippetFetch, - int lines, int offset, long maximumTime, String urlMask, - int domType, String domGroupName, int domMaxTargets, - kelondroBitfield constraint) { - this.queryString = queryString; - this.queryHashes = queryHashes; - this.excludeHashes = excludeHashes; - this.maxDistance = maxDistance; - this.prefer = prefer; - this.contentdom = contentdom; + public plasmaSearchQuery(String queryString, int lines, kelondroBitfield constraint) { + if ((queryString.length() == 12) && (kelondroBase64Order.enhancedCoder.wellformed(queryString.getBytes()))) { + this.queryString = null; + this.queryHashes = new TreeSet(); + this.excludeHashes = new TreeSet(); + this.queryHashes.add(queryString); + } else { + this.queryString = queryString; + TreeSet[] cq = cleanQuery(queryString); + this.queryHashes = plasmaCondenser.words2hashes(cq[0]); + this.excludeHashes = plasmaCondenser.words2hashes(cq[1]); + } + this.maxDistance = Integer.MAX_VALUE; + this.prefer = ""; + this.contentdom = CONTENTDOM_ALL; this.linesPerPage = lines; - this.offset = offset; - this.maximumTime = maximumTime; - this.urlMask = urlMask; - this.domType = domType; - this.domGroupName = domGroupName; - this.domMaxTargets = domMaxTargets; + this.offset = 0; + this.maximumTime = 10000; + this.urlMask = ".*"; + this.domType = SEARCHDOM_LOCAL; + this.domGroupName = ""; + this.domMaxTargets = 0; this.constraint = constraint; - this.onlineSnippetFetch = onlineSnippetFetch; + this.allofconstraint = false; + this.onlineSnippetFetch = false; + } + +public plasmaSearchQuery(String queryString, TreeSet queryHashes, TreeSet excludeHashes, int maxDistance, String prefer, int contentdom, + boolean onlineSnippetFetch, + int lines, int offset, long maximumTime, String urlMask, + int domType, String domGroupName, int domMaxTargets, + kelondroBitfield constraint, boolean allofconstraint) { + this.queryString = queryString; + this.queryHashes = queryHashes; + this.excludeHashes = excludeHashes; + this.maxDistance = maxDistance; + this.prefer = prefer; + this.contentdom = contentdom; + this.linesPerPage = lines; + this.offset = offset; + this.maximumTime = maximumTime; + this.urlMask = urlMask; + this.domType = domType; + this.domGroupName = domGroupName; + this.domMaxTargets = domMaxTargets; + this.constraint = constraint; + this.allofconstraint = allofconstraint; + this.onlineSnippetFetch = onlineSnippetFetch; } public int neededResults() { diff --git a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index b65fc7749..80df289a1 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -61,6 +61,7 @@ public final class plasmaSearchRankingProcess { private int globalcount; private HashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic + private int[] c; // flag counter public plasmaSearchRankingProcess(plasmaSearchQuery query, plasmaSearchProcessing process, plasmaSearchRankingProfile ranking, int maxentries) { // we collect the urlhashes and construct a list with urlEntry objects @@ -74,6 +75,8 @@ public final class plasmaSearchRankingProcess { this.globalcount = 0; this.urlhashes = new HashMap(); this.ref = new kelondroMScoreCluster(); + c = new int[32]; + for (int i = 0; i < 32; i++) {c[i] = 0;} } public void insert(indexContainer container, boolean local) { @@ -83,12 +86,12 @@ public final class plasmaSearchRankingProcess { assert (container != null); if (container.size() == 0) return; - process.startTimer(); + if (process != null) process.startTimer(); if (this.order == null) { this.order = new indexRWIEntryOrder(ranking); } this.order.extend(container); - process.yield("normalizing", container.size()); + if (process != null) process.yield("normalizing", container.size()); /* container.setOrdering(o, 0); @@ -96,7 +99,7 @@ public final class plasmaSearchRankingProcess { */ // normalize entries and get ranking - process.startTimer(); + if (process != null) process.startTimer(); Iterator i = container.entries(); this.pageAcc = new TreeMap(); indexRWIEntry iEntry, l; @@ -106,9 +109,15 @@ public final class plasmaSearchRankingProcess { while (i.hasNext()) { iEntry = (indexRWIEntry) i.next(); if (iEntry.urlHash().length() != container.row().primaryKeyLength) continue; + + // increase flag counts + for (int j = 0; j < 32; j++) { + if (iEntry.flags().get(j)) {c[j]++;} + } + // kick out entries that are too bad acording to current findings r = new Long(order.cardinal(iEntry)); - if ((pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue; + if ((maxentries >= 0) && (pageAcc.size() >= maxentries) && (r.longValue() > biggestEntry)) continue; // check constraints if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint @@ -118,7 +127,7 @@ public final class plasmaSearchRankingProcess { if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue; if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue; } - if (pageAcc.size() < maxentries) { + if ((maxentries < 0) || (pageAcc.size() < maxentries)) { if (urlhashes.containsKey(iEntry.urlHash())) continue; while (pageAcc.containsKey(r)) r = new Long(r.longValue() + 1); pageAcc.put(r, iEntry); @@ -145,7 +154,38 @@ public final class plasmaSearchRankingProcess { if (container.size() > query.neededResults()) remove(true, true); - process.yield(plasmaSearchProcessing.PRESORT, container.size()); + if (process != null) process.yield(plasmaSearchProcessing.PRESORT, container.size()); + } + + public class rIterator implements Iterator { + + boolean urls; + Iterator r; + plasmaWordIndex wi; + public rIterator(plasmaWordIndex wi, boolean fetchURLs) { + // if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects + this.urls = fetchURLs; + this.r = pageAcc.entrySet().iterator(); + this.wi = wi; + } + + public boolean hasNext() { + return r.hasNext(); + } + + public Object next() { + Map.Entry entry = (Map.Entry) r.next(); + indexRWIEntry ientry = (indexRWIEntry) entry.getValue(); + if (urls) { + return wi.loadedURL.load(ientry.urlHash(), ientry, ((Long) entry.getKey()).longValue()); + } else { + return ientry; + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } } public int size() { @@ -153,6 +193,10 @@ public final class plasmaSearchRankingProcess { return pageAcc.size(); } + public int[] flagCount() { + return c; + } + public int filteredCount() { return this.filteredCount; } @@ -170,9 +214,9 @@ public final class plasmaSearchRankingProcess { return iEntry; } - public Iterator entries() { - // returns an iterator of indexRWIEntry objects in the ranked order, best entry first - return this.pageAcc.values().iterator(); + public Iterator entries(plasmaWordIndex wi, boolean fetchURLs) { + // if fetchURLs == true, this iterates indexURLEntry objects, otherwise it iterates indexRWIEntry objects + return new rIterator(wi, fetchURLs); } public Set getReferences(int count) { diff --git a/source/de/anomic/plasma/plasmaSearchRankingProfile.java b/source/de/anomic/plasma/plasmaSearchRankingProfile.java index 277a2d34a..021408ec0 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProfile.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProfile.java @@ -64,6 +64,7 @@ public class plasmaSearchRankingProfile { public static final String HITCOUNT = "hitcount"; public static final String POSINTEXT = "posintext"; public static final String POSOFPHRASE = "posofphrase"; + public static final String POSINPHRASE = "posinphrase"; public static final String WORDDISTANCE = "worddistance"; public static final String APPURL = "appurl"; public static final String APPDESCR = "appdescr"; @@ -154,6 +155,7 @@ public class plasmaSearchRankingProfile { coeff_hitcount = parseMap(coeff, HITCOUNT, coeff_hitcount); coeff_posintext = parseMap(coeff, POSINTEXT, coeff_posintext); coeff_posofphrase = parseMap(coeff, POSOFPHRASE, coeff_posofphrase); + coeff_posinphrase = parseMap(coeff, POSINPHRASE, coeff_posinphrase); coeff_worddistance = parseMap(coeff, WORDDISTANCE, coeff_worddistance); coeff_appurl = parseMap(coeff, APPURL, coeff_appurl); coeff_appdescr = parseMap(coeff, APPDESCR, coeff_appdescr); @@ -207,6 +209,7 @@ public class plasmaSearchRankingProfile { ext.put(prefix + HITCOUNT, Integer.toString(coeff_hitcount)); ext.put(prefix + POSINTEXT, Integer.toString(coeff_posintext)); ext.put(prefix + POSOFPHRASE, Integer.toString(coeff_posofphrase)); + ext.put(prefix + POSINPHRASE, Integer.toString(coeff_posinphrase)); ext.put(prefix + WORDDISTANCE, Integer.toString(coeff_worddistance)); ext.put(prefix + APPURL, Integer.toString(coeff_appurl)); ext.put(prefix + APPDESCR, Integer.toString(coeff_appdescr)); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index a34282c02..688737558 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1499,7 +1499,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (urlhash.equals(yacyURL.dummyHash)) return null; yacyURL ne = crawlQueues.getURL(urlhash); if (ne != null) return ne; - indexURLEntry le = wordIndex.loadedURL.load(urlhash, null); + indexURLEntry le = wordIndex.loadedURL.load(urlhash, null, 0); if (le != null) return le.comp().url(); return null; } @@ -2541,7 +2541,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // finally, delete the url entry // determine the url string - indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null); + indexURLEntry entry = wordIndex.loadedURL.load(urlhash, null, 0); if (entry == null) return 0; indexURLEntry.Components comp = entry.comp(); if (comp.url() == null) return 0; diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 3d6ee6ff4..f7ff1f6f3 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -328,7 +328,7 @@ public class plasmaSwitchboardQueue { public yacyURL referrerURL() { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(yacyURL.dummyHash))) return null; - indexURLEntry entry = lurls.load(referrerHash, null); + indexURLEntry entry = lurls.load(referrerHash, null, 0); if (entry == null) referrerURL = null; else referrerURL = entry.comp().url(); } return referrerURL; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 8c40fc6ed..f772caae7 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -389,89 +389,106 @@ public final class plasmaWordIndex implements indexRI { return containers; } - public Finding retrieveURLs(String keyhash, kelondroBitfield filter, boolean all, int maxcount, boolean loadurl, int sortorder) { + public Finding retrieveURLs(plasmaSearchQuery query, boolean loadurl, int sortorder, plasmaSearchRankingProfile ranking) { // search for a word hash and generate a list of url links // sortorder: 0 = hash, 1 = url, 2 = ranking + assert query.queryHashes.size() == 1; + final TreeSet mi = new TreeSet(); + String keyhash = (String) query.queryHashes.first(); + kelondroBitfield filter = query.constraint; indexContainer index = getContainer(keyhash, null); - final TreeMap tm = new TreeMap(); - final TreeSet mi = new TreeSet(); - final ArrayList indexes = new ArrayList(); + indexRWIEntry ientry; + indexURLEntry uentry; final int[] c = new int[32]; - for (int i = 0; i < 32; i++) {c[i] = 0;} - - if ((index != null) && (index.size() != 0)) { - final Iterator en = index.entries(); - // generate a new map where the urls are sorted (not by hash but by the url text) - - indexRWIEntry ientry; - indexURLEntry uentry; - loop: while (en.hasNext()) { - ientry = (indexRWIEntry) en.next(); - - // test if ientry matches with filter - if (filter != null) { - // if all = true: let only entries pass that has all matching bits - // if all = false: let all entries pass that has at least one matching bit - if (all) { - for (int i = 0; i < 32; i++) { - if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop; - } - } else { - boolean nok = true; - flagtest: for (int i = 0; i < 32; i++) { - if ((filter.get(i)) && (ientry.flags().get(i))) {nok = false; break flagtest;} - } - if (nok) continue loop; - } - } - - // increase flag counts - for (int i = 0; i < 32; i++) { - if (ientry.flags().get(i)) {c[i]++;} - } - - // load url - if (loadurl) { - uentry = loadedURL.load(ientry.urlHash(), null); - if (uentry == null) { - mi.add(ientry.urlHash()); - } else { - if (sortorder == 0) { - tm.put(uentry.comp().url().toNormalform(false, true), new Item(ientry, uentry)); - } - if (sortorder == 1) { - tm.put(ientry.urlHash(), new Item(ientry, uentry)); - } - } - } else { - indexes.add(new Item(ientry, null)); - } - if ((maxcount > 0) && (mi.size() + tm.size() > maxcount)) break loop; - } + for (int i = 0; i < 32; i++) {c[i] = 0;} + + if ((index == null) || (index.size() == 0)) { + return new Finding(mi.iterator(), mi.iterator(), mi, 0, c); } - if (loadurl) { - return new Finding(tm.values().iterator(), mi, tm.size(), c); + + if (sortorder == 2) { + plasmaSearchRankingProcess process = new plasmaSearchRankingProcess(query, null, ranking, query.neededResults()); + process.insert(index, true); + return new Finding(process.entries(this, true), null, mi, process.filteredCount(), process.flagCount()); } else { - return new Finding(indexes.iterator(), mi, indexes.size(), c); + final TreeMap tm = new TreeMap(); + final ArrayList indexes = new ArrayList(); + + final Iterator en = index.entries(); + // generate a new map where the urls are sorted (not by hash but by the url text) + + loop: while (en.hasNext()) { + ientry = (indexRWIEntry) en.next(); + + // test if ientry matches with filter + if (filter != null) { + // if all = true: let only entries pass that has all matching bits + // if all = false: let all entries pass that has at least one matching bit + if (query.allofconstraint) { + for (int i = 0; i < 32; i++) { + if ((filter.get(i)) && (!ientry.flags().get(i))) continue loop; + } + } else { + boolean nok = true; + flagtest: for (int i = 0; i < 32; i++) { + if ((filter.get(i)) && (ientry.flags().get(i))) {nok = false; break flagtest;} + } + if (nok) continue loop; + } + } + + // increase flag counts + for (int i = 0; i < 32; i++) { + if (ientry.flags().get(i)) {c[i]++;} + } + + // load url + if (loadurl) { + uentry = loadedURL.load(ientry.urlHash(), ientry, 0); + if (uentry == null) { + mi.add(ientry.urlHash()); + } else { + if (sortorder == 0) { + tm.put(uentry.comp().url().toNormalform(false, true), uentry); + } + if (sortorder == 1) { + tm.put(ientry.urlHash(), uentry); + } + } + } else { + indexes.add(ientry); + } + if ((query.neededResults() > 0) && (mi.size() + tm.size() > query.neededResults())) break loop; + } // end loop + if (loadurl) { + return new Finding(tm.values().iterator(), null, mi, tm.size(), c); + } else { + return new Finding(null, indexes.iterator(), mi, indexes.size(), c); + } } } - public class Finding { - private Iterator items; // an iterator if Items objects + public static class Finding { + private Iterator urls; // an iterator if indexURLEntry objects + private Iterator rwientries; // an iterator of indexRWIEntry objects private Set misses; // a set of hashes where we did not found items private int findcount; private int[] flagcount; - public Finding(Iterator items, Set misses, int findcount, int[] flagcount) { + public Finding(Iterator urls, Iterator rwientries, Set misses, int findcount, int[] flagcount) { this.findcount = findcount; - this.items = items; + this.urls = urls; + this.rwientries = rwientries; this.misses = misses; this.flagcount = flagcount; } public int size() { return this.findcount; } - public Iterator hit() { - return this.items; + public Iterator urls() { + return this.urls; + } + public Iterator rwientries() { + return this.rwientries; } public Set miss() { return this.misses; @@ -481,28 +498,6 @@ public final class plasmaWordIndex implements indexRI { } } - public class Item { - private indexRWIEntry ientry; - private indexURLEntry uentry; - public Item() { - ientry = null; - uentry = null; - } - public Item(indexRWIEntry ientry, indexURLEntry uentry) { - this.ientry = ientry; - this.uentry = uentry; - } - public boolean found() { - return (ientry != null) && (uentry != null); - } - public indexRWIEntry index() { - return this.ientry; - } - public indexURLEntry url() { - return this.uentry; - } - } - public int size() { return java.lang.Math.max(collections.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size())); } @@ -712,7 +707,7 @@ public final class plasmaWordIndex implements indexRI { entry = (indexRWIEntry) containerIterator.next(); // System.out.println("Wordhash: "+wordHash+" UrlHash: // "+entry.getUrlHash()); - indexURLEntry ue = lurl.load(entry.urlHash(), null); + indexURLEntry ue = lurl.load(entry.urlHash(), entry, 0); if (ue == null) { urlHashs.add(entry.urlHash()); } else { diff --git a/source/de/anomic/yacy/yacyURL.java b/source/de/anomic/yacy/yacyURL.java index a3f499051..393e5e6b2 100644 --- a/source/de/anomic/yacy/yacyURL.java +++ b/source/de/anomic/yacy/yacyURL.java @@ -991,10 +991,11 @@ public class yacyURL { return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(subdom + ":" + port + ":" + rootpath)).charAt(0); } - private static final char rootURLFlag = subdomPortPath("www", 80, ""); + private static final char rootURLFlag0 = subdomPortPath("", 80, ""); + private static final char rootURLFlag1 = subdomPortPath("www", 80, ""); public static final boolean probablyRootURL(String urlHash) { - return (urlHash.charAt(5) == rootURLFlag); + return (urlHash.charAt(5) == rootURLFlag0) || (urlHash.charAt(5) == rootURLFlag1); } private static String protocolHostPort(String protocol, String host, int port) { diff --git a/source/yacy.java b/source/yacy.java index 1e7500fa7..3fd2089c3 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -630,7 +630,7 @@ public final class yacy { iEntry = (indexRWIEntry) wordIdxEntries.next(); String urlHash = iEntry.urlHash(); if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try { - indexURLEntry urlEntry = currentUrlDB.load(urlHash, null); + indexURLEntry urlEntry = currentUrlDB.load(urlHash, null, 0); urlCounter++; minimizedUrlDB.store(urlEntry); if (urlCounter % 500 == 0) {
 hashurlposphraseurlcompsurllengthprops
ResourceNegative Ranking FactorsPositive Ranking FactorsReverse Normalized Weighted Ranking Sum
 hashurldom lengthybrurl compsurl lengthpos in textpos of phrasepos in phraseword distancedatewords in titlewords in textlocal linksremote linkshitcountprops
+ <unresolved URL Hash> #[urlhxValue]# #[urlStringShort]##[pos]##[phrase]##[urlcomps]##[urllength]##[domlength]##[ybr]##[urlcomps]##[urllength]##[pos]##[phrase]##[posinphrase]##[worddistance]##[date]##[wordsintitle]##[wordsintext]##[llocal]##[lother]##[hitcount]# #[props]##[ranking]#