From f0db5016307fc33ebc180a9852dc7a0416ca054b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 22 May 2014 03:01:07 +0200 Subject: [PATCH] better handling of ranking parameters and new default values for date navigation which is done using ranking in solr. --- defaults/yacy.init | 8 ++++---- htroot/IndexControlRWIs_p.java | 8 ++++---- htroot/RankingSolr_p.html | 2 +- htroot/yacysearchitem.java | 2 +- .../sorting/WeakPriorityBlockingQueue.java | 18 +++++++++++------- .../kelondro/data/meta/URIMetadataNode.java | 16 +++++++++------- source/net/yacy/search/index/Fulltext.java | 10 +++++----- source/net/yacy/search/query/SearchEvent.java | 6 +++--- .../net/yacy/search/snippet/ResultEntry.java | 4 ++-- 9 files changed, 40 insertions(+), 34 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index ca104a19e..232b7ea27 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -992,19 +992,19 @@ search.ranking.rwi.profile = # All boost methods > 0 must have names to be able to select this name with a query, with the syntax /name search.ranking.solr.collection.boostname.tmpa.0=Default Profile search.ranking.solr.collection.boostfields.tmpa.0=url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,host_s^6.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^3.0 -search.ranking.solr.collection.boostquery.tmpa.0=clickdepth_i:0^0.8 clickdepth_i:1^0.4 +search.ranking.solr.collection.boostquery.tmpa.0=crawldepth_i:0^0.8 crawldepth_i:1^0.4 search.ranking.solr.collection.boostfunction.tmpb.0= search.ranking.solr.collection.boostname.tmpa.1=Date Profile: sort by date in descending order for a '/data' usage search.ranking.solr.collection.boostfields.tmpa.1=text_t^1.0 -search.ranking.solr.collection.boostquery.tmpa.1=clickdepth_i:0^0.8 clickdepth_i:1^0.4 -search.ranking.solr.collection.boostfunction.tmpb.1=recip(rord(last_modified),1,1000,1000) +search.ranking.solr.collection.boostquery.tmpa.1=crawldepth_i:0^0.8 crawldepth_i:1^0.4 +search.ranking.solr.collection.boostfunction.tmpb.1=recip(ms(NOW,last_modified),3.16e-11,1,1) search.ranking.solr.collection.boostname.tmpa.2=Intranet Profile: when a search is done on a singe domain only, i.e. if a site:-operator is used search.ranking.solr.collection.boostfields.tmpa.2=url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^3.0,h3_txt^2.0 search.ranking.solr.collection.boostquery.tmpa.2=fuzzy_signature_unique_b:true^10.0 search.ranking.solr.collection.boostfunction.tmpb.2= search.ranking.solr.collection.boostname.tmpa.3=_unused3 search.ranking.solr.collection.boostfields.tmpa.3=text_t^1.0 -search.ranking.solr.collection.boostquery.tmpa.3=clickdepth_i:0^0.8 clickdepth_i:1^0.4 +search.ranking.solr.collection.boostquery.tmpa.3=crawldepth_i:0^0.8 crawldepth_i:1^0.4 search.ranking.solr.collection.boostfunction.tmpb.3= # the following values are used to identify duplicate content diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 8afd32597..6ceaa4de1 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -479,15 +479,15 @@ public class IndexControlRWIs_p { DigestURL url; URIMetadataNode entry; String us; - long rn = -1; + float rn = Float.MIN_VALUE; while (!theSearch.rwiIsEmpty() && (entry = theSearch.pullOneFilteredFromRWI(false)) != null) { url = entry.url(); if ( url == null ) { continue; } us = url.toNormalform(true); - if ( rn == -1 ) { - rn = entry.ranking(); + if ( rn == Float.MIN_VALUE ) { + rn = entry.score(); } prop.put("genUrlList_urlList_" + i + "_urlExists", "1"); prop.put("genUrlList_urlList_" + i + "_urlExists_urlhxCount", i); @@ -497,7 +497,7 @@ public class IndexControlRWIs_p { prop.putHTML("genUrlList_urlList_" + i + "_urlExists_urlString", us); prop.put("genUrlList_urlList_" + i + "_urlExists_urlStringShort", (us.length() > 40) ? (us.substring(0, 20) + "
" + us.substring(20, 40) + "...") : ((us.length() > 30) ? (us.substring(0, 20) + "
" + us.substring(20)) : us)); - prop.putNum("genUrlList_urlList_" + i + "_urlExists_ranking", (entry.ranking() - rn)); + prop.putNum("genUrlList_urlList_" + i + "_urlExists_ranking", Float.toString(entry.score() - rn)); prop.putNum("genUrlList_urlList_" + i + "_urlExists_domlength", DigestURL.domLengthEstimation(entry.hash())); prop.putNum("genUrlList_urlList_" + i + "_urlExists_tf", 1000.0 * entry.word().termFrequency()); prop.putNum("genUrlList_urlList_" + i + "_urlExists_authority", (theSearch.getOrder() == null) ? -1 : theSearch.getOrder().authority(ASCII.String(entry.hash(), 6, 6))); diff --git a/htroot/RankingSolr_p.html b/htroot/RankingSolr_p.html index 9fab3980b..34a431f95 100644 --- a/htroot/RankingSolr_p.html +++ b/htroot/RankingSolr_p.html @@ -25,7 +25,7 @@ A Boost Function can combine numeric values from the result document to produce a number which is multiplied with the score value from the query result. To see all available fields, see the YaCy Solr Schema and look for numeric values (these are names with suffix '_i'). To find out which kind of operations are possible, see the Solr Function Query documentation. - Example: to order by date, use "recip(rord(last_modified),1,1000,1000)", to order by clickdepth, use "div(100,add(clickdepth_i,1))". + Example: to order by date, use "recip(ms(NOW,last_modified),3.16e-11,1,1)", to order by crawldepth, use "div(100,add(crawldepth_i,1))".
diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 434bcee3f..ae6d9b311 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -195,7 +195,7 @@ public class yacysearchitem { if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent); prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading prop.put("content_urlhash", resulthashString); - prop.put("content_ranking", result.ranking()); + prop.put("content_ranking", Float.toString(result.score())); prop.put("content_showMetadata_urlhash", resulthashString); prop.put("content_showCache_link", resultUrlstring); prop.put("content_showProxy_link", resultUrlstring); diff --git a/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java b/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java index 95b66f3db..c926303f0 100644 --- a/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java +++ b/source/net/yacy/cora/sorting/WeakPriorityBlockingQueue.java @@ -287,7 +287,7 @@ public class WeakPriorityBlockingQueue implements Serializable { return this.drained.iterator(); } - public interface Element extends Serializable { + public interface Element extends Serializable, Comparable>, Comparator> { public long getWeight(); public E getElement(); public boolean equals(Element o); @@ -295,6 +295,10 @@ public class WeakPriorityBlockingQueue implements Serializable { public int hashCode(); @Override public String toString(); + @Override + public int compare(Element o1, Element o2); + @Override + public int compareTo(Element o); } private abstract static class AbstractElement implements Element, Serializable { @@ -334,7 +338,7 @@ public class WeakPriorityBlockingQueue implements Serializable { * natural ordering elements, can be used as container of objects in the priority queue * the elements with smallest ordering weights are first in the queue when elements are taken */ - public static class NaturalElement extends AbstractElement implements Element, Comparable>, Comparator> { + public static class NaturalElement extends AbstractElement implements Element, Comparable>, Comparator> { private static final long serialVersionUID = 6816543012966928794L; @@ -344,12 +348,12 @@ public class WeakPriorityBlockingQueue implements Serializable { } @Override - public int compare(NaturalElement o1, NaturalElement o2) { + public int compare(Element o1, Element o2) { return o1.compareTo(o2); } @Override - public int compareTo(NaturalElement o) { + public int compareTo(Element o) { if (this.element == o.getElement()) return 0; if (this.element.equals(o.getElement())) return 0; if (this.weight > o.getWeight()) return 1; @@ -367,7 +371,7 @@ public class WeakPriorityBlockingQueue implements Serializable { * reverse ordering elements, can be used as container of objects in the priority queue * the elements with highest ordering weights are first in the queue when elements are taken */ - public static class ReverseElement extends AbstractElement implements Element, Comparable>, Comparator> { + public static class ReverseElement extends AbstractElement implements Element, Comparable>, Comparator> { private static final long serialVersionUID = -8166724491837508921L; @@ -377,12 +381,12 @@ public class WeakPriorityBlockingQueue implements Serializable { } @Override - public int compare(ReverseElement o1, ReverseElement o2) { + public int compare(Element o1, Element o2) { return o1.compareTo(o2); } @Override - public int compareTo(ReverseElement o) { + public int compareTo(Element o) { if (this.element == o.getElement()) return 0; if (this.element.equals(o.getElement())) return 0; if (this.weight > o.getWeight()) return -1; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 13e4351b1..9f09c7fad 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -72,7 +72,7 @@ public class URIMetadataNode extends SolrDocument { protected Bitfield flags = null; protected int imagec = -1, audioc = -1, videoc = -1, appc = -1; protected double lat = Double.NaN, lon = Double.NaN; - protected long ranking = 0; // during generation of a search result this value is set + protected float score = 0; // during generation of a search result this value is set protected String snippet = null; protected WordReferenceVars word = null; // this is only used if the url is transported via remote search requests @@ -139,6 +139,7 @@ public class URIMetadataNode extends SolrDocument { this.videoc = Integer.parseInt(prop.getProperty("lvideo", "0")); this.appc = Integer.parseInt(prop.getProperty("lapp", "0")); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); + this.score = Float.parseFloat(prop.getProperty("score", "0.0")); this.word = null; if (prop.containsKey("wi")) { this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))), false); @@ -151,8 +152,8 @@ public class URIMetadataNode extends SolrDocument { this.addField(name, doc.getFieldValue(name)); } this.snippet = ""; - Float score = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result - this.ranking = score == null ? 0 : (long) (1000000.0f * score.floatValue()); // solr score values are sometimes very low + Float scorex = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result + this.score = scorex == null ? 0.0f : scorex.floatValue(); this.hash = ASCII.getBytes(getString(CollectionSchema.id)); this.urlRaw = getString(CollectionSchema.sku); try { @@ -163,10 +164,10 @@ public class URIMetadataNode extends SolrDocument { } } - public URIMetadataNode(final SolrDocument doc, final WordReferenceVars searchedWord, final long ranking) { + public URIMetadataNode(final SolrDocument doc, final WordReferenceVars searchedWord, final float scorex) { this(doc); this.word = searchedWord; - this.ranking = ranking; + this.score = scorex; } /** @@ -254,8 +255,8 @@ public class URIMetadataNode extends SolrDocument { return this.lon; } - public long ranking() { - return this.ranking; + public float score() { + return this.score; } public Date loaddate() { @@ -467,6 +468,7 @@ public class URIMetadataNode extends SolrDocument { s.append(",laudio=").append(this.laudio()); s.append(",lvideo=").append(this.lvideo()); s.append(",lapp=").append(this.lapp()); + s.append(",score=").append(Float.toString(this.score())); if (this.word() != null) { // append also word properties final String wprop = this.word().toPropertyForm(); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index cd05bdabb..e62a1b947 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -276,24 +276,24 @@ public final class Fulltext { if (element == null) return null; WordReferenceVars wre = element.getElement(); if (wre == null) return null; // all time was already wasted in takeRWI to get another element - long weight = element.getWeight(); - URIMetadataNode node = getMetadata(wre.urlhash(), wre, weight); + float score = element.getWeight(); + URIMetadataNode node = getMetadata(wre.urlhash(), wre, score); return node; } public URIMetadataNode getMetadata(final byte[] urlHash) { if (urlHash == null) return null; - return getMetadata(urlHash, null, 0); + return getMetadata(urlHash, null, 0.0f); } - private URIMetadataNode getMetadata(final byte[] urlHash, final WordReferenceVars wre, final long weight) { + private URIMetadataNode getMetadata(final byte[] urlHash, final WordReferenceVars wre, final float score) { String u = ASCII.String(urlHash); // get the metadata from Solr try { SolrDocument doc = this.getDefaultConnector().getDocumentById(u); if (doc != null) { - return new URIMetadataNode(doc, wre, weight); + return new URIMetadataNode(doc, wre, score); } } catch (final IOException e) { ConcurrentLog.logException(e); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 1bf8c7a1e..d851d3142 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -907,7 +907,7 @@ public final class SearchEvent { this.urlhashes.putUnique(iEntry.hash()); rankingtryloop: while (true) { try { - long score = iEntry.ranking(); + long score = (long) (1000000.0f * iEntry.score()); this.nodeStack.put(new ReverseElement(iEntry, score == 0 ? this.order.cardinal(iEntry) : score)); // inserts the element and removes the worst (which is smallest) break rankingtryloop; } catch (final ArithmeticException e ) { @@ -1291,8 +1291,8 @@ public final class SearchEvent { */ public void addResult(ResultEntry resultEntry) { if (resultEntry == null) return; - long ranking = resultEntry.ranking(); - ranking += postRanking(resultEntry, new ConcurrentScoreMap() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/); + float score = resultEntry.score(); + final long ranking = ((long) (score * 128.f)) + postRanking(resultEntry, new ConcurrentScoreMap() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/); this.resultList.put(new ReverseElement(resultEntry, ranking)); // remove smallest in case of overflow if (pollImmediately) this.resultList.poll(); // prevent re-ranking in case there is only a single index source which has already ranked entries. this.addTopics(resultEntry); diff --git a/source/net/yacy/search/snippet/ResultEntry.java b/source/net/yacy/search/snippet/ResultEntry.java index 7eb6b855f..056e0cde0 100644 --- a/source/net/yacy/search/snippet/ResultEntry.java +++ b/source/net/yacy/search/snippet/ResultEntry.java @@ -229,7 +229,7 @@ public class ResultEntry implements Comparable, Comparator