From 9d5d86cd03dae92363ec1ddf83de9fefd1dbe331 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 15 Jun 2014 12:38:30 +0200 Subject: [PATCH] Added filter query options to the ranking servlet /RankingSolr_p.html. Filter queries are not actually related to ranking, but user requests have pointed out that specific boost queries to move results to the end of the result list are not sufficient. Such boost filters may be better executed as actual filter and therefore such a filter can now be statically applied to every search request. A typical use could be the expression "http_unique_b:true AND www_unique_b:true" which uses the recently introduced fields http_unique_b and www_unique_b which are true only for one of the alternatives with/without http(s) and with/without prefix 'www.' in host names. --- defaults/yacy.init | 4 +++ htroot/RankingSolr_p.html | 24 +++++++++++++++-- htroot/RankingSolr_p.java | 16 ++++++++++++ .../net/yacy/cora/federate/solr/Ranking.java | 26 ++++++++++++++++--- .../yacy/http/servlets/GSAsearchServlet.java | 2 ++ .../yacy/http/servlets/SolrSelectServlet.java | 2 ++ source/net/yacy/search/Switchboard.java | 2 ++ .../net/yacy/search/SwitchboardConstants.java | 1 + source/net/yacy/search/query/QueryParams.java | 7 ++++- 9 files changed, 78 insertions(+), 6 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 5bbe3ecff..9dfb51027 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1000,18 +1000,22 @@ search.ranking.rwi.profile = # All boost methods > 0 must have names to be able to select this name with a query, with the syntax /name search.ranking.solr.collection.boostname.tmpa.0=Default Profile search.ranking.solr.collection.boostfields.tmpa.0=url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,host_s^6.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^3.0 +search.ranking.solr.collection.filterquery.tmpa.0= search.ranking.solr.collection.boostquery.tmpa.0=crawldepth_i:0^0.8 crawldepth_i:1^0.4 
search.ranking.solr.collection.boostfunction.tmpb.0= search.ranking.solr.collection.boostname.tmpa.1=Date Profile: sort by date in descending order for a '/data' usage search.ranking.solr.collection.boostfields.tmpa.1=text_t^1.0 +search.ranking.solr.collection.filterquery.tmpa.1= search.ranking.solr.collection.boostquery.tmpa.1=crawldepth_i:0^0.8 crawldepth_i:1^0.4 search.ranking.solr.collection.boostfunction.tmpb.1=recip(ms(NOW,last_modified),3.16e-11,1,1) search.ranking.solr.collection.boostname.tmpa.2=Intranet Profile: when a search is done on a singe domain only, i.e. if a site:-operator is used search.ranking.solr.collection.boostfields.tmpa.2=url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^3.0,h3_txt^2.0 +search.ranking.solr.collection.filterquery.tmpa.2= search.ranking.solr.collection.boostquery.tmpa.2=fuzzy_signature_unique_b:true^10.0 search.ranking.solr.collection.boostfunction.tmpb.2= search.ranking.solr.collection.boostname.tmpa.3=_unused3 search.ranking.solr.collection.boostfields.tmpa.3=text_t^1.0 +search.ranking.solr.collection.filterquery.tmpa.3= search.ranking.solr.collection.boostquery.tmpa.3=crawldepth_i:0^0.8 crawldepth_i:1^0.4 search.ranking.solr.collection.boostfunction.tmpb.3= diff --git a/htroot/RankingSolr_p.html b/htroot/RankingSolr_p.html index 34a431f95..6c37b03f4 100644 --- a/htroot/RankingSolr_p.html +++ b/htroot/RankingSolr_p.html @@ -27,7 +27,7 @@ To find out which kind of operations are possible, see the Solr Function Query documentation. Example: to order by date, use "recip(ms(NOW,last_modified),3.16e-11,1,1)", to order by crawldepth, use "div(100,add(crawldepth_i,1))".
-
+
@@ -47,7 +47,7 @@ Example: "fuzzy_signature_unique_b:true^100000.0f" means that documents, identified as 'double' are ranked very bad and appended to the end of all results (because the unique are ranked high). To find appropriate fields for this query, see the YaCy Solr Schema and look for boolean values (with suffix '_b') or tags inside string fields (with suffix '_s' or '_sxt').
-
+
@@ -59,6 +59,26 @@
+
+
+ + Filter Query + The Filter Query is attached to every query. Use this to statically add a selection criterion that reduces the set of results. + Example: "http_unique_b:true AND www_unique_b:true" will filter out all results whose URLs also appear with/without http(s) and/or with/without the 'www.' prefix. + To find appropriate fields for this query, see the YaCy Solr Schema. Warning: a bad expression here will cause your searches to return no results!
+
+
+ +
+
+
+ + +
+
+
+
diff --git a/htroot/RankingSolr_p.java b/htroot/RankingSolr_p.java index 55f1547d6..aa930c9be 100644 --- a/htroot/RankingSolr_p.java +++ b/htroot/RankingSolr_p.java @@ -88,6 +88,21 @@ public class RankingSolr_p { } } + if (post != null && post.containsKey("EnterFQ")) { + String fq = post.get("fq"); + if (fq != null) { + sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_FILTERQUERY_ + profileNr, fq); + sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setFilterQuery(fq); + } + } + if (post != null && post.containsKey("ResetFQ")) { + String fq = ""; // i.e. "http_unique_b:true AND www_unique_b:true" + if (fq != null) { + sb.setConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_FILTERQUERY_ + profileNr, fq); + sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr).setFilterQuery(fq); + } + } + if (post != null && post.containsKey("EnterBF")) { String bf = post.get("bf"); if (bf != null) { @@ -124,6 +139,7 @@ public class RankingSolr_p { i++; } prop.put("boosts", i); + prop.put("fq", ranking.getFilterQuery()); prop.put("bq", ranking.getBoostQuery()); prop.put("bf", ranking.getBoostFunction()); diff --git a/source/net/yacy/cora/federate/solr/Ranking.java b/source/net/yacy/cora/federate/solr/Ranking.java index 1d62e5772..b182e5150 100644 --- a/source/net/yacy/cora/federate/solr/Ranking.java +++ b/source/net/yacy/cora/federate/solr/Ranking.java @@ -38,17 +38,17 @@ public class Ranking { private static int minTokenLen = 3; // to be filled with search.ranking.solr.doubledetection.minlength private Map fieldBoosts; - private String name, boostQuery, boostFunction; + private String name, filterQuery, boostQuery, boostFunction; public Ranking() { super(); this.name = ""; this.fieldBoosts = new LinkedHashMap(); + this.filterQuery = ""; this.boostQuery = ""; this.boostFunction = ""; } - public String getName() { return name; } @@ -95,12 +95,32 @@ public class Ranking { } } + /** + * set a filter query which will be added as 
fq-attribute to the query + * @param filterQuery + */ + public void setFilterQuery(String filterQuery) { + this.filterQuery = filterQuery; + } + + /** + * get a string that can be added as a filter query at the fq-attribute + * @return + */ + public String getFilterQuery() { + return this.filterQuery; + } + + /** + * set a boost query which will be added as bq-attribute to the query + * @param boostQuery + */ public void setBoostQuery(String boostQuery) { this.boostQuery = boostQuery; } /** - * produce a string that can be added as a 'boost query' at the bq-attribute + * get a string that can be added as a 'boost query' at the bq-attribute * @return */ public String getBoostQuery() { diff --git a/source/net/yacy/http/servlets/GSAsearchServlet.java b/source/net/yacy/http/servlets/GSAsearchServlet.java index bd67c1d59..0109a308b 100644 --- a/source/net/yacy/http/servlets/GSAsearchServlet.java +++ b/source/net/yacy/http/servlets/GSAsearchServlet.java @@ -148,8 +148,10 @@ public class GSAsearchServlet extends HttpServlet { } else { // if no such sort attribute is given, use the ranking as configured for YaCy Ranking ranking = sb.index.fulltext().getDefaultConfiguration().getRanking(0); + String fq = ranking.getFilterQuery(); String bq = ranking.getBoostQuery(); String bf = ranking.getBoostFunction(); + if (fq.length() > 0) post.put(CommonParams.FQ, fq); // static filter query of the ranking profile, sent as Solr 'fq' if (bq.length() > 0) post.put("bq", bq); if (bf.length() > 0) post.put("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29 } diff --git a/source/net/yacy/http/servlets/SolrSelectServlet.java b/source/net/yacy/http/servlets/SolrSelectServlet.java index 0d3e3b072..bdf5b72ce 100644 --- a/source/net/yacy/http/servlets/SolrSelectServlet.java +++ b/source/net/yacy/http/servlets/SolrSelectServlet.java @@ -157,8 +157,10 @@ public class SolrSelectServlet extends HttpServlet { if (!mmsp.getMap().containsKey("sort") && !mmsp.getMap().containsKey("bq") && 
!mmsp.getMap().containsKey("bf") && !mmsp.getMap().containsKey("boost")) { if (!mmsp.getMap().containsKey("defType")) mmsp.getMap().put("defType", new String[]{"edismax"}); Ranking ranking = sb.index.fulltext().getDefaultConfiguration().getRanking(profileNr); + String fq = ranking.getFilterQuery(); String bq = ranking.getBoostQuery(); String bf = ranking.getBoostFunction(); + if (fq.length() > 0) mmsp.getMap().put("fq", new String[]{fq}); if (bq.length() > 0) mmsp.getMap().put("bq", new String[]{bq}); if (bf.length() > 0) mmsp.getMap().put("boost", new String[]{bf}); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29 } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 5677b6cf1..773afccf2 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -487,6 +487,7 @@ public final class Switchboard extends serverSwitch { Ranking r = solrCollectionConfigurationWork.getRanking(i); String name = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTNAME_ + i, "_dummy" + i); String boosts = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ + i, "text_t^1.0"); + String fq = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_FILTERQUERY_ + i, ""); String bq = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ + i, ""); String bf = this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ + i, ""); // apply some hard-coded patches for earlier experiments we do not want any more @@ -496,6 +497,7 @@ public final class Switchboard extends serverSwitch { if (boosts.equals("url_paths_sxt^1000.0,synonyms_sxt^1.0,title^10000.0,text_t^2.0,h1_txt^1000.0,h2_txt^100.0,host_organization_s^100000.0")) boosts = 
"url_paths_sxt^3.0,synonyms_sxt^0.5,title^5.0,text_t^1.0,host_s^6.0,h1_txt^5.0,url_file_name_tokens_t^4.0,h2_txt^2.0"; r.setName(name); r.updateBoosts(boosts); + r.setFilterQuery(fq); r.setBoostQuery(bq); r.setBoostFunction(bf); } diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 9e8d187c8..d5ee6b25e 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -523,6 +523,7 @@ public final class SwitchboardConstants { */ public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTNAME_ = "search.ranking.solr.collection.boostname.tmpa."; // temporary until we know best default values; add the index number (0..3) to that string public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTFIELDS_ = "search.ranking.solr.collection.boostfields.tmpa."; + public static final String SEARCH_RANKING_SOLR_COLLECTION_FILTERQUERY_ = "search.ranking.solr.collection.filterquery.tmpa."; public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTQUERY_ = "search.ranking.solr.collection.boostquery.tmpa."; public static final String SEARCH_RANKING_SOLR_COLLECTION_BOOSTFUNCTION_ = "search.ranking.solr.collection.boostfunction.tmpb."; diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index c5ed882cb..af71cf6f6 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -347,7 +347,8 @@ public final class QueryParams { int rankingProfile = this.ranking.coeff_date == RankingProfile.COEFF_MAX ? 1 : (this.modifier.sitehash != null || this.modifier.sitehost != null) ? 
2 : 0; params.setQuery(this.queryGoal.collectionTextQueryString(this.indexSegment.fulltext().getDefaultConfiguration(), rankingProfile, excludeintext_image).toString()); Ranking ranking = indexSegment.fulltext().getDefaultConfiguration().getRanking(rankingProfile); // for a by-date ranking select different ranking profile - + + String fq = ranking.getFilterQuery(); String bq = ranking.getBoostQuery(); String bf = ranking.getBoostFunction(); if (this.queryGoal.getIncludeSize() > 1) { @@ -355,6 +356,10 @@ public final class QueryParams { if (bq.length() > 0) bq += " "; bq += CollectionSchema.text_t.getSolrFieldName() + ":\"" + this.queryGoal.getIncludeString() + "\"^10"; } + if (fq.length() > 0) { + String oldfq = params.get("fq"); + params.setParam("fq", oldfq == null || oldfq.length() == 0 ? fq : "(" + oldfq + ") AND (" + fq + ")"); + } if (bq.length() > 0) params.setParam("bq", bq); if (bf.length() > 0) params.setParam("boost", bf); // a boost function extension, see http://wiki.apache.org/solr/ExtendedDisMax#bf_.28Boost_Function.2C_additive.29