Added a Boost class which stores Solr query boost values. The class can be
configured using the yacy.init file. The boost information is read from the
configuration each time a query to Solr is made.
pull/1/head
Michael Peter Christen 12 years ago
parent ea033f8f8e
commit 72f165d58b

@ -943,7 +943,10 @@ WikiAccess = admin
# we will support different search profiles
# this is currently only a single default profile
# If this profile setting is empty, a hard-coded profile from plasmaSearchRanking is used
rankingProfile =
search.ranking.rwi.profile =
search.ranking.solr.boost.tmp=
search.ranking.solr.doubledetection.minlength=3
search.ranking.solr.doubledetection.quantrate=0.5f
#optional external thumbnail program.
#the program must accept the invocation PROGRAM http://url /path/to/filename

@ -32,6 +32,7 @@ import java.util.Map.Entry;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.ranking.RankingProfile;
import net.yacy.server.serverObjects;
@ -159,7 +160,7 @@ public class Ranking_p {
if (post.containsKey("EnterRanking")) {
final RankingProfile ranking = new RankingProfile("local", post.toString());
sb.setConfig("rankingProfile", crypt.simpleEncode(ranking.toExternalString()));
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, crypt.simpleEncode(ranking.toExternalString()));
final serverObjects prop = defaultValues();
//prop.putAll(ranking.toExternalMap("local"));
putRanking(prop, ranking, "local");
@ -167,7 +168,7 @@ public class Ranking_p {
}
if (post.containsKey("ResetRanking")) {
sb.setConfig("rankingProfile", "");
sb.setConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "");
final RankingProfile ranking = new RankingProfile(Classification.ContentDomain.TEXT);
final serverObjects prop = defaultValues();
//prop.putAll(ranking.toExternalMap("local"));

@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.Map;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter;
@ -34,6 +35,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.QueryGoal;
import net.yacy.search.query.SearchEvent;
@ -99,6 +101,9 @@ public class searchresult {
Log.logInfo("GSA Query", post.toString());
sb.intermissionAllThreads(3000); // tell all threads to do nothing for a specific time
// update the boost values
Boost.RANKING.update(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, ""));
// rename post fields according to result style
//post.put(CommonParams.Q, post.remove("q")); // same as solr
//post.put(CommonParams.START, post.remove("start")); // same as solr
@ -115,7 +120,8 @@ public class searchresult {
post.put(CommonParams.ROWS, post.remove("num"));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
post.put("defType", "edismax");
post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back
float f = Boost.RANKING.get(YaCySchema.fuzzy_signature_unique_b);
post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^" + Float.toString(f)); // a boost query that moves double content to the back
post.put(CommonParams.FL,
YaCySchema.content_type.getSolrFieldName() + ',' +
YaCySchema.id.getSolrFieldName() + ',' +

@ -142,7 +142,7 @@ public class select {
if (post == null) return null;
Log.logInfo("SOLR Query", post.toString());
sb.intermissionAllThreads(3000); // tell all threads to do nothing for a specific time
// rename post fields according to result style
if (!post.containsKey(CommonParams.Q)) post.put(CommonParams.Q, post.remove("query")); // sru patch
String q = post.get(CommonParams.Q, "");

@ -57,7 +57,7 @@ import org.apache.solr.update.processor.Lookup3Signature;
public class EnhancedTextProfileSignature extends Lookup3Signature {
private float quantRate = 0.01f;
private float minTokenLen = 2;
private int minTokenLen = 2;
private StringBuilder evalText = new StringBuilder(120); // start with some capacity, makes it much faster.
@Override

@ -0,0 +1,102 @@
/**
* Boost
* Copyright 2012 by Michael Peter Christen
* First released 30.11.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate.solr;
import java.util.LinkedHashMap;
import net.yacy.cora.util.CommonPattern;
/**
 * The Boost class is the Solr ranking definition. It contains boost values in a LinkedHashMap; the 'linked' version is used
 * to maintain the order of the entries, which shall be stable according to the iteration order within a configuration servlet.
 * Because the order is influenced by a double-check mechanism, the attributes used to compute a document signature
 * (quantRate and minTokenLen) are also integrated into this class.
 *
 * This class adds no synchronization of its own on top of LinkedHashMap.
 */
public class Boost extends LinkedHashMap<YaCySchema, Float> {

    private static final long serialVersionUID = 5248172257724571603L;

    /** the single shared instance; the constructor is private, so no other instance can be created from outside */
    public final static Boost RANKING = new Boost();

    // for minTokenLen = 2 the quantRate value should not be below 0.24; for minTokenLen = 3 the quantRate value must be not below 0.5!
    private float quantRate = 0.5f; // to be filled with search.ranking.solr.doubledetection.quantrate
    private int minTokenLen = 3;    // to be filled with search.ranking.solr.doubledetection.minlength

    /**
     * fill the map with the hard-coded default boost values;
     * the insertion order here defines the iteration order of the map
     */
    private Boost() {
        super();
        put(YaCySchema.sku, 20.0f);
        put(YaCySchema.url_paths_sxt, 20.0f);
        put(YaCySchema.title, 15.0f);
        put(YaCySchema.h1_txt, 11.0f);
        put(YaCySchema.h2_txt, 10.0f);
        put(YaCySchema.author, 8.0f);
        put(YaCySchema.description, 5.0f);
        put(YaCySchema.keywords, 2.0f);
        put(YaCySchema.text_t, 1.0f);
        put(YaCySchema.fuzzy_signature_unique_b, 100000.0f); // must be very high to move double results to end of list
    }

    /**
     * override the get method to return 1.0f for each non-resolvable object
     * @param field the key to look up
     * @return the boost stored for the field or 1.0f if there is no entry for it; never null
     */
    @Override
    public Float get(Object field) {
        Float boost = super.get(field);
        if (boost == null) return 1.0f;
        return boost;
    }

    /**
     * The boostDef is a definition string that comes from a configuration file.
     * It should be a comma-separated list of field^boost values.
     * This should be called with the value of the configuration key search.ranking.solr.boost.tmp
     * Entries which cannot be interpreted (no '^' separator, unknown field name, factor which is not a finite number)
     * are skipped; all well-formed entries of the same string are still applied.
     * Fields which are not mentioned in the definition string keep their current value.
     * @param boostDef the definition string; null or an empty string leaves all values unchanged
     */
    public void update(String boostDef) {
        // call i.e. with "sku^20.0f,url_paths_sxt^20.0f,title^15.0f,h1_txt^11.0f,h2_txt^10.0f,author^8.0f,description^5.0f,keywords^2.0f,text_t^1.0f,fuzzy_signature_unique_b^100000.0f"
        if (boostDef == null || boostDef.length() == 0) return;
        String[] bf = CommonPattern.COMMA.split(boostDef);
        for (String boost: bf) {
            int p = boost.indexOf('^');
            if (p < 0) continue;
            // the definition string is hand-edited configuration, so tolerate blanks around the separators
            String fieldName = boost.substring(0, p).trim();
            String factorString = boost.substring(p + 1).trim();
            try {
                YaCySchema field = YaCySchema.valueOf(fieldName);
                float factor = Float.parseFloat(factorString);
                // NaN or infinite values cannot be used as a boost, skip them like any other malformed entry
                if (Float.isNaN(factor) || Float.isInfinite(factor)) continue;
                this.put(field, factor);
            } catch (final IllegalArgumentException e) {
                // unknown field name (IllegalArgumentException) or unparseable factor (NumberFormatException, a subclass):
                // this method runs for every search request, therefore a single bad entry must not make it fail;
                // the entry is skipped in the same way as an entry without '^'
                continue;
            }
        }
    }

    /**
     * @param quantRate the quantRate to be used for the double-detection signature
     */
    public void setQuantRate(float quantRate) {
        this.quantRate = quantRate;
    }

    /**
     * @param minTokenLen the minTokenLen to be used for the double-detection signature
     */
    public void setMinTokenLen(int minTokenLen) {
        this.minTokenLen = minTokenLen;
    }

    /**
     * @return the quantRate as set with setQuantRate; 0.5f if it was never set
     */
    public float getQuantRate() {
        return quantRate;
    }

    /**
     * @return the minTokenLen as set with setMinTokenLen; 3 if it was never set
     */
    public int getMinTokenLen() {
        return minTokenLen;
    }
}

@ -341,10 +341,5 @@ public enum YaCySchema implements Schema {
doc.setField(this.getSolrFieldName(), value);
}
public final void add(final SolrInputDocument doc, final String value, final float boost) {
assert !this.isMultiValued();
doc.setField(this.getSolrFieldName(), value, boost);
}
}

@ -104,8 +104,8 @@ public final class CrawlStacker {
this.acceptGlobalURLs = acceptGlobalURLs;
this.domainList = domainList;
this.fastQueue = new WorkflowProcessor<Request>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
this.slowQueue = new WorkflowProcessor<Request>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
this.fastQueue = new WorkflowProcessor<Request>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, WorkflowProcessor.availableCPU);
this.slowQueue = new WorkflowProcessor<Request>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 2);
this.log.logInfo("STACKCRAWL thread initialized.");
}

@ -45,6 +45,7 @@ import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
@ -236,8 +237,8 @@ public final class Condenser {
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
Map<String,String> sp = new HashMap<String,String>();
sp.put("quantRate", "0.5"); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
sp.put("minTokenLen", "3");
sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
fuzzySignatureFactory.init(new MapSolrParams(sp));
fuzzySignatureFactory.add(text);
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();

@ -90,6 +90,7 @@ import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.solr.connector.ShardSelection;
import net.yacy.cora.federate.solr.connector.ShardSolrConnector;
@ -401,6 +402,9 @@ public final class Switchboard extends serverSwitch {
ConfigurationSet.Entry entry = solrScheme.get(field.name()); entry.setEnable(true); solrScheme.put(field.name(), entry);
}
solrScheme.commit();
Boost.RANKING.update(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // must be called every time the boosts change
Boost.RANKING.setMinTokenLen(this.getConfigInt(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, 3));
Boost.RANKING.setQuantRate(this.getConfigFloat(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, 0.5f));
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
@ -1529,9 +1533,9 @@ public final class Switchboard extends serverSwitch {
}
public RankingProfile getRanking() {
return (getConfig("rankingProfile", "").isEmpty())
return (getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "").isEmpty())
? new RankingProfile(Classification.ContentDomain.TEXT)
: new RankingProfile("", crypt.simpleDecode(this.getConfig("rankingProfile", "")));
: new RankingProfile("", crypt.simpleDecode(this.getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "")));
}
/**

@ -459,6 +459,15 @@ public final class SwitchboardConstants {
public static final String SEARCH_VERIFY = "search.verify";
public static final String SEARCH_VERIFY_DELETE = "search.verify.delete";
/**
* ranking
*/
public static final String SEARCH_RANKING_RWI_PROFILE = "search.ranking.rwi.profile"; // old rwi rankingProfile ranking
public static final String SEARCH_RANKING_SOLR_BOOST = "search.ranking.solr.boost.tmp"; // temporary until we know best default values
public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH = "search.ranking.solr.doubledetection.minlength";
public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE = "search.ranking.solr.doubledetection.quantrate";
/**
* system tray
*/

@ -77,7 +77,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
super();
this.lazy = false;
}
/**
* initialize the scheme with a given configuration file
* the configuration file simply contains a list of lines with keywords
@ -86,6 +86,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
*/
public SolrConfiguration(final File configurationFile, boolean lazy) {
super(configurationFile);
this.lazy = lazy;
// check consistency: compare with YaCyField enum
if (this.isEmpty()) return;
Iterator<Entry> it = this.entryIterator();
@ -104,7 +105,6 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
Log.logWarning("SolrScheme", " solr scheme file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
}
}
this.lazy = lazy;
}
public boolean contains(YaCySchema field) {

@ -28,6 +28,7 @@ import java.util.LinkedHashMap;
import java.util.Map;
import java.util.SortedSet;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
@ -221,19 +222,6 @@ public class QueryGoal {
YaCySchema.author,YaCySchema.description,YaCySchema.keywords,YaCySchema.text_t,YaCySchema.synonyms_sxt
};
private final static Map<YaCySchema,Float> boosts = new LinkedHashMap<YaCySchema,Float>();
static {
boosts.put(YaCySchema.sku, 20.0f);
boosts.put(YaCySchema.url_paths_sxt, 20.0f);
boosts.put(YaCySchema.title, 15.0f);
boosts.put(YaCySchema.h1_txt, 11.0f);
boosts.put(YaCySchema.h2_txt, 10.0f);
boosts.put(YaCySchema.author, 8.0f);
boosts.put(YaCySchema.description, 5.0f);
boosts.put(YaCySchema.keywords, 2.0f);
boosts.put(YaCySchema.text_t, 1.0f);
}
public StringBuilder solrQueryString(SolrConfiguration configuration) {
final StringBuilder q = new StringBuilder(80);
@ -266,7 +254,7 @@ public class QueryGoal {
if (wc > 0) q.append(" OR ");
q.append('(');
q.append(field.getSolrFieldName()).append(':').append(w);
boost = boosts.get(field);
boost = Boost.RANKING.get(field);
if (boost != null) q.append('^').append(boost.toString());
q.append(')');
wc++;

@ -46,6 +46,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@ -430,7 +431,8 @@ public final class QueryParams {
// construct query
final SolrQuery params = new SolrQuery();
params.setParam("defType", "edismax");
params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back
float f = Boost.RANKING.get(YaCySchema.fuzzy_signature_unique_b);
params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^" + Float.toString(f)); // a boost query that moves double content to the back
params.setStart(this.offset);
params.setRows(this.itemsPerPage);
params.setFacet(false);

@ -31,6 +31,7 @@ import java.util.Map;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.federate.solr.Boost;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
@ -148,7 +149,9 @@ public class SearchEventCache {
Log.logInfo("SearchEventCache", "getEvent: " + lastEvents.size() + " in cache; " + countAliveThreads() + " alive");
// start a new event
final boolean delete = Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true);
Switchboard sb = Switchboard.getSwitchboard();
final boolean delete = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true);
if (sb != null) Boost.RANKING.update(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // update the boost values
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, burstRobinsonPercent, burstMultiwordPercent, delete);
MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads
}

@ -264,7 +264,7 @@ public class serverSwitch
* @param dflt default value which will be used in case parameter can not be found or if it is invalid
* @return value if the parameter or default value
*/
public double getConfigFloat(final String key, final float dflt) {
public float getConfigFloat(final String key, final float dflt) {
try {
return Float.parseFloat(getConfig(key, Float.toString(dflt)));
} catch ( final NumberFormatException e ) {

Loading…
Cancel
Save