diff --git a/defaults/yacy.init b/defaults/yacy.init
index f3d7837c3..22ad6b4dd 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -943,7 +943,10 @@ WikiAccess = admin
# we will support different search profiles
# this is currently only a single default profile
# If this profile setting is empty, a hard-coded profile from plasmaSearchRanking is used
-rankingProfile =
+search.ranking.rwi.profile =
+search.ranking.solr.boost.tmp=
+search.ranking.solr.doubledetection.minlength=3
+search.ranking.solr.doubledetection.quantrate=0.5f
#optional extern thumbnail program.
#the program must accept the invocation PROGRAM http://url /path/to/filename
diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java
index 6a549a837..3b4129e75 100644
--- a/htroot/Ranking_p.java
+++ b/htroot/Ranking_p.java
@@ -32,6 +32,7 @@ import java.util.Map.Entry;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
+import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.ranking.RankingProfile;
import net.yacy.server.serverObjects;
@@ -159,7 +160,7 @@ public class Ranking_p {
if (post.containsKey("EnterRanking")) {
final RankingProfile ranking = new RankingProfile("local", post.toString());
- sb.setConfig("rankingProfile", crypt.simpleEncode(ranking.toExternalString()));
+ sb.setConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, crypt.simpleEncode(ranking.toExternalString()));
final serverObjects prop = defaultValues();
//prop.putAll(ranking.toExternalMap("local"));
putRanking(prop, ranking, "local");
@@ -167,7 +168,7 @@ public class Ranking_p {
}
if (post.containsKey("ResetRanking")) {
- sb.setConfig("rankingProfile", "");
+ sb.setConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "");
final RankingProfile ranking = new RankingProfile(Classification.ContentDomain.TEXT);
final serverObjects prop = defaultValues();
//prop.putAll(ranking.toExternalMap("local"));
diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java
index dcff883bf..a86694574 100644
--- a/htroot/gsa/searchresult.java
+++ b/htroot/gsa/searchresult.java
@@ -26,6 +26,7 @@ import java.util.ArrayList;
import java.util.Map;
import net.yacy.cora.document.UTF8;
+import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector;
import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter;
@@ -34,6 +35,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
+import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.QueryGoal;
import net.yacy.search.query.SearchEvent;
@@ -99,6 +101,9 @@ public class searchresult {
Log.logInfo("GSA Query", post.toString());
sb.intermissionAllThreads(3000); // tell all threads to do nothing for a specific time
+ // update the boost values
+ Boost.RANKING.update(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, ""));
+
// rename post fields according to result style
//post.put(CommonParams.Q, post.remove("q")); // same as solr
//post.put(CommonParams.START, post.remove("start")); // same as solr
@@ -115,7 +120,8 @@ public class searchresult {
post.put(CommonParams.ROWS, post.remove("num"));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
post.put("defType", "edismax");
- post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back
+ float f = Boost.RANKING.get(YaCySchema.fuzzy_signature_unique_b);
+ post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^" + Float.toString(f)); // a boost query that moves double content to the back
post.put(CommonParams.FL,
YaCySchema.content_type.getSolrFieldName() + ',' +
YaCySchema.id.getSolrFieldName() + ',' +
diff --git a/htroot/solr/select.java b/htroot/solr/select.java
index e2f2a1f7e..5fd789126 100644
--- a/htroot/solr/select.java
+++ b/htroot/solr/select.java
@@ -142,7 +142,7 @@ public class select {
if (post == null) return null;
Log.logInfo("SOLR Query", post.toString());
sb.intermissionAllThreads(3000); // tell all threads to do nothing for a specific time
-
+
// rename post fields according to result style
if (!post.containsKey(CommonParams.Q)) post.put(CommonParams.Q, post.remove("query")); // sru patch
String q = post.get(CommonParams.Q, "");
diff --git a/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java
index 8c82e129f..af991e4e0 100644
--- a/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java
+++ b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java
@@ -57,7 +57,7 @@ import org.apache.solr.update.processor.Lookup3Signature;
public class EnhancedTextProfileSignature extends Lookup3Signature {
private float quantRate = 0.01f;
- private float minTokenLen = 2;
+ private int minTokenLen = 2;
private StringBuilder evalText = new StringBuilder(120); // start with some capacity, makes it much faster.
@Override
diff --git a/source/net/yacy/cora/federate/solr/Boost.java b/source/net/yacy/cora/federate/solr/Boost.java
new file mode 100644
index 000000000..51b3c91e5
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/Boost.java
@@ -0,0 +1,127 @@
+/**
+ * Boost
+ * Copyright 2012 by Michael Peter Christen
+ * First released 30.11.2012 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.cora.federate.solr;
+
+import java.util.LinkedHashMap;
+
+import net.yacy.cora.util.CommonPattern;
+
+/**
+ * The Boost class is the solr ranking definition. It contains boost values in a LinkedHashMap; the 'linked' version is used
+ * to maintain the order of the entries, which shall be stable according to the iteration order within a configuration servlet.
+ * Because the result order is also influenced by a double-check mechanism, the attributes used to compute a document signature
+ * (quantRate, minTokenLen) are integrated into this class as well.
+ * NOTE(review): RANKING is a shared mutable instance and LinkedHashMap is not synchronized; confirm whether callers need locking.
+ */
+public class Boost extends LinkedHashMap<YaCySchema, Float> {
+
+    private static final long serialVersionUID = 5248172257724571603L;
+
+    /** the single, application-wide ranking definition */
+    public final static Boost RANKING = new Boost();
+
+    // for minTokenLen = 2 the quantRate value should not be below 0.24; for minTokenLen = 3 the quantRate value must be not below 0.5!
+    private float quantRate = 0.5f; // to be filled with search.ranking.solr.doubledetection.quantrate
+    private int minTokenLen = 3;    // to be filled with search.ranking.solr.doubledetection.minlength
+
+    /**
+     * initialize the hard-coded default boost values; they can be overwritten using update(String)
+     */
+    private Boost() {
+        super();
+        put(YaCySchema.sku, 20.0f);
+        put(YaCySchema.url_paths_sxt, 20.0f);
+        put(YaCySchema.title, 15.0f);
+        put(YaCySchema.h1_txt, 11.0f);
+        put(YaCySchema.h2_txt, 10.0f);
+        put(YaCySchema.author, 8.0f);
+        put(YaCySchema.description, 5.0f);
+        put(YaCySchema.keywords, 2.0f);
+        put(YaCySchema.text_t, 1.0f);
+        put(YaCySchema.fuzzy_signature_unique_b, 100000.0f); // must be very high to move double results to end of list
+    }
+
+    /**
+     * override the get method to return 1.0f for each non-resolvable object
+     * @param field the key (a YaCySchema field) whose boost is requested
+     * @return the stored boost value, or 1.0f if there is none; never null
+     */
+    @Override
+    public Float get(final Object field) {
+        final Float boost = super.get(field);
+        return boost == null ? Float.valueOf(1.0f) : boost;
+    }
+
+    /**
+     * The boostDef is a definition string that comes from a configuration file.
+     * It should be a comma-separated list of field^boost values.
+     * This should be called with the value of search.ranking.solr.boost.tmp
+     * Entries without a '^', with an unknown field name or with a non-numeric boost are skipped
+     * and leave the current value of that field untouched.
+     * @param boostDef the definition string; null or empty leaves all values unchanged
+     */
+    public void update(final String boostDef) {
+        // call i.e. with "sku^20.0f,url_paths_sxt^20.0f,title^15.0f,h1_txt^11.0f,h2_txt^10.0f,author^8.0f,description^5.0f,keywords^2.0f,text_t^1.0f,fuzzy_signature_unique_b^100000.0f"
+        if (boostDef == null || boostDef.length() == 0) return;
+        for (final String boost : CommonPattern.COMMA.split(boostDef)) {
+            final int p = boost.indexOf('^');
+            if (p < 0) continue;
+            try {
+                final YaCySchema field = YaCySchema.valueOf(boost.substring(0, p).trim());
+                final float factor = Float.parseFloat(boost.substring(p + 1).trim());
+                this.put(field, Float.valueOf(factor));
+            } catch (final IllegalArgumentException e) {
+                // unknown field name or malformed number (NumberFormatException is a subclass):
+                // skip this entry; a typo in the configuration must not make the caller fail
+                continue;
+            }
+        }
+    }
+
+    /**
+     * @param quantRate the quantRate parameter handed to the fuzzy signature computation
+     */
+    public void setQuantRate(final float quantRate) {
+        this.quantRate = quantRate;
+    }
+
+    /**
+     * @param minTokenLen the minTokenLen parameter handed to the fuzzy signature computation
+     */
+    public void setMinTokenLen(final int minTokenLen) {
+        this.minTokenLen = minTokenLen;
+    }
+
+    /**
+     * @return the quantRate parameter for the fuzzy signature computation
+     */
+    public float getQuantRate() {
+        return this.quantRate;
+    }
+
+    /**
+     * @return the minTokenLen parameter for the fuzzy signature computation
+     */
+    public int getMinTokenLen() {
+        return this.minTokenLen;
+    }
+
+}
diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java
index 078e15355..bf2e7e01d 100644
--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@@ -341,10 +341,5 @@ public enum YaCySchema implements Schema {
doc.setField(this.getSolrFieldName(), value);
}
- public final void add(final SolrInputDocument doc, final String value, final float boost) {
- assert !this.isMultiValued();
- doc.setField(this.getSolrFieldName(), value, boost);
- }
-
}
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index bee51d884..7d84285ab 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -104,8 +104,8 @@ public final class CrawlStacker {
this.acceptGlobalURLs = acceptGlobalURLs;
this.domainList = domainList;
- this.fastQueue = new WorkflowProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
- this.slowQueue = new WorkflowProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
+ this.fastQueue = new WorkflowProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, WorkflowProcessor.availableCPU);
+ this.slowQueue = new WorkflowProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 2);
this.log.logInfo("STACKCRAWL thread initialized.");
}
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index 77beb20e7..68b10c174 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -45,6 +45,7 @@ import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
@@ -236,8 +237,8 @@ public final class Condenser {
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
Map sp = new HashMap();
- sp.put("quantRate", "0.5"); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
- sp.put("minTokenLen", "3");
+ sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
+ sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
fuzzySignatureFactory.init(new MapSolrParams(sp));
fuzzySignatureFactory.add(text);
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 6b9fb18dc..3b81afbae 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -90,6 +90,7 @@ import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification;
+import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.solr.connector.ShardSelection;
import net.yacy.cora.federate.solr.connector.ShardSolrConnector;
@@ -401,6 +402,9 @@ public final class Switchboard extends serverSwitch {
ConfigurationSet.Entry entry = solrScheme.get(field.name()); entry.setEnable(true); solrScheme.put(field.name(), entry);
}
solrScheme.commit();
+ Boost.RANKING.update(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // must be called every time the boosts change
+ Boost.RANKING.setMinTokenLen(this.getConfigInt(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, 3));
+ Boost.RANKING.setQuantRate(this.getConfigFloat(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, 0.5f));
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
@@ -1529,9 +1533,9 @@ public final class Switchboard extends serverSwitch {
}
public RankingProfile getRanking() {
- return (getConfig("rankingProfile", "").isEmpty())
+ return (getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "").isEmpty())
? new RankingProfile(Classification.ContentDomain.TEXT)
- : new RankingProfile("", crypt.simpleDecode(this.getConfig("rankingProfile", "")));
+ : new RankingProfile("", crypt.simpleDecode(this.getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "")));
}
/**
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index 8d79ac9f4..285a92bd3 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -459,6 +459,15 @@ public final class SwitchboardConstants {
public static final String SEARCH_VERIFY = "search.verify";
public static final String SEARCH_VERIFY_DELETE = "search.verify.delete";
+ /**
+ * ranking: configuration keys for the RWI ranking profile, the solr field boosts and the solr double-detection parameters
+ */
+ public static final String SEARCH_RANKING_RWI_PROFILE = "search.ranking.rwi.profile"; // old rwi rankingProfile ranking
+ public static final String SEARCH_RANKING_SOLR_BOOST = "search.ranking.solr.boost.tmp"; // comma-separated field^boost list handed to Boost.update; temporary until we know best default values
+ public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH = "search.ranking.solr.doubledetection.minlength"; // int value, read by Switchboard and passed to Boost.setMinTokenLen
+ public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE = "search.ranking.solr.doubledetection.quantrate"; // float value, read by Switchboard and passed to Boost.setQuantRate
+
+
/**
* system tray
*/
diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java
index 19e5364f6..4b4df69b4 100644
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@@ -77,7 +77,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
super();
this.lazy = false;
}
-
+
/**
* initialize the scheme with a given configuration file
* the configuration file simply contains a list of lines with keywords
@@ -86,6 +86,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
*/
public SolrConfiguration(final File configurationFile, boolean lazy) {
super(configurationFile);
+ this.lazy = lazy;
// check consistency: compare with YaCyField enum
if (this.isEmpty()) return;
Iterator it = this.entryIterator();
@@ -104,7 +105,6 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
Log.logWarning("SolrScheme", " solr scheme file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
}
}
- this.lazy = lazy;
}
public boolean contains(YaCySchema field) {
diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java
index a26c07765..dc71f7716 100644
--- a/source/net/yacy/search/query/QueryGoal.java
+++ b/source/net/yacy/search/query/QueryGoal.java
@@ -28,6 +28,7 @@ import java.util.LinkedHashMap;
import java.util.Map;
import java.util.SortedSet;
+import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.storage.HandleSet;
@@ -221,19 +222,6 @@ public class QueryGoal {
YaCySchema.author,YaCySchema.description,YaCySchema.keywords,YaCySchema.text_t,YaCySchema.synonyms_sxt
};
- private final static Map boosts = new LinkedHashMap();
- static {
- boosts.put(YaCySchema.sku, 20.0f);
- boosts.put(YaCySchema.url_paths_sxt, 20.0f);
- boosts.put(YaCySchema.title, 15.0f);
- boosts.put(YaCySchema.h1_txt, 11.0f);
- boosts.put(YaCySchema.h2_txt, 10.0f);
- boosts.put(YaCySchema.author, 8.0f);
- boosts.put(YaCySchema.description, 5.0f);
- boosts.put(YaCySchema.keywords, 2.0f);
- boosts.put(YaCySchema.text_t, 1.0f);
- }
-
public StringBuilder solrQueryString(SolrConfiguration configuration) {
final StringBuilder q = new StringBuilder(80);
@@ -266,7 +254,7 @@ public class QueryGoal {
if (wc > 0) q.append(" OR ");
q.append('(');
q.append(field.getSolrFieldName()).append(':').append(w);
- boost = boosts.get(field);
+ boost = Boost.RANKING.get(field);
if (boost != null) q.append('^').append(boost.toString());
q.append(')');
wc++;
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index 44006cbc3..279b7b4e2 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -46,6 +46,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.federate.solr.Boost;
import net.yacy.cora.federate.solr.YaCySchema;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@@ -430,7 +431,8 @@ public final class QueryParams {
// construct query
final SolrQuery params = new SolrQuery();
params.setParam("defType", "edismax");
- params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back
+ float f = Boost.RANKING.get(YaCySchema.fuzzy_signature_unique_b);
+ params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^" + Float.toString(f)); // a boost query that moves double content to the back
params.setStart(this.offset);
params.setRows(this.itemsPerPage);
params.setFacet(false);
diff --git a/source/net/yacy/search/query/SearchEventCache.java b/source/net/yacy/search/query/SearchEventCache.java
index e3daec4da..de2d2bde8 100644
--- a/source/net/yacy/search/query/SearchEventCache.java
+++ b/source/net/yacy/search/query/SearchEventCache.java
@@ -31,6 +31,7 @@ import java.util.Map;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentHashMap;
+import net.yacy.cora.federate.solr.Boost;
import net.yacy.data.WorkTables;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
@@ -148,7 +149,9 @@ public class SearchEventCache {
Log.logInfo("SearchEventCache", "getEvent: " + lastEvents.size() + " in cache; " + countAliveThreads() + " alive");
// start a new event
- final boolean delete = Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true);
+ final Switchboard sb = Switchboard.getSwitchboard(); // fetch once; may be null if no switchboard exists
+ final boolean delete = sb == null || sb.getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true); // use the instance that was just null-checked; defaults to true without a switchboard
+ if (sb != null) Boost.RANKING.update(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // update the boost values
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, burstRobinsonPercent, burstMultiwordPercent, delete);
MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads
}
diff --git a/source/net/yacy/server/serverSwitch.java b/source/net/yacy/server/serverSwitch.java
index 8cac6cdb4..82321eb5e 100644
--- a/source/net/yacy/server/serverSwitch.java
+++ b/source/net/yacy/server/serverSwitch.java
@@ -264,7 +264,7 @@ public class serverSwitch
* @param dflt default value which will be used in case parameter can not be found or if it is invalid
* @return value if the parameter or default value
*/
- public double getConfigFloat(final String key, final float dflt) {
+ public float getConfigFloat(final String key, final float dflt) {
try {
return Float.parseFloat(getConfig(key, Float.toString(dflt)));
} catch ( final NumberFormatException e ) {