diff --git a/defaults/yacy.init b/defaults/yacy.init index f3d7837c3..22ad6b4dd 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -943,7 +943,10 @@ WikiAccess = admin # we will support different search profiles # this is currently only a single default profile # If this profile setting is empty, a hard-coded profile from plasmaSearchRanking is used -rankingProfile = +search.ranking.rwi.profile = +search.ranking.solr.boost.tmp= +search.ranking.solr.doubledetection.minlength=3 +search.ranking.solr.doubledetection.quantrate=0.5f #optional extern thumbnail program. #the program must accept the invocation PROGRAM http://url /path/to/filename diff --git a/htroot/Ranking_p.java b/htroot/Ranking_p.java index 6a549a837..3b4129e75 100644 --- a/htroot/Ranking_p.java +++ b/htroot/Ranking_p.java @@ -32,6 +32,7 @@ import java.util.Map.Entry; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; import net.yacy.search.query.SearchEventCache; import net.yacy.search.ranking.RankingProfile; import net.yacy.server.serverObjects; @@ -159,7 +160,7 @@ public class Ranking_p { if (post.containsKey("EnterRanking")) { final RankingProfile ranking = new RankingProfile("local", post.toString()); - sb.setConfig("rankingProfile", crypt.simpleEncode(ranking.toExternalString())); + sb.setConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, crypt.simpleEncode(ranking.toExternalString())); final serverObjects prop = defaultValues(); //prop.putAll(ranking.toExternalMap("local")); putRanking(prop, ranking, "local"); @@ -167,7 +168,7 @@ public class Ranking_p { } if (post.containsKey("ResetRanking")) { - sb.setConfig("rankingProfile", ""); + sb.setConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, ""); final RankingProfile ranking = new RankingProfile(Classification.ContentDomain.TEXT); final serverObjects prop = defaultValues(); 
//prop.putAll(ranking.toExternalMap("local")); diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index dcff883bf..a86694574 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.Map; import net.yacy.cora.document.UTF8; +import net.yacy.cora.federate.solr.Boost; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.responsewriter.GSAResponseWriter; @@ -34,6 +35,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.CommonPattern; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; import net.yacy.search.query.AccessTracker; import net.yacy.search.query.QueryGoal; import net.yacy.search.query.SearchEvent; @@ -99,6 +101,9 @@ public class searchresult { Log.logInfo("GSA Query", post.toString()); sb.intermissionAllThreads(3000); // tell all threads to do nothing for a specific time + // update the boost values + Boost.RANKING.update(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); + // rename post fields according to result style //post.put(CommonParams.Q, post.remove("q")); // same as solr //post.put(CommonParams.START, post.remove("start")); // same as solr @@ -115,7 +120,8 @@ public class searchresult { post.put(CommonParams.ROWS, post.remove("num")); post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 
5000 : 100)); post.put("defType", "edismax"); - post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back + float f = Boost.RANKING.get(YaCySchema.fuzzy_signature_unique_b); + post.put("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^" + Float.toString(f)); // a boost query that moves double content to the back post.put(CommonParams.FL, YaCySchema.content_type.getSolrFieldName() + ',' + YaCySchema.id.getSolrFieldName() + ',' + diff --git a/htroot/solr/select.java b/htroot/solr/select.java index e2f2a1f7e..5fd789126 100644 --- a/htroot/solr/select.java +++ b/htroot/solr/select.java @@ -142,7 +142,7 @@ public class select { if (post == null) return null; Log.logInfo("SOLR Query", post.toString()); sb.intermissionAllThreads(3000); // tell all threads to do nothing for a specific time - + // rename post fields according to result style if (!post.containsKey(CommonParams.Q)) post.put(CommonParams.Q, post.remove("query")); // sru patch String q = post.get(CommonParams.Q, ""); diff --git a/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java index 8c82e129f..af991e4e0 100644 --- a/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java +++ b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java @@ -57,7 +57,7 @@ import org.apache.solr.update.processor.Lookup3Signature; public class EnhancedTextProfileSignature extends Lookup3Signature { private float quantRate = 0.01f; - private float minTokenLen = 2; + private int minTokenLen = 2; private StringBuilder evalText = new StringBuilder(120); // start with some capacity, makes it much faster. 
@Override
diff --git a/source/net/yacy/cora/federate/solr/Boost.java b/source/net/yacy/cora/federate/solr/Boost.java
new file mode 100644
index 000000000..51b3c91e5
--- /dev/null
+++ b/source/net/yacy/cora/federate/solr/Boost.java
@@ -0,0 +1,121 @@
+/**
+ * Boost
+ * Copyright 2012 by Michael Peter Christen
+ * First released 30.11.2012 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.cora.federate.solr;
+
+import java.util.LinkedHashMap;
+
+import net.yacy.cora.util.CommonPattern;
+
+/**
+ * The Boost class is the solr ranking definition file. It contains boost values in a Linked HashMap; the 'linked'-Version is used
+ * to maintain the order of the arguments which shall be stable according to the iteration order within a configuration servlet.
+ * Because the order is influenced by a double-check mechanism, the attributes to apply a document signature are also integrated
+ * into this class.
+ * The class adds no synchronization of its own on top of LinkedHashMap.
+ */
+public class Boost extends LinkedHashMap<YaCySchema, Float> {
+
+    private static final long serialVersionUID = 5248172257724571603L;
+
+    /** the shared instance of this class; the constructor is private */
+    public final static Boost RANKING = new Boost();
+
+    // for minTokenLen = 2 the quantRate value should not be below 0.24; for minTokenLen = 3 the quantRate value must be not below 0.5!
+    private float quantRate = 0.5f; // to be filled with search.ranking.solr.doubledetection.quantrate
+    private int minTokenLen = 3; // to be filled with search.ranking.solr.doubledetection.minlength
+
+    /**
+     * initialize the map with the hard-coded default boost values
+     */
+    private Boost() {
+        super();
+        put(YaCySchema.sku, 20.0f);
+        put(YaCySchema.url_paths_sxt, 20.0f);
+        put(YaCySchema.title, 15.0f);
+        put(YaCySchema.h1_txt, 11.0f);
+        put(YaCySchema.h2_txt, 10.0f);
+        put(YaCySchema.author, 8.0f);
+        put(YaCySchema.description, 5.0f);
+        put(YaCySchema.keywords, 2.0f);
+        put(YaCySchema.text_t, 1.0f);
+        put(YaCySchema.fuzzy_signature_unique_b, 100000.0f); // must be very high to move double results to end of list
+    }
+
+    /**
+     * override the get method to return 1.0f for each non-resolvable object
+     * @param field the key to look up
+     * @return the boost stored for the key, or 1.0f if the map holds no value for it
+     */
+    @Override
+    public Float get(final Object field) {
+        final Float boost = super.get(field);
+        if (boost == null) return 1.0f;
+        return boost;
+    }
+
+    /**
+     * the boostDef is a definition string that comes from a configuration file.
+     * It should be a comma-separated list of field^boost values
+     * This should be called with the value of the field search.ranking.solr.boost.tmp
+     * Entries without a '^', with a field name that is not a YaCySchema constant, or with a boost
+     * that cannot be parsed as float are skipped; all remaining entries are still applied.
+     * @param boostDef the definition string; if null or empty the stored boosts stay unchanged
+     */
+    public void update(final String boostDef) {
+        // call i.e. with "sku^20.0f,url_paths_sxt^20.0f,title^15.0f,h1_txt^11.0f,h2_txt^10.0f,author^8.0f,description^5.0f,keywords^2.0f,text_t^1.0f,fuzzy_signature_unique_b^100000.0f"
+        if (boostDef == null || boostDef.length() == 0) return;
+        final String[] bf = CommonPattern.COMMA.split(boostDef);
+        for (final String boost: bf) {
+            final int p = boost.indexOf('^');
+            if (p < 0) continue;
+            try {
+                final YaCySchema field = YaCySchema.valueOf(boost.substring(0, p).trim());
+                final float factor = Float.parseFloat(boost.substring(p + 1));
+                this.put(field, factor);
+            } catch (final IllegalArgumentException ignored) {
+                // unknown field name (Enum.valueOf) or malformed number (NumberFormatException is an
+                // IllegalArgumentException): skip this entry so that one bad token in the hand-edited
+                // configuration value does not make every caller of update() fail
+            }
+        }
+    }
+
+    /** @param quantRate the quantRate for the document signature (search.ranking.solr.doubledetection.quantrate) */
+    public void setQuantRate(final float quantRate) {
+        this.quantRate = quantRate;
+    }
+
+    /** @param minTokenLen the minTokenLen for the document signature (search.ranking.solr.doubledetection.minlength) */
+    public void setMinTokenLen(final int minTokenLen) {
+        this.minTokenLen = minTokenLen;
+    }
+
+    /** @return the quantRate last set with setQuantRate, 0.5f if it was never set */
+    public float getQuantRate() {
+        return this.quantRate;
+    }
+
+    /** @return the minTokenLen last set with setMinTokenLen, 3 if it was never set */
+    public int getMinTokenLen() {
+        return this.minTokenLen;
+    }
+
+}
diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java
index 078e15355..bf2e7e01d 100644
--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@@ -341,10 +341,5 @@ public enum YaCySchema implements Schema {
         doc.setField(this.getSolrFieldName(), value);
     }
 
-    public final void add(final SolrInputDocument doc, final String value, final float boost) {
-        assert !this.isMultiValued();
-        doc.setField(this.getSolrFieldName(), value, boost);
-    }
-
 }
 
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index bee51d884..7d84285ab 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -104,8 +104,8 @@ public final class CrawlStacker {
         this.acceptGlobalURLs = acceptGlobalURLs;
         this.domainList = domainList;
 
-        this.fastQueue = new WorkflowProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
-        this.slowQueue = new
WorkflowProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5); + this.fastQueue = new WorkflowProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, WorkflowProcessor.availableCPU); + this.slowQueue = new WorkflowProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 2); this.log.logInfo("STACKCRAWL thread initialized."); } diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 77beb20e7..68b10c174 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -45,6 +45,7 @@ import net.yacy.cora.document.WordCache; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.analysis.EnhancedTextProfileSignature; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.federate.solr.Boost; import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.document.language.Identificator; @@ -236,8 +237,8 @@ public final class Condenser { // check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature(); Map sp = new HashMap(); - sp.put("quantRate", "0.5"); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5! 
- sp.put("minTokenLen", "3"); + sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5! + sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen())); fuzzySignatureFactory.init(new MapSolrParams(sp)); fuzzySignatureFactory.add(text); byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 6b9fb18dc..3b81afbae 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -90,6 +90,7 @@ import net.yacy.cora.document.RSSReader; import net.yacy.cora.document.UTF8; import net.yacy.cora.document.WordCache; import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.federate.solr.Boost; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.ShardSelection; import net.yacy.cora.federate.solr.connector.ShardSolrConnector; @@ -401,6 +402,9 @@ public final class Switchboard extends serverSwitch { ConfigurationSet.Entry entry = solrScheme.get(field.name()); entry.setEnable(true); solrScheme.put(field.name(), entry); } solrScheme.commit(); + Boost.RANKING.update(this.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // must be called every time the boosts change + Boost.RANKING.setMinTokenLen(this.getConfigInt(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH, 3)); + Boost.RANKING.setQuantRate(this.getConfigFloat(SwitchboardConstants.SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE, 0.5f)); // initialize index ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0); @@ -1529,9 +1533,9 @@ public final class Switchboard extends serverSwitch { } public RankingProfile getRanking() { - return (getConfig("rankingProfile", "").isEmpty()) + return 
(getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, "").isEmpty()) ? new RankingProfile(Classification.ContentDomain.TEXT) - : new RankingProfile("", crypt.simpleDecode(this.getConfig("rankingProfile", ""))); + : new RankingProfile("", crypt.simpleDecode(this.getConfig(SwitchboardConstants.SEARCH_RANKING_RWI_PROFILE, ""))); } /** diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 8d79ac9f4..285a92bd3 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -459,6 +459,15 @@ public final class SwitchboardConstants { public static final String SEARCH_VERIFY = "search.verify"; public static final String SEARCH_VERIFY_DELETE = "search.verify.delete"; + /** + * ranking + */ + public static final String SEARCH_RANKING_RWI_PROFILE = "search.ranking.rwi.profile"; // old rwi rankingProfile ranking + public static final String SEARCH_RANKING_SOLR_BOOST = "search.ranking.solr.boost.tmp"; // temporary until we know best default values + public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_MINLENGTH = "search.ranking.solr.doubledetection.minlength"; + public static final String SEARCH_RANKING_SOLR_DOUBLEDETECTION_QUANTRATE = "search.ranking.solr.doubledetection.quantrate"; + + /** * system tray */ diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 19e5364f6..4b4df69b4 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -77,7 +77,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable super(); this.lazy = false; } - + /** * initialize the scheme with a given configuration file * the configuration file simply contains a list of lines with keywords @@ -86,6 +86,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable */ public 
SolrConfiguration(final File configurationFile, boolean lazy) { super(configurationFile); + this.lazy = lazy; // check consistency: compare with YaCyField enum if (this.isEmpty()) return; Iterator it = this.entryIterator(); @@ -104,7 +105,6 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable Log.logWarning("SolrScheme", " solr scheme file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'"); } } - this.lazy = lazy; } public boolean contains(YaCySchema field) { diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index a26c07765..dc71f7716 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -28,6 +28,7 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.SortedSet; +import net.yacy.cora.federate.solr.Boost; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; @@ -221,19 +222,6 @@ public class QueryGoal { YaCySchema.author,YaCySchema.description,YaCySchema.keywords,YaCySchema.text_t,YaCySchema.synonyms_sxt }; - private final static Map boosts = new LinkedHashMap(); - static { - boosts.put(YaCySchema.sku, 20.0f); - boosts.put(YaCySchema.url_paths_sxt, 20.0f); - boosts.put(YaCySchema.title, 15.0f); - boosts.put(YaCySchema.h1_txt, 11.0f); - boosts.put(YaCySchema.h2_txt, 10.0f); - boosts.put(YaCySchema.author, 8.0f); - boosts.put(YaCySchema.description, 5.0f); - boosts.put(YaCySchema.keywords, 2.0f); - boosts.put(YaCySchema.text_t, 1.0f); - } - public StringBuilder solrQueryString(SolrConfiguration configuration) { final StringBuilder q = new StringBuilder(80); @@ -266,7 +254,7 @@ public class QueryGoal { if (wc > 0) q.append(" OR "); q.append('('); q.append(field.getSolrFieldName()).append(':').append(w); - boost = boosts.get(field); + boost = Boost.RANKING.get(field); if (boost != null) 
q.append('^').append(boost.toString()); q.append(')'); wc++; diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 44006cbc3..279b7b4e2 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -46,6 +46,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.federate.solr.Boost; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; @@ -430,7 +431,8 @@ public final class QueryParams { // construct query final SolrQuery params = new SolrQuery(); params.setParam("defType", "edismax"); - params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^100000.0"); // a boost query that moves double content to the back + float f = Boost.RANKING.get(YaCySchema.fuzzy_signature_unique_b); + params.setParam("bq", YaCySchema.fuzzy_signature_unique_b.getSolrFieldName() + ":true^" + Float.toString(f)); // a boost query that moves double content to the back params.setStart(this.offset); params.setRows(this.itemsPerPage); params.setFacet(false); diff --git a/source/net/yacy/search/query/SearchEventCache.java b/source/net/yacy/search/query/SearchEventCache.java index e3daec4da..de2d2bde8 100644 --- a/source/net/yacy/search/query/SearchEventCache.java +++ b/source/net/yacy/search/query/SearchEventCache.java @@ -31,6 +31,7 @@ import java.util.Map; import java.util.SortedMap; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.cora.federate.solr.Boost; import net.yacy.data.WorkTables; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.MemoryControl; @@ -148,7 +149,9 @@ public class SearchEventCache { Log.logInfo("SearchEventCache", "getEvent: " + lastEvents.size() 
+ " in cache; " + countAliveThreads() + " alive"); // start a new event - final boolean delete = Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true); + Switchboard sb = Switchboard.getSwitchboard(); + final boolean delete = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true); + if (sb != null) Boost.RANKING.update(sb.getConfig(SwitchboardConstants.SEARCH_RANKING_SOLR_BOOST, "")); // update the boost values event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, burstRobinsonPercent, burstMultiwordPercent, delete); MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads } diff --git a/source/net/yacy/server/serverSwitch.java b/source/net/yacy/server/serverSwitch.java index 8cac6cdb4..82321eb5e 100644 --- a/source/net/yacy/server/serverSwitch.java +++ b/source/net/yacy/server/serverSwitch.java @@ -264,7 +264,7 @@ public class serverSwitch * @param dflt default value which will be used in case parameter can not be found or if it is invalid * @return value if the parameter or default value */ - public double getConfigFloat(final String key, final float dflt) { + public float getConfigFloat(final String key, final float dflt) { try { return Float.parseFloat(getConfig(key, Float.toString(dflt))); } catch ( final NumberFormatException e ) {