Added an option to limit the size of remote solr documents put to

local index. See mantis #626.
pull/34/head
luc 9 years ago
parent 8827b86b2a
commit 8c4ab9c76b

@ -873,6 +873,8 @@ search.verify.delete = true
remotesearch.maxcount = 10
remotesearch.maxtime = 3000
remotesearch.result.store=true
# Maximum size allowed (in kbytes) for a remote document result to be stored to local index. Defaults to -1, which means no limit.
remotesearch.result.store.maxsize=-1
remotesearch.maxload.rwi=8.0
remotesearch.maxload.solr=4.0

@ -66,7 +66,11 @@
<dt>Index remote results</dt>
<dd>
<input type="checkbox" name="remotesearch.result.store" value="true" #(remotesearch.result.store)#::checked="checked"#(/remotesearch.result.store)# /> add remote search results to the local index <b>( default=on, it is recommended to enable this option ! )</b>
<input type="checkbox" name="remotesearch.result.store" value="true" #(remotesearch.result.store)#::checked="checked"#(/remotesearch.result.store)# /> add remote search results to the local index <b>( default=on, it is recommended to enable this option ! )</b>
</dd>
<dt>Limit size of indexed remote results</dt>
<dd>
<input type="text" name="remotesearch.result.store.maxsize" value="#[remotesearch.result.store.maxsize]#" size="10" /> maximum allowed size in kbytes for each remote search result to be added to the local index (for example, a 1000kbytes limit might be useful if you are running YaCy with a low memory setup)
</dd>
<dt>Default Pop-Up Page</dt>

@ -93,6 +93,7 @@ public class ConfigPortal {
final boolean storeresult = post.getBoolean(SwitchboardConstants.REMOTESEARCH_RESULT_STORE);
sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, storeresult);
sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, post.getLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, post.get("search.verify", "ifexist"));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, post.getBoolean("search.verify.delete"));
@ -148,6 +149,7 @@ public class ConfigPortal {
sb.setConfig("search.options", config.getProperty("search.options","true"));
sb.setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, config.getProperty(SwitchboardConstants.GREEDYLEARNING_ACTIVE));
sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE));
sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, config.getProperty(SwitchboardConstants.SEARCH_VERIFY,"iffresh"));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, config.getProperty(SwitchboardConstants.SEARCH_VERIFY_DELETE,"true"));
sb.setConfig("about.headline", config.getProperty("about.headline",""));
@ -170,6 +172,12 @@ public class ConfigPortal {
prop.put(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, sb.getConfig(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, "0"));
prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, sb.getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true) ? 1 : 0);
long resultStoredMaxSize = sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1);
if(resultStoredMaxSize > 0) {
prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, resultStoredMaxSize);
} else {
prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, "");
}
prop.put("search.verify.nocache", sb.getConfig("search.verify", "").equals("nocache") ? 1 : 0);
prop.put("search.verify.iffresh", sb.getConfig("search.verify", "").equals("iffresh") ? 1 : 0);

@ -62,6 +62,15 @@ import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.migration;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.analysis.Classification;
@ -120,15 +129,6 @@ import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
import net.yacy.utils.crypt;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.FacetField.Count;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
public final class Protocol {
@ -929,6 +929,18 @@ public final class Protocol {
private final static CollectionSchema[] snippetFields = new CollectionSchema[]{CollectionSchema.description_txt, CollectionSchema.h4_txt, CollectionSchema.h3_txt, CollectionSchema.h2_txt, CollectionSchema.h1_txt, CollectionSchema.text_t};
/**
* Execute solr query against specified target.
* @param event search event to feed with results
* @param solrQuery solr query
* @param offset pagination start index
* @param count expected maximum results
* @param target target peer to query. May be null : in that case, local peer is queried.
* @param partitions
* @param blacklist url list to exclude from results
* @return the size of results list
* @throws InterruptedException when interrupt status on calling thread is detected while processing
*/
protected static int solrQuery(
final SearchEvent event,
final SolrQuery solrQuery,
@ -1125,12 +1137,17 @@ public final class Protocol {
// put the remote documents to the local index. We must convert the solr document to a solr input document:
if (event.addResultsToLocalIndex) {
final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
// the input document stays untouched because it contains top-level cloned objects
docs.add(sid);
// will be stored to index, and is a full solr document, can be added to firstseen
event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
/* Check document size, only if a limit is set on remote documents size allowed to be stored to local index */
if(checkDocumentSize(doc, event.getRemoteDocStoredMaxSize() * 1024)) {
final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
// the input document stays untouched because it contains top-level cloned objects
docs.add(sid);
// will be stored to index, and is a full solr document, can be added to firstseen
event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
} else {
Network.log.info("Document size greater than " + event.getRemoteDocStoredMaxSize() + " kbytes, excludes it from being stored to local index. Url : " + urlEntry.urlstring());
}
}
// after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document
@ -1172,6 +1189,33 @@ public final class Protocol {
}
return dls;
}
/**
 * Decide whether a document is small enough with regard to a size limit.
 * The limit is enforced only when maxSize is strictly positive. The
 * document size is estimated from the text_t field alone rather than from
 * every field: this keeps the check cheap, and that field is usually the
 * largest one. Each character of the field is counted as 2 bytes.
 *
 * @param doc
 *            document to verify. Must not be null.
 * @param maxSize
 *            maximum allowed size in bytes. Zero or a negative value
 *            disables the check.
 * @return false only when maxSize is positive and the text_t field is a
 *         String whose estimated size exceeds maxSize; true in every
 *         other case.
 */
protected static boolean checkDocumentSize(SolrDocument doc, long maxSize) {
    if (maxSize <= 0) {
        /* No limit requested : the document is not even inspected */
        return true;
    }
    final Object fullText = doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
    if (!(fullText instanceof String)) {
        /* Field missing or not a plain string : nothing to measure */
        return true;
    }
    /* 2 bytes per char, so the allowed char count is half the byte limit */
    final long maxChars = maxSize / 2;
    return ((String) fullText).length() <= maxChars;
}
public static Map<String, String> permissionMessage(final String targetAddress, final String targetHash) {
// ask for allowed message size and attachment size

@ -332,6 +332,8 @@ public final class SwitchboardConstants {
public static final String REMOTESEARCH_MAXCOUNT_USER = "remotesearch.maxcount";
public static final String REMOTESEARCH_MAXTIME_USER = "remotesearch.maxtime";
public static final String REMOTESEARCH_RESULT_STORE = "remotesearch.result.store"; // add remote results to local index
/** Maximum size allowed (in kbytes) for a remote document result to be stored to local index */
public static final String REMOTESEARCH_RESULT_STORE_MAXSIZE= "remotesearch.result.store.maxsize";
public static final String REMOTESEARCH_MAXLOAD_RWI = "remotesearch.maxload.rwi";
public static final String REMOTESEARCH_MAXLOAD_SOLR = "remotesearch.maxload.solr";

@ -161,6 +161,8 @@ public final class SearchEvent {
private ConcurrentHashMap<String, LinkedHashSet<String>> snippets;
private final boolean remote;
public final boolean addResultsToLocalIndex; // add received results to local index (defult=true)
/** Maximum size allowed (in kbytes) for a remote document result to be stored to local index */
private long remoteStoredDocMaxSize;
private SortedMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private final ScoreMap<String> ref; // reference score computation for the commonSense heuristic
private final long maxtime;
@ -198,6 +200,22 @@ public final class SearchEvent {
);
}
/**
 * Define the size limit (in kbytes) above which a remote document result
 * is not stored to the local index.
 * @param maxSize maximum document content size in kbytes. Zero or a negative value means no limit.
 */
public void setRemoteDocStoredMaxSize(long maxSize) {
    remoteStoredDocMaxSize = maxSize;
}
/**
 * @return the size limit (in kbytes) above which a remote document result
 * is not stored to the local index. Zero or a negative value means no limit.
 */
public long getRemoteDocStoredMaxSize() {
    return remoteStoredDocMaxSize;
}
protected SearchEvent(
final QueryParams query,
final SeedDB peers,
@ -261,6 +279,8 @@ public final class SearchEvent {
this.IAneardhthash = null;
this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false)));
this.addResultsToLocalIndex = addResultsToLocalIdx;
/* Default : no size limit to store remote result documents to local index. Use the setter to change it if needed. */
this.remoteStoredDocMaxSize = -1;
this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering
this.local_rwi_stored = new AtomicInteger(0);
this.local_solr_available = new AtomicInteger(0);

@ -174,6 +174,10 @@ public class SearchEventCache {
|| (sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) && sb.peers.mySeed().getFlagAcceptRemoteIndex());
final boolean addToLocalIdx = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true);
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, delete, addToLocalIdx);
/* An optional configuration setting may limit the size of remote documents added to the local index */
if(sb != null) {
event.setRemoteDocStoredMaxSize(sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_RESULT_STORE_MAXSIZE, -1));
}
MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads
}

Loading…
Cancel
Save