added option to prefer http over https in unique-protocol ranking

pull/1/head
Michael Peter Christen 11 years ago
parent b3b174e2b8
commit 922979aae1

@ -1019,6 +1019,12 @@ search.ranking.solr.collection.boostfunction.tmpb.3=
search.ranking.solr.doubledetection.minlength=3
search.ranking.solr.doubledetection.quantrate=0.5f
# Another heuristic for double content is a 'greedy' rule: a http url is ignored if the same url is also present with https, and vice versa.
# The same may be true for documents with and without a leading 'www.' subdomain.
# If the following attributes are set to true, https is preferred over http and with-www is preferred over without-www;
# if they are set to false, http is preferred over https and without-www is preferred over with-www.
search.ranking.uniqueheuristic.preferhttps = true
search.ranking.uniqueheuristic.preferwwwprefix = true
#optional extern thumbnail program.
#the program must accept the invocation PROGRAM http://url /path/to/filename
thumbnailProgram =

@ -98,7 +98,6 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
@ -368,6 +367,10 @@ public final class Switchboard extends serverSwitch {
SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT);
this.log.config("Dictionaries Path:" + this.dictionariesPath.toString());
CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", true);
CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true);
// init libraries
this.log.config("initializing libraries");
new Thread() {

@ -97,6 +97,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
private static final long serialVersionUID=-499100932212840385L;
public static boolean UNIQUE_HEURISTIC_PREFER_HTTPS = true;
public static boolean UNIQUE_HEURISTIC_PREFER_WWWPREFIX = true;
private final ArrayList<Ranking> rankings;
/**
@ -476,9 +479,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.synonyms_sxt, synonyms);
}
// unique-fields; these values must be corrected during postprocessing.
add(doc, CollectionSchema.http_unique_b, digestURL.isHTTPS()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, host != null && host.startsWith("www.")); // this must be corrected afterwards during storage!
// unique-fields; these values must be corrected during postprocessing. (the following ternaries amount to a not-xor (!^) of the preference flag and the url property; they are written out explicitly on purpose for readability)
add(doc, CollectionSchema.http_unique_b, UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.www_unique_b, host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage!
add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature());
add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!

Loading…
Cancel
Save