From 922979aae17203d674a2383cc2adedc5ce6c43df Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 2 Jun 2014 17:40:56 +0200 Subject: [PATCH] added option to prefer http over https in unique-protocol ranking --- defaults/yacy.init | 6 ++++++ source/net/yacy/search/Switchboard.java | 5 ++++- .../net/yacy/search/schema/CollectionConfiguration.java | 9 ++++++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 8aad8cd88..5bbe3ecff 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1019,6 +1019,12 @@ search.ranking.solr.collection.boostfunction.tmpb.3= search.ranking.solr.doubledetection.minlength=3 search.ranking.solr.doubledetection.quantrate=0.5f +# Another heuristic for double content is a 'greedy' ignoring of urls where an http url is present for each https url and vice versa. +# The same may be true for documents with and without a leading 'www.' subdomain. +# The following attributes cause https to be preferred over http and with-www to be preferred over without-www. +search.ranking.uniqueheuristic.preferhttps = true +search.ranking.uniqueheuristic.preferwwwprefix = true + #optional extern thumbnail program. 
#the program must accept the invocation PROGRAM http://url /path/to/filename thumbnailProgram = diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 42f769fad..7feb9b1c3 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -98,7 +98,6 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.SchemaConfiguration; -import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; @@ -368,6 +367,10 @@ public final class Switchboard extends serverSwitch { SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT); this.log.config("Dictionaries Path:" + this.dictionariesPath.toString()); + CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", true); + CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true); + + // init libraries this.log.config("initializing libraries"); new Thread() { diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index e121c34aa..92ca03b25 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -97,6 +97,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri private static final long serialVersionUID=-499100932212840385L; + public static boolean UNIQUE_HEURISTIC_PREFER_HTTPS = true; + public static boolean UNIQUE_HEURISTIC_PREFER_WWWPREFIX = true; + private final ArrayList rankings; /** @@ -476,9 +479,9 @@ public class CollectionConfiguration extends 
SchemaConfiguration implements Seri add(doc, CollectionSchema.synonyms_sxt, synonyms); } - // unique-fields; these values must be corrected during postprocessing. - add(doc, CollectionSchema.http_unique_b, digestURL.isHTTPS()); // this must be corrected afterwards during storage! - add(doc, CollectionSchema.www_unique_b, host != null && host.startsWith("www.")); // this must be corrected afterwards during storage! + // unique-fields; these values must be corrected during postprocessing. (the following logic is effectively a not-xor (!^) of the preference flag and the url property, but it is deliberately written out explicitly) + add(doc, CollectionSchema.http_unique_b, UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage! + add(doc, CollectionSchema.www_unique_b, host != null && (UNIQUE_HEURISTIC_PREFER_WWWPREFIX ? host.startsWith("www.") : !host.startsWith("www."))); // this must be corrected afterwards during storage! add(doc, CollectionSchema.exact_signature_l, condenser.exactSignature()); add(doc, CollectionSchema.exact_signature_unique_b, true); // this must be corrected afterwards during storage!