From c67c5c070929afa26f014aaa46c65dc9c9e7664d Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 18 Nov 2014 15:02:34 +0100 Subject: [PATCH] added new solr schema fields which record the occurrences of vocabulary matchings. These matches can be used for result boosting, i.e. if a document contains words from a specific vocabulary, boost it. --- defaults/solr.collection.schema | 4 ++++ source/net/yacy/migration.java | 2 +- source/net/yacy/search/query/QueryParams.java | 4 ++-- source/net/yacy/search/query/SearchEvent.java | 4 ++-- .../search/schema/CollectionConfiguration.java | 15 ++++++++++++++- .../net/yacy/search/schema/CollectionSchema.java | 14 +++++++++----- 6 files changed, 32 insertions(+), 11 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index a78f61542..d24bb1edb 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -498,3 +498,7 @@ host_extent_i ## number of matching title expressions #ext_title_val + +## collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies +vocabularies_sxt + \ No newline at end of file diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java index 8d83e428b..15ccb5593 100644 --- a/source/net/yacy/migration.java +++ b/source/net/yacy/migration.java @@ -307,7 +307,7 @@ public class migration { omitFields.add("_version_"); // exclude internal Solr std. 
field from obsolete check Collection vocs = LibraryProvider.autotagging.getVocabularies(); for (Tagging v: vocs) { //exclude configured vocabulary index fields (not in CollectionSchema but valid) - omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); + omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); } CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration(); ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 05297ad05..c9a1e7e21 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -230,7 +230,7 @@ public final class QueryParams { // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield)) if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName()); } - for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); + for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); this.maxfacets = defaultmaxfacets; this.cachedQuery = null; } @@ -458,7 +458,7 @@ public final class QueryParams { // add vocabulary facets if (this.metatags != null) { for (Tagging.Metatag tag : this.metatags) { - fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_SUFFIX).append(":\"").append(tag.getObject()).append('\"'); + fq.append(" AND 
").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_TERMS_SUFFIX).append(":\"").append(tag.getObject()).append('\"'); } } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index dd64b2f72..146b117d0 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -864,7 +864,7 @@ public final class SearchEvent { // get the vocabulary navigation for (Tagging v: LibraryProvider.autotagging.getVocabularies()) { - fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); + fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); if (fcts != null) { ScoreMap vocNav = this.vocabularyNavigator.get(v.getName()); if (vocNav == null) { @@ -1222,7 +1222,7 @@ public final class SearchEvent { tagloop: for (Tagging.Metatag tag : this.query.metatags) { SolrDocument sdoc = page; if (sdoc != null) { - Collection tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_SUFFIX); + Collection tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); if (tagvalues != null && tagvalues.contains(tag.getObject())) { continue tagloop; // metatag exists check next tag (filter may consist of several tags) } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index f153e8070..e98f3ab69 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -967,11 +967,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // there are no pre-defined solr fields for navigation because the vocabulary is 
generic // we use dynamically allocated solr fields for this. // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names + List vocabularies = new ArrayList<>(); for (Map.Entry> facet: document.getGenericFacets().entrySet()) { String facetName = facet.getKey(); Set facetValues = facet.getValue(); - doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()])); + int count = facetValues.size(); + if (count == 0) continue; + int logcount = (int) (Math.log(count) / Math.log(2)); + Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i; + doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count])); + doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size()); + doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount); + doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts); + vocabularies.add(facetName); + } + if ((allAttr || contains(CollectionSchema.vocabularies_sxt)) && vocabularies.size() > 0) { + add(doc, CollectionSchema.vocabularies_sxt, vocabularies); } + if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) { List p = new ArrayList(); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index baf4331de..aecd14bab 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -229,12 +229,16 @@ public enum CollectionSchema implements SchemaDeclaration { ext_tracker_txt(SolrType.text_general, true, true, true, false, false, "names of tracker server"), 
ext_tracker_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in ext_tracker_txt"), ext_title_txt(SolrType.text_general, true, true, true, false, false, "names matching title expressions"), - ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions"); - - public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0 + ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions"), + vocabularies_sxt(SolrType.string, true, true, true, false, false, "collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies"); - public final static String VOCABULARY_PREFIX = "vocabulary_"; - public final static String VOCABULARY_SUFFIX = "_sxt"; + public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. 
This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0 + + public final static String VOCABULARY_PREFIX = "vocabulary_"; // collects all terms that appear for each vocabulary + public final static String VOCABULARY_TERMS_SUFFIX = "_sxt"; // suffix for the term collector that starts with VOCABULARY_PREFIX - middle part is vocabulary name + public final static String VOCABULARY_COUNT_SUFFIX = "_i"; // suffix for the term counter (>=1) that starts with VOCABULARY_PREFIX - middle part is vocabulary name + public final static String VOCABULARY_LOGCOUNT_SUFFIX = "_log_i"; // log2(VOCABULARY_COUNT) -- can be used for ranking boosts based on the number of occurrences + public final static String VOCABULARY_LOGCOUNTS_SUFFIX = "_log_val"; // all integers from [0 to log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private final SolrType type;