From c67c5c070929afa26f014aaa46c65dc9c9e7664d Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Tue, 18 Nov 2014 15:02:34 +0100
Subject: [PATCH] added new solr schema fields which record the occurrences of
 vocabulary matches. These matches can be used for result boosting, i.e. if
 a document contains words from a specific vocabulary, boost it.

---
 defaults/solr.collection.schema                   |  4 ++++
 source/net/yacy/migration.java                    |  2 +-
 source/net/yacy/search/query/QueryParams.java     |  4 ++--
 source/net/yacy/search/query/SearchEvent.java     |  4 ++--
 .../search/schema/CollectionConfiguration.java    | 15 ++++++++++++++-
 .../net/yacy/search/schema/CollectionSchema.java  | 14 +++++++++-----
 6 files changed, 32 insertions(+), 11 deletions(-)
diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index a78f61542..d24bb1edb 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -498,3 +498,7 @@ host_extent_i
 
 ## number of matching title expressions
 #ext_title_val
+
+## collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies
+vocabularies_sxt
+    
\ No newline at end of file
diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java
index 8d83e428b..15ccb5593 100644
--- a/source/net/yacy/migration.java
+++ b/source/net/yacy/migration.java
@@ -307,7 +307,7 @@ public class migration {
         omitFields.add("_version_"); // exclude internal Solr std. field from obsolete check
         Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
         for (Tagging v: vocs) { //exclude configured vocabulary index fields (not in CollectionSchema but valid)
-            omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
+            omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
         }        
         CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration();
         ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all);
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index 05297ad05..c9a1e7e21 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -230,7 +230,7 @@ public final class QueryParams {
             // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield))
             if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName());
         }
-        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
+        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
         this.maxfacets = defaultmaxfacets;
         this.cachedQuery = null;
     }
@@ -458,7 +458,7 @@ public final class QueryParams {
         // add vocabulary facets
         if (this.metatags != null) {
             for (Tagging.Metatag tag : this.metatags) {
-                fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_SUFFIX).append(":\"").append(tag.getObject()).append('\"');
+                fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_TERMS_SUFFIX).append(":\"").append(tag.getObject()).append('\"');
             }
         }
 
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index dd64b2f72..146b117d0 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -864,7 +864,7 @@ public final class SearchEvent {
         
         // get the vocabulary navigation
         for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
-            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
+            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
             if (fcts != null) {
                 ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName());
                 if (vocNav == null) {
@@ -1222,7 +1222,7 @@ public final class SearchEvent {
                 tagloop: for (Tagging.Metatag tag : this.query.metatags) {
                     SolrDocument sdoc = page;
                     if (sdoc != null) {
-                        Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_SUFFIX);
+                        Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
                         if (tagvalues != null && tagvalues.contains(tag.getObject())) {
                             continue tagloop; // metatag exists check next tag (filter may consist of several tags)                            
                         } 
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index f153e8070..e98f3ab69 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -967,11 +967,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         // there are no pre-defined solr fields for navigation because the vocabulary is generic
         // we use dynamically allocated solr fields for this.
         // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
+        List<String> vocabularies = new ArrayList<>();
         for (Map.Entry<String, Set<String>> facet: document.getGenericFacets().entrySet()) {
             String facetName = facet.getKey();
             Set<String> facetValues = facet.getValue();
-            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()]));
+            int count = facetValues.size();
+            if (count == 0) continue;
+            int logcount = (int) (Math.log(count) / Math.log(2));
+            Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
+            vocabularies.add(facetName);
+        }
+        if ((allAttr || contains(CollectionSchema.vocabularies_sxt)) && vocabularies.size() > 0) {
+            add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
         }
+        
 
         if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
             List<String> p = new ArrayList<String>();
diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
index baf4331de..aecd14bab 100644
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -229,12 +229,16 @@ public enum CollectionSchema implements SchemaDeclaration {
     ext_tracker_txt(SolrType.text_general, true, true, true, false, false, "names of tracker server"),
     ext_tracker_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in ext_tracker_txt"),
     ext_title_txt(SolrType.text_general, true, true, true, false, false, "names matching title expressions"),
-    ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions");
-
-    public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0
+    ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions"),
+    vocabularies_sxt(SolrType.string, true, true, true, false, false, "collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies");
     
-    public final static String VOCABULARY_PREFIX = "vocabulary_";
-    public final static String VOCABULARY_SUFFIX = "_sxt";
+    public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0
+
+    public final static String VOCABULARY_PREFIX = "vocabulary_"; // collects all terms that appear for each vocabulary
+    public final static String VOCABULARY_TERMS_SUFFIX = "_sxt"; // suffix for the term collector that start with VOCABULARY_PREFIX - middle part is vocabulary name
+    public final static String VOCABULARY_COUNT_SUFFIX = "_i"; // suffix for the term counter (>=1) that start with VOCABULARY_PREFIX - middle part is vocabulary name
+    public final static String VOCABULARY_LOGCOUNT_SUFFIX = "_log_i"; // log2(VOCABULARY_COUNT) -- can be used for ranking boosts based on the number of occurrences
+    public final static String VOCABULARY_LOGCOUNTS_SUFFIX = "_log_val"; // all integers from [0 to log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences
     
     private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
     private final SolrType type;