added new solr schema fields which record the occurrences of vocabulary

matchings. These matches can be used for result boosting, i.e. if a document contains words from a specific vocabulary, boost it.
10 years ago · c67c5c0709
parent a67a465415
commit c67c5c0709
6 changed files with 32 additions and 11 deletions
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@ -498,3 +498,7 @@ host_extent_i

 ## number of matching title expressions
 #ext_title_val
+
+## collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies
+vocabularies_sxt
+    
--- a/source/net/yacy/migration.java
+++ b/source/net/yacy/migration.java
@ -307,7 +307,7 @@ public class migration {
        omitFields.add("_version_"); // exclude internal Solr std. field from obsolete check
        Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
        for (Tagging v: vocs) { //exclude configured vocabulary index fields (not in CollectionSchema but valid)
-            omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
+            omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
        }        
        CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration();
        ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all);
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@ -230,7 +230,7 @@ public final class QueryParams {
            // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield))
            if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName());
        }
-        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
+        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
        this.maxfacets = defaultmaxfacets;
        this.cachedQuery = null;
    }
@ -458,7 +458,7 @@ public final class QueryParams {
        // add vocabulary facets
        if (this.metatags != null) {
            for (Tagging.Metatag tag : this.metatags) {
-                fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_SUFFIX).append(":\"").append(tag.getObject()).append('\"');
+                fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_TERMS_SUFFIX).append(":\"").append(tag.getObject()).append('\"');
            }
        }

--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@ -864,7 +864,7 @@ public final class SearchEvent {
        
        // get the vocabulary navigation
        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
-            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX);
+            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
            if (fcts != null) {
                ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName());
                if (vocNav == null) {
@ -1222,7 +1222,7 @@ public final class SearchEvent {
                tagloop: for (Tagging.Metatag tag : this.query.metatags) {
                    SolrDocument sdoc = page;
                    if (sdoc != null) {
-                        Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_SUFFIX);
+                        Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
                        if (tagvalues != null && tagvalues.contains(tag.getObject())) {
                            continue tagloop; // metatag exists check next tag (filter may consist of several tags)                            
                        } 
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -967,11 +967,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        // there are no pre-defined solr fields for navigation because the vocabulary is generic
        // we use dynamically allocated solr fields for this.
        // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
+        List<String> vocabularies = new ArrayList<>();
        for (Map.Entry<String, Set<String>> facet: document.getGenericFacets().entrySet()) {
            String facetName = facet.getKey();
            Set<String> facetValues = facet.getValue();
-            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()]));
+            int count = facetValues.size();
+            if (count == 0) continue;
+            int logcount = (int) (Math.log(count) / Math.log(2));
+            Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
+            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
+            vocabularies.add(facetName);
+        }
+        if ((allAttr || contains(CollectionSchema.vocabularies_sxt)) && vocabularies.size() > 0) {
+            add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
        }
+        

        if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
            List<String> p = new ArrayList<String>();
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@ -229,12 +229,16 @@ public enum CollectionSchema implements SchemaDeclaration {
    ext_tracker_txt(SolrType.text_general, true, true, true, false, false, "names of tracker server"),
    ext_tracker_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in ext_tracker_txt"),
    ext_title_txt(SolrType.text_general, true, true, true, false, false, "names matching title expressions"),
-    ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions");
-
-    public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0
+    ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions"),
+    vocabularies_sxt(SolrType.string, true, true, true, false, false, "collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies");
    
-    public final static String VOCABULARY_PREFIX = "vocabulary_";
-    public final static String VOCABULARY_SUFFIX = "_sxt";
+    public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0
+
+    public final static String VOCABULARY_PREFIX = "vocabulary_"; // collects all terms that appear for each vocabulary
+    public final static String VOCABULARY_TERMS_SUFFIX = "_sxt"; // suffix for the term collector that start with VOCABULARY_PREFIX - middle part is vocabulary name
+    public final static String VOCABULARY_COUNT_SUFFIX = "_i"; // suffix for the term counter (>=1) that start with VOCABULARY_PREFIX - middle part is vocabulary name
+    public final static String VOCABULARY_LOGCOUNT_SUFFIX = "_log_i"; // log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences
+    public final static String VOCABULARY_LOGCOUNTS_SUFFIX = "_log_val"; // all integers from [0 to log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences
    
    private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
    private final SolrType type;