added new solr schema fields which record the occurrences of vocabulary

matchings. These matches can be used for result boosting, i.e. if a
document contains words from a specific vocabulary, boost it.
pull/1/head
Michael Peter Christen 10 years ago
parent a67a465415
commit c67c5c0709

@ -498,3 +498,7 @@ host_extent_i
## number of matching title expressions ## number of matching title expressions
#ext_title_val #ext_title_val
## collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies
vocabularies_sxt

@ -307,7 +307,7 @@ public class migration {
omitFields.add("_version_"); // exclude internal Solr std. field from obsolete check omitFields.add("_version_"); // exclude internal Solr std. field from obsolete check
Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies(); Collection<Tagging> vocs = LibraryProvider.autotagging.getVocabularies();
for (Tagging v: vocs) { //exclude configured vocabulary index fields (not in CollectionSchema but valid) for (Tagging v: vocs) { //exclude configured vocabulary index fields (not in CollectionSchema but valid)
omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); omitFields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
} }
CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration(); CollectionConfiguration colcfg = Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration();
ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all); ReindexSolrBusyThread reidx = new ReindexSolrBusyThread(null); // ("*:*" would reindex all);

@ -230,7 +230,7 @@ public final class QueryParams {
// handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield)) // handle special field, authors_sxt (add to facet w/o contains check, as authors_sxt is not enabled (is copyfield))
if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName()); if (f != null && (solrSchema.contains(f) || f.name().equals("author_sxt"))) this.facetfields.add(f.getSolrFieldName());
} }
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); for (Tagging v: LibraryProvider.autotagging.getVocabularies()) this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
this.maxfacets = defaultmaxfacets; this.maxfacets = defaultmaxfacets;
this.cachedQuery = null; this.cachedQuery = null;
} }
@ -458,7 +458,7 @@ public final class QueryParams {
// add vocabulary facets // add vocabulary facets
if (this.metatags != null) { if (this.metatags != null) {
for (Tagging.Metatag tag : this.metatags) { for (Tagging.Metatag tag : this.metatags) {
fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_SUFFIX).append(":\"").append(tag.getObject()).append('\"'); fq.append(" AND ").append(CollectionSchema.VOCABULARY_PREFIX).append(tag.getVocabularyName()).append(CollectionSchema.VOCABULARY_TERMS_SUFFIX).append(":\"").append(tag.getObject()).append('\"');
} }
} }

@ -864,7 +864,7 @@ public final class SearchEvent {
// get the vocabulary navigation // get the vocabulary navigation
for (Tagging v: LibraryProvider.autotagging.getVocabularies()) { for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_SUFFIX); fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
if (fcts != null) { if (fcts != null) {
ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName()); ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName());
if (vocNav == null) { if (vocNav == null) {
@ -1222,7 +1222,7 @@ public final class SearchEvent {
tagloop: for (Tagging.Metatag tag : this.query.metatags) { tagloop: for (Tagging.Metatag tag : this.query.metatags) {
SolrDocument sdoc = page; SolrDocument sdoc = page;
if (sdoc != null) { if (sdoc != null) {
Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_SUFFIX); Collection<Object> tagvalues = sdoc.getFieldValues(CollectionSchema.VOCABULARY_PREFIX + tag.getVocabularyName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
if (tagvalues != null && tagvalues.contains(tag.getObject())) { if (tagvalues != null && tagvalues.contains(tag.getObject())) {
continue tagloop; // metatag exists check next tag (filter may consist of several tags) continue tagloop; // metatag exists check next tag (filter may consist of several tags)
} }

@ -967,11 +967,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// there are no pre-defined solr fields for navigation because the vocabulary is generic // there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this. // we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
List<String> vocabularies = new ArrayList<>();
for (Map.Entry<String, Set<String>> facet: document.getGenericFacets().entrySet()) { for (Map.Entry<String, Set<String>> facet: document.getGenericFacets().entrySet()) {
String facetName = facet.getKey(); String facetName = facet.getKey();
Set<String> facetValues = facet.getValue(); Set<String> facetValues = facet.getValue();
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_SUFFIX, facetValues.toArray(new String[facetValues.size()])); int count = facetValues.size();
if (count == 0) continue;
int logcount = (int) (Math.log(count) / Math.log(2));
Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
vocabularies.add(facetName);
}
if ((allAttr || contains(CollectionSchema.vocabularies_sxt)) && vocabularies.size() > 0) {
add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
} }
if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) { if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
List<String> p = new ArrayList<String>(); List<String> p = new ArrayList<String>();

@ -229,12 +229,16 @@ public enum CollectionSchema implements SchemaDeclaration {
ext_tracker_txt(SolrType.text_general, true, true, true, false, false, "names of tracker server"), ext_tracker_txt(SolrType.text_general, true, true, true, false, false, "names of tracker server"),
ext_tracker_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in ext_tracker_txt"), ext_tracker_val(SolrType.num_integer, true, true, true, false, false, "number of attribute counts in ext_tracker_txt"),
ext_title_txt(SolrType.text_general, true, true, true, false, false, "names matching title expressions"), ext_title_txt(SolrType.text_general, true, true, true, false, false, "names matching title expressions"),
ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions"); ext_title_val(SolrType.num_integer, true, true, true, false, false, "number of matching title expressions"),
vocabularies_sxt(SolrType.string, true, true, true, false, false, "collection of all vocabulary names that have a matcher in the document - use this to boost with vocabularies");
public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0
public final static String VOCABULARY_PREFIX = "vocabulary_"; public final static String CORE_NAME = "collection1"; // this was the default core name up to Solr 4.4.0. This default name was stored in CoreContainer.DEFAULT_DEFAULT_CORE_NAME but was removed in Solr 4.5.0
public final static String VOCABULARY_SUFFIX = "_sxt";
public final static String VOCABULARY_PREFIX = "vocabulary_"; // collects all terms that appear for each vocabulary
public final static String VOCABULARY_TERMS_SUFFIX = "_sxt"; // suffix for the term collector that start with VOCABULARY_PREFIX - middle part is vocabulary name
public final static String VOCABULARY_COUNT_SUFFIX = "_i"; // suffix for the term counter (>=1) that start with VOCABULARY_PREFIX - middle part is vocabulary name
public final static String VOCABULARY_LOGCOUNT_SUFFIX = "_log_i"; // floor(log2(VOCABULARY_COUNT)) -- can be used for ranking boosts based on the number of occurrences
public final static String VOCABULARY_LOGCOUNTS_SUFFIX = "_log_val"; // all integers from [0 to log2(VOCABULARY_COUNT)] -- can be used for ranking boosts based on the number of occurrences
private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() ) private String solrFieldName = null; // solr field name in custom solr schema, defaults to solcell schema field name (= same as this.name() )
private final SolrType type; private final SolrType type;

Loading…
Cancel
Save