classification = context.classify(text);
+ String contextname = context.getName();
+ Category category = classification.getCategory();
+ String categoryname = category.getName();
+ c.put(contextname, categoryname);
+ }
+ return c;
+ }
+
+}
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 5a8480935..a61bc4ae1 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -153,6 +153,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
+import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.Parser.Failure;
@@ -242,7 +243,7 @@ public final class Switchboard extends serverSwitch {
// storage management
public File htCachePath;
- public final File dictionariesPath;
+ public final File dictionariesPath, classificationPath;
public File listsPath;
public File htDocsPath;
public File workPath;
@@ -374,11 +375,20 @@ public final class Switchboard extends serverSwitch {
}
this.log.config("Work Path: " + this.workPath.toString());
+
this.dictionariesPath =
getDataPath(
SwitchboardConstants.DICTIONARY_SOURCE_PATH,
SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT);
this.log.config("Dictionaries Path:" + this.dictionariesPath.toString());
+ if (!this.dictionariesPath.exists()) this.dictionariesPath.mkdirs();
+
+ this.classificationPath =
+ getDataPath(
+ SwitchboardConstants.CLASSIFICATION_SOURCE_PATH,
+ SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT);
+ this.log.config("Classification Path:" + this.classificationPath.toString());
+ if (!this.classificationPath.exists()) this.classificationPath.mkdirs();
CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", false);
CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true);
@@ -397,6 +407,9 @@ public final class Switchboard extends serverSwitch {
Tagging t = LibraryProvider.autotagging.getVocabulary(o);
if (t != null) t.setFacet(false);
}
+
+ Thread.currentThread().setName("ProbabilisticClassification.initialize");
+ ProbabilisticClassifier.initialize(Switchboard.this.classificationPath);
}
}.start();
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index 82295624f..32cc6c916 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -413,6 +413,9 @@ public final class SwitchboardConstants {
public static final String DICTIONARY_SOURCE_PATH = "dictionaries";
public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES";
+
+ public static final String CLASSIFICATION_SOURCE_PATH = "classification";
+ public static final String CLASSIFICATION_SOURCE_PATH_DEFAULT = "DATA/CLASSIFICATION";
/**
* public static final String HTDOCS_PATH = "htDocsPath"
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index 28316350a..1adafe904 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -54,6 +54,7 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.document.LibraryProvider;
+import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.Tokenizer;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReferenceRow;
@@ -262,6 +263,9 @@ public final class QueryParams {
this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
}
}
+ for (String context: ProbabilisticClassifier.getContextNames()) {
+ this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+ }
this.cachedQuery = null;
}
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index 70bf0b998..cdcec498d 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -36,6 +36,7 @@ import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
@@ -72,6 +73,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.document.LargeNumberCache;
import net.yacy.document.LibraryProvider;
+import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.TextParser;
import net.yacy.document.Tokenizer;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -882,13 +884,16 @@ public final class SearchEvent {
}
// get the vocabulary navigation
- for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
- fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+ Set genericFacets = new LinkedHashSet<>();
+ for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName());
+ genericFacets.addAll(ProbabilisticClassifier.getContextNames());
+ for (String v: genericFacets) {
+ fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
if (fcts != null) {
- ScoreMap vocNav = this.vocabularyNavigator.get(v.getName());
+ ScoreMap vocNav = this.vocabularyNavigator.get(v);
if (vocNav == null) {
vocNav = new ConcurrentScoreMap();
- this.vocabularyNavigator.put(v.getName(), vocNav);
+ this.vocabularyNavigator.put(v, vocNav);
}
vocNav.inc(fcts);
}
@@ -1242,7 +1247,7 @@ public final class SearchEvent {
// check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field}
- // TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL)
+ // TODO: vocabulary is only valid and available in local Solr index (consider auto-switching to Searchdom.LOCAL)
if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
tagloop: for (Tagging.Metatag tag : this.query.metatags) {
SolrDocument sdoc = page;
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 7211a363a..07471e14e 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -81,6 +81,7 @@ import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
+import net.yacy.document.ProbabilisticClassifier;
import net.yacy.document.SentenceReader;
import net.yacy.document.Tokenizer;
import net.yacy.document.content.DCEntry;
@@ -1006,6 +1007,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
return doc;
}
+ /**
+ * attach additional information to the document to enable navigation features
+ * @param doc the document to be enriched
+ * @param synonyms a list of synonyms detected for the text content
+ * @param genericFacets a map where the key is the navigator name and the value is the set of attribute names
+ */
public void enrich(SolrInputDocument doc, List synonyms, Map> genericFacets) {
remove(doc, CollectionSchema.vocabularies_sxt); // delete old values
for (SolrInputField sif: doc) {
@@ -1016,6 +1023,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
+
+ // add the probabilistic categories to genericFacets
+ String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+ Map classification = ProbabilisticClassifier.getClassification(text);
+ for (Map.Entry entry: classification.entrySet()) {
+ Set facetAttrbutes = new HashSet<>();
+ facetAttrbutes.add(entry.getValue());
+ genericFacets.put(entry.getKey(), facetAttrbutes);
+ }
+
+ // compute the document field values
List vocabularies = new ArrayList<>();
for (Map.Entry> facet: genericFacets.entrySet()) {
String facetName = facet.getKey();