From df3314ac1a15f1f78f1e9c3601b85ac28c42915d Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Mon, 10 Aug 2015 14:27:44 +0200
Subject: [PATCH] added a new facet type based on a probabilistic classifier
 using bayesian filters. This can be used to classify documents during
 indexing-time using a predefined bayesian filter.

New wordings:
- a context is a class where different categories are possible. The
context name is equal to a facet name.
- a category is a facet type within a facet navigation. Each context
must have several categories, at least one custom name (things you want
to discover) and one with the exact name "negative".

To use this, you must do:
- for each context, you must create a directory within
DATA/CLASSIFICATION with the name of the context (the facet name)
- within each context directory, you must create one text file for
every category; each line of such a file is one example document. One of
these files MUST have the name 'negative.txt'.

Then, each new document is classified to match within one of the given
categories for each context.
---
 defaults/yacy.init                            |   6 +
 htroot/js/yacysearch.js                       |   4 +-
 htroot/yacysearch.java                        |   8 +-
 .../language/synonyms/AutotaggingLibrary.java |   7 +
 .../net/yacy/cora/lod/vocabulary/Tagging.java |   7 +-
 source/net/yacy/document/Document.java        |   7 +-
 .../document/ProbabilisticClassifier.java     | 168 ++++++++++++++++++
 source/net/yacy/search/Switchboard.java       |  15 +-
 .../net/yacy/search/SwitchboardConstants.java |   3 +
 source/net/yacy/search/query/QueryParams.java |   4 +
 source/net/yacy/search/query/SearchEvent.java |  15 +-
 .../schema/CollectionConfiguration.java       |  18 ++
 12 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 source/net/yacy/document/ProbabilisticClassifier.java
diff --git a/defaults/yacy.init b/defaults/yacy.init
index a238d2ed4..352f1b8c6 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -258,6 +258,12 @@ surrogates.out = DATA/SURROGATES/out
 # this directory also contains subdirectories for input sources, the did-you-mean function and other
 dictionaries = DATA/DICTIONARIES
 
+# a path to the classification directory
+# each subdirectory is the name of a context (which becomes a navigator) with '.txt' files
+# containing texts to teach a bayesian filter. One of the files must be named 'negative.txt'.
+# The text files can be created with the Export functionality using the option "Only Text".
+classification = DATA/CLASSIFICATION
+
 # storage place for new releases
 releases = DATA/RELEASE
 
diff --git a/htroot/js/yacysearch.js b/htroot/js/yacysearch.js
index c3994e009..413eabf58 100644
--- a/htroot/js/yacysearch.js
+++ b/htroot/js/yacysearch.js
@@ -52,8 +52,8 @@ function statistics(offset, itemscount, itemsperpage, totalcount, localResourceS
 	  	resnav += "\">&laquo;</a></li>";
 	  }
 	  
-	  numberofpages = Math.floor(Math.min(10, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage)));
-	  if (!numberofpages) numberofpages = 10;
+	  numberofpages = Math.floor(Math.min(9, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage)));
+	  if (!numberofpages) numberofpages = 9;
 	  for (i = 0; i < numberofpages; i++) {
 	      if (i == thispage) {
 	         resnav += "<li class=\"active\"><a href=\"#\">";
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 3706113bb..fee284c8d 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -49,6 +49,7 @@ import net.yacy.cora.federate.FederateSearchManager;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.cora.lod.vocabulary.Tagging.Metatag;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -443,7 +444,12 @@ public class yacysearch {
                 if (p > 0) {
                     String k = vocabulary.substring(0, p);
                     String v = vocabulary.substring(p + 1);
-                    metatags.add(LibraryProvider.autotagging.metatag(k, v));
+                    Metatag mt = LibraryProvider.autotagging.metatag(k, v);
+                    if (mt != null) {
+                        metatags.add(mt);
+                    } else {
+                        
+                    }
                 }
             }
 
diff --git a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
index 54197e6bb..4cf70ce5c 100644
--- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
+++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
@@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import net.yacy.cora.geo.Locations;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.ProbabilisticClassifier;
 
 /**
  * Autotagging provides a set of tag/print-name properties which can be used to
@@ -167,6 +168,12 @@ public class AutotaggingLibrary {
 
     public Tagging.Metatag metatag(String vocName, String term) {
         Tagging tagging = this.vocabularies.get(vocName);
+        // a probabilistic classifier context is a valid navigator without a stored vocabulary;
+        // such a name gets an empty tagging that is created on demand
+        boolean isContext = tagging == null && ProbabilisticClassifier.getContextNames().contains(vocName);
+        if (isContext) tagging = new Tagging(vocName);
+        // the name denotes neither a vocabulary nor a classifier context
+        if (tagging == null) return null;
         return tagging.getMetatagFromTerm(Tagging.decodeMaskname(term));
     }
 
diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java
index f642ca6f3..31fbc9461 100644
--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@@ -90,7 +90,7 @@ public class Tagging {
 
     }
 
-    private Tagging(String name) {
+    public Tagging(String name) {
         this.navigatorName = name;
         this.synonym2term = new ConcurrentHashMap<String, String>();
         this.term2synonym = new ConcurrentHashMap<String, String>();
@@ -544,6 +544,11 @@ public class Tagging {
         return term;
     }
 
+    /**
+     * The metatag class contains the object value for a Linked Open Data RDF triple.
+     * The metatag is created in a tagging environment, which already contains the
+     * subject and the predicate. The metatag is the object of the RDF triple.
+     */
 	public class Metatag {
 	    private final String object;
 	    private Metatag(String object) {
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index f72ea5890..ad50e16ef 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -250,12 +250,17 @@ dc_rights
     /**
      * add the given words to the set of keywords.
      * These keywords will appear in dc_subject
-     * @param tags
+     * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
      */
     protected void addMetatags(Map<String, Set<Tagging.Metatag>> tags) {
         this.generic_facets.putAll(computeGenericFacets(tags));
     }
 
+    /**
+     * compute generic facets
+     * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
+     * @return a map where the key is the navigator name and the value is the set of attributes names
+     */
     public static Map<String, Set<String>> computeGenericFacets(Map<String, Set<Tagging.Metatag>> tags) {
         Map<String, Set<String>> gf = new HashMap<String, Set<String>>();
         for (Map.Entry<String, Set<Tagging.Metatag>> e: tags.entrySet()) {
diff --git a/source/net/yacy/document/ProbabilisticClassifier.java b/source/net/yacy/document/ProbabilisticClassifier.java
new file mode 100644
index 000000000..b729a311c
--- /dev/null
+++ b/source/net/yacy/document/ProbabilisticClassifier.java
@@ -0,0 +1,207 @@
+/**
+ *  ProbabilisticClassifier
+ *  Copyright 2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+ *  first published 06.08.2015 on http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.document;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+
+import net.yacy.cora.bayes.BayesClassifier;
+import net.yacy.cora.bayes.Classification;
+import net.yacy.cora.util.ConcurrentLog;
+
+public class ProbabilisticClassifier {
+
+    /** name of the category that receives the negative examples of every context */
+    public final static String NONE_CATEGORY_NAME = "NONE";
+    public final static Category NONE_CATEGORY = new Category(NONE_CATEGORY_NAME);
+
+    /**
+     * A category is one possible classification result within a context.
+     * Categories with the same name are equal.
+     */
+    public static class Category {
+
+        private final String category_name;
+
+        public Category(String category_name) {
+            this.category_name = category_name;
+        }
+
+        public String getName() {
+            return this.category_name;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (!(o instanceof Category)) return false;
+            return Objects.equals(this.category_name, ((Category) o).category_name);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hashCode(this.category_name);
+        }
+    }
+
+    /**
+     * A context is a bayesian filter which was trained with example lines of several categories.
+     * The context name is equal to the name of the facet that is filled by this context.
+     */
+    public static class Context {
+
+        private final String context_name;
+        private final BayesClassifier<String, Category> bayes;
+
+        /**
+         * train a new context from example files; every line of a file is one example document
+         * @param context_name the name of the context
+         * @param categoryExampleLinesFiles a map from category names to the files with their examples
+         * @param negativeExampleLines the file with examples that match none of the categories
+         * @throws IOException if one of the files cannot be read
+         */
+        public Context(String context_name, Map<String, File> categoryExampleLinesFiles, File negativeExampleLines) throws IOException {
+            this.context_name = context_name;
+            int requiredSize = 0;
+            Map<Category, List<String>> categoryBuffer = new HashMap<>();
+            for (Map.Entry<String, File> category: categoryExampleLinesFiles.entrySet()) {
+                List<String> list = Files.readAllLines(category.getValue().toPath());
+                categoryBuffer.put(new Category(category.getKey()), list);
+                requiredSize += list.size();
+            }
+            // read the negative examples only once and attach them to the shared NONE category
+            List<String> negatives = Files.readAllLines(negativeExampleLines.toPath());
+            categoryBuffer.put(NONE_CATEGORY, negatives);
+            requiredSize += negatives.size();
+
+            this.bayes = new BayesClassifier<>();
+            this.bayes.setMemoryCapacity(requiredSize);
+
+            // every example line is tokenized before it is learned; raw lines are never used as features
+            for (Map.Entry<Category, List<String>> category: categoryBuffer.entrySet()) {
+                for (String line: category.getValue()) {
+                    this.bayes.learn(category.getKey(), normalize(line));
+                }
+            }
+        }
+
+        /**
+         * split a phrase into lower-case word tokens; tokens shorter than three characters are dropped
+         */
+        private List<String> normalize(String phrase) {
+            // Locale.ROOT makes the tokens independent from the default locale of the host
+            String cleanphrase = phrase.toLowerCase(Locale.ROOT).replaceAll("\\W", " ");
+            String[] rawtokens = cleanphrase.split("\\s");
+            List<String> tokens = new ArrayList<>();
+            for (String token: rawtokens) if (token.length() > 2) tokens.add(token);
+            return tokens;
+        }
+
+        public String getName() {
+            return this.context_name;
+        }
+
+        /**
+         * @param phrase the text to be classified
+         * @return the classification which the bayesian filter computes for the tokens of the phrase
+         */
+        public Classification<String, Category> classify(String phrase) {
+            return this.bayes.classify(normalize(phrase));
+        }
+    }
+
+    // The published map is never modified, initialize() replaces it as a whole.
+    // It starts empty so that lookups do not fail while the initialization is still pending.
+    private static volatile Map<String, Context> contexts = new HashMap<>();
+
+    /**
+     * @return a read-only view on the names of all loaded contexts; empty as long as nothing was loaded
+     */
+    public static Set<String> getContextNames() {
+        return Collections.unmodifiableSet(contexts.keySet());
+    }
+
+    public static Context getContext(String contextName) {
+        return contexts.get(contextName);
+    }
+
+    /**
+     * create a new classifier set.
+     * @param path_to_context_directory directory containing contexts which are directories containing .txt files. One of them must be named 'negative.txt'
+     */
+    public static void initialize(File path_to_context_directory) {
+        Map<String, Context> loaded = new HashMap<>();
+        String[] context_candidates = path_to_context_directory.list();
+        if (context_candidates == null) context_candidates = new String[0]; // the path cannot be listed
+        for (String context_candidate: context_candidates) {
+            File ccf = new File(path_to_context_directory, context_candidate);
+            String[] category_candidates = ccf.isDirectory() ? ccf.list() : null;
+            if (category_candidates == null) continue;
+
+            Map<String, File> categoryExampleLinesFiles = new HashMap<>();
+            File negativeExampleLines = null;
+            for (String category_candidate: category_candidates) {
+                if (!category_candidate.endsWith(".txt")) continue;
+                File catcf = new File(ccf, category_candidate);
+                if (category_candidate.equals("negative.txt")) {
+                    negativeExampleLines = catcf;
+                } else {
+                    categoryExampleLinesFiles.put(category_candidate.substring(0, category_candidate.length() - 4), catcf);
+                }
+            }
+
+            // a context is only usable with negative examples and at least one custom category
+            if (negativeExampleLines == null || categoryExampleLinesFiles.isEmpty()) continue;
+            try {
+                loaded.put(context_candidate, new Context(context_candidate, categoryExampleLinesFiles, negativeExampleLines));
+            } catch (IOException e) {
+                ConcurrentLog.logException(e);
+            }
+        }
+        contexts = loaded; // publish the complete set at once
+    }
+
+    /**
+     * compute the classification of a given text. The result is a map with most probable categorizations for each context.
+     * @param text the text to be classified; may be null
+     * @return a map where the key is the navigator name (the bayes context) and the value is the most probable attribute name (the bayes category); empty if text is null
+     */
+    public static Map<String, String> getClassification(String text) {
+        Map<String, String> c = new HashMap<>();
+        if (text == null) return c; // a document without text cannot be classified
+        for (Context context: contexts.values()) {
+            Classification<String, Category> classification = context.classify(text);
+            Category category = classification == null ? null : classification.getCategory();
+            if (category != null) c.put(context.getName(), category.getName());
+        }
+        return c;
+    }
+
+}
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 5a8480935..a61bc4ae1 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -153,6 +153,7 @@ import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.Parser.Failure;
@@ -242,7 +243,7 @@ public final class Switchboard extends serverSwitch {
 
     // storage management
     public File htCachePath;
-    public final File dictionariesPath;
+    public final File dictionariesPath, classificationPath;
     public File listsPath;
     public File htDocsPath;
     public File workPath;
@@ -374,11 +375,20 @@ public final class Switchboard extends serverSwitch {
         }
         
         this.log.config("Work Path:    " + this.workPath.toString());
+
         this.dictionariesPath =
             getDataPath(
                 SwitchboardConstants.DICTIONARY_SOURCE_PATH,
                 SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT);
         this.log.config("Dictionaries Path:" + this.dictionariesPath.toString());
+        if (!this.dictionariesPath.exists()) this.dictionariesPath.mkdirs();
+        
+        this.classificationPath =
+                getDataPath(
+                    SwitchboardConstants.CLASSIFICATION_SOURCE_PATH,
+                    SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT);
+            this.log.config("Classification Path:" + this.classificationPath.toString());
+        if (!this.classificationPath.exists()) this.classificationPath.mkdirs();
 
         CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", false);
         CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true);
@@ -397,6 +407,9 @@ public final class Switchboard extends serverSwitch {
                     Tagging t = LibraryProvider.autotagging.getVocabulary(o);
                     if (t != null) t.setFacet(false);
                 }
+
+                Thread.currentThread().setName("ProbabilisticClassification.initialize");
+                ProbabilisticClassifier.initialize(Switchboard.this.classificationPath);
             }
         }.start();
 
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index 82295624f..32cc6c916 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -413,6 +413,9 @@ public final class SwitchboardConstants {
 
     public static final String DICTIONARY_SOURCE_PATH         = "dictionaries";
     public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES";
+    
+    public static final String CLASSIFICATION_SOURCE_PATH         = "classification";
+    public static final String CLASSIFICATION_SOURCE_PATH_DEFAULT = "DATA/CLASSIFICATION";
 
     /**
      * <p><code>public static final String <strong>HTDOCS_PATH</strong> = "htDocsPath"</code></p>
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index 28316350a..1adafe904 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -54,6 +54,7 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.document.LibraryProvider;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.Tokenizer;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReferenceRow;
@@ -262,6 +263,9 @@ public final class QueryParams {
                 this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
             }
         }
+        for (String context: ProbabilisticClassifier.getContextNames()) {
+            this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+        }
         this.cachedQuery = null;
     }
 
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index 70bf0b998..cdcec498d 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -36,6 +36,7 @@ import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeMap;
@@ -72,6 +73,7 @@ import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
 import net.yacy.document.LargeNumberCache;
 import net.yacy.document.LibraryProvider;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.TextParser;
 import net.yacy.document.Tokenizer;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -882,13 +884,16 @@ public final class SearchEvent {
         }
         
         // get the vocabulary navigation
-        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
-            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+        Set<String> genericFacets = new LinkedHashSet<>();
+        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName());
+        genericFacets.addAll(ProbabilisticClassifier.getContextNames());
+        for (String v: genericFacets) {
+            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
             if (fcts != null) {
-                ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName());
+                ScoreMap<String> vocNav = this.vocabularyNavigator.get(v);
                 if (vocNav == null) {
                     vocNav = new ConcurrentScoreMap<String>();
-                    this.vocabularyNavigator.put(v.getName(), vocNav);
+                    this.vocabularyNavigator.put(v, vocNav);
                 }
                 vocNav.inc(fcts);
             }
@@ -1242,7 +1247,7 @@ public final class SearchEvent {
             
           
             // check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field}
-            // TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL)
+            // TODO: vocabulary is only valid and available in local Solr index (consider to auto-switch to Searchdom.LOCAL)
             if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
                 tagloop: for (Tagging.Metatag tag : this.query.metatags) {
                     SolrDocument sdoc = page;
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 7211a363a..07471e14e 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -81,6 +81,7 @@ import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.SentenceReader;
 import net.yacy.document.Tokenizer;
 import net.yacy.document.content.DCEntry;
@@ -1006,6 +1007,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         return doc;
     }
     
+    /**
+     * attach additional information to the document to enable navigation features
+     * @param doc the document to be enriched
+     * @param synonyms a list of synonyms detected for the text content
+     * @param genericFacets a map where the key is the navigator name and the value is the set of attributes names
+     */
     public void enrich(SolrInputDocument doc, List<String> synonyms, Map<String, Set<String>> genericFacets) {
         remove(doc, CollectionSchema.vocabularies_sxt); // delete old values
         for (SolrInputField sif: doc) {
@@ -1016,6 +1023,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             // there are no pre-defined solr fields for navigation because the vocabulary is generic
             // we use dynamically allocated solr fields for this.
             // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
+            
+            // add to genericFacets the probabilistic categories
+            String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+            Map<String, String> classification = ProbabilisticClassifier.getClassification(text);
+            for (Map.Entry<String, String> entry: classification.entrySet()) {
+                Set<String> facetAttrbutes = new HashSet<>();
+                facetAttrbutes.add(entry.getValue());
+                genericFacets.put(entry.getKey(), facetAttrbutes);
+            }
+            
+            // compute the document field values
             List<String> vocabularies = new ArrayList<>();
             for (Map.Entry<String, Set<String>> facet: genericFacets.entrySet()) {
                 String facetName = facet.getKey();