added a new facet type based on a probabilistic classifier using

bayesian filters. This can be used to classify documents at indexing time using a pre-defined bayesian filter. New wordings: - a context is a class where different categories are possible. The context name is equal to a facet name. - a category is a facet type within a facet navigation. Each context must have several categories, at least one with a custom name (things you want to discover) and one with the exact name "negative". To use this, you must do the following: - for each context, you must create a directory within DATA/CLASSIFICATION with the name of the context (the facet name) - within each context directory, you must create one text file per category, each containing one document per line. One of these files MUST have the name 'negative.txt'. Then, each new document is classified to match one of the given categories for each context.
10 years ago · df3314ac1a
parent dbbad23e12
commit df3314ac1a
12 changed files with 251 additions and 11 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -258,6 +258,12 @@ surrogates.out = DATA/SURROGATES/out
 # this directory also contains subdirectories for input sources, the did-you-mean function and other
 dictionaries = DATA/DICTIONARIES

+# a path to the classification directory
+# each subdirectory is the name of a context (which becomes a navigator) with '.txt' files
+# containing texts to teach a bayesian filter. One of the files must be named 'negative.txt'.
+# The text files can be created with the Export functionality using the option "Only Text".
+classification = DATA/CLASSIFICATION
+
 # storage place for new releases
 releases = DATA/RELEASE

--- a/htroot/js/yacysearch.js
+++ b/htroot/js/yacysearch.js
@ -52,8 +52,8 @@ function statistics(offset, itemscount, itemsperpage, totalcount, localResourceS
 	  	resnav += "\">&laquo;</a></li>";
 	  }
 	  
-	  numberofpages = Math.floor(Math.min(10, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage)));
-	  if (!numberofpages) numberofpages = 10;
+	  numberofpages = Math.floor(Math.min(9, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage)));
+	  if (!numberofpages) numberofpages = 9;
 	  for (i = 0; i < numberofpages; i++) {
 	      if (i == thispage) {
 	         resnav += "<li class=\"active\"><a href=\"#\">";
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@ -49,6 +49,7 @@ import net.yacy.cora.federate.FederateSearchManager;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.cora.lod.vocabulary.Tagging.Metatag;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
@ -443,7 +444,12 @@ public class yacysearch {
                if (p > 0) {
                    String k = vocabulary.substring(0, p);
                    String v = vocabulary.substring(p + 1);
-                    metatags.add(LibraryProvider.autotagging.metatag(k, v));
+                    Metatag mt = LibraryProvider.autotagging.metatag(k, v);
+                    if (mt != null) {
+                        metatags.add(mt);
+                    } else {
+                        
+                    }
                }
            }

--- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
+++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java
@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import net.yacy.cora.geo.Locations;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.ProbabilisticClassifier;

 /**
 * Autotagging provides a set of tag/print-name properties which can be used to
@ -167,6 +168,12 @@ public class AutotaggingLibrary {

    public Tagging.Metatag metatag(String vocName, String term) {
        Tagging tagging = this.vocabularies.get(vocName);
+        if (tagging == null) { // no vocabulary is registered under this name
+            if (ProbabilisticClassifier.getContextNames().contains(vocName)) { // the name belongs to a classifier context
+                tagging = new Tagging(vocName); // fresh Tagging object, created only to produce the metatag below
+            }
+        }
+        if (tagging == null) return null; // neither a vocabulary nor a classifier context: callers must handle null
        return tagging.getMetatagFromTerm(Tagging.decodeMaskname(term));
    }

--- a/source/net/yacy/cora/lod/vocabulary/Tagging.java
+++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java
@ -90,7 +90,7 @@ public class Tagging {

    }

-    private Tagging(String name) {
+    public Tagging(String name) {
        this.navigatorName = name;
        this.synonym2term = new ConcurrentHashMap<String, String>();
        this.term2synonym = new ConcurrentHashMap<String, String>();
@ -544,6 +544,11 @@ public class Tagging {
        return term;
    }

+    /**
+     * The metatag class contains the object value for a Linked Open Data RDF triple.
+     * The metatag is created in a tagging environment, which already contains the
+     * subject and the predicate. The metatag is the object of the RDF triple.
+     */
 	public class Metatag {
 	    private final String object;
 	    private Metatag(String object) {
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -250,12 +250,17 @@ dc_rights
    /**
     * add the given words to the set of keywords.
     * These keywords will appear in dc_subject
-     * @param tags
+     * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
     */
    protected void addMetatags(Map<String, Set<Tagging.Metatag>> tags) {
        this.generic_facets.putAll(computeGenericFacets(tags));
    }

+    /**
+     * compute generic facets
+     * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags
+     * @return a map where the key is the navigator name and the value is the set of attribute names
+     */
    public static Map<String, Set<String>> computeGenericFacets(Map<String, Set<Tagging.Metatag>> tags) {
        Map<String, Set<String>> gf = new HashMap<String, Set<String>>();
        for (Map.Entry<String, Set<Tagging.Metatag>> e: tags.entrySet()) {
--- a/source/net/yacy/document/ProbabilisticClassifier.java
+++ b/source/net/yacy/document/ProbabilisticClassifier.java
@ -0,0 +1,168 @@
+/**
+ *  ProbabilisticClassifier
+ *  Copyright 2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+ *  first published 06.08.2015 on http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.document;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import net.yacy.cora.bayes.BayesClassifier;
+import net.yacy.cora.bayes.Classification;
+import net.yacy.cora.util.ConcurrentLog;
+
+public class ProbabilisticClassifier {
+
+    public final static String NONE_CATEGORY_NAME = "NONE";
+    public final static Category NONE_CATEGORY = new Category(NONE_CATEGORY_NAME);
+    
+    public static class Category {
+        
+        String category_name;
+        
+        public Category(String category_name) {
+            this.category_name = category_name;
+        }
+        
+        public String getName() {
+            return this.category_name;
+        }
+    }
+    
+    public static class Context {
+
+        private String context_name;
+        private BayesClassifier<String, Category> bayes;
+        
+        public Context(String context_name, Map<String, File> categoryExampleLinesFiles, File negativeExampleLines) throws IOException {
+            this.context_name = context_name;
+            int requiredSize = 0;
+            Map<String, List<String>> categoryBuffer = new HashMap<>();
+            for (Map.Entry<String, File> category: categoryExampleLinesFiles.entrySet()) {
+                List<String> list = Files.readAllLines(category.getValue().toPath());
+                categoryBuffer.put(category.getKey(), list);
+                requiredSize += list.size();
+            }
+            List<String> list = Files.readAllLines(negativeExampleLines.toPath());
+            categoryBuffer.put(NONE_CATEGORY_NAME, Files.readAllLines(negativeExampleLines.toPath()));
+            requiredSize += list.size();
+            
+            this.bayes = new BayesClassifier<>();
+            this.bayes.setMemoryCapacity(requiredSize);
+            
+            for (Map.Entry<String, List<String>> category: categoryBuffer.entrySet()) {
+                Category c = new Category(category.getKey());
+                for (String line: category.getValue()) {
+                    List<String> tokens = normalize(line);
+                    bayes.learn(c, tokens);
+                }
+            }
+            bayes.learn(NONE_CATEGORY, categoryBuffer.get(NONE_CATEGORY_NAME));
+        }
+
+        private List<String> normalize(String phrase) {
+            String cleanphrase = phrase.toLowerCase().replaceAll("\\W", " ");
+            String[] rawtokens = cleanphrase.split("\\s");
+            List<String> tokens = new ArrayList<>();
+            for (String token: rawtokens) if (token.length() > 2) tokens.add(token);
+            return tokens;
+        }
+        
+        public String getName() {
+            return this.context_name;
+        }
+
+        public Classification<String, Category> classify(String phrase) {
+            List<String> words = normalize(phrase);
+            return this.bayes.classify(words);
+        }
+        
+     }
+    
+    private static Map<String, Context> contexts;
+
+    public static Set<String> getContextNames() {
+        return contexts.keySet();
+    }
+    
+    public static Context getContext(String contextName) {
+        return contexts.get(contextName);
+    }
+    
+    /**
+     * create a new classifier set.
+     * @param path_to_context_directory directory containing contexts wich are directories containing .txt files. One of them must be named 'negative.txt'
+     */
+    public static void initialize(File path_to_context_directory) {
+        contexts = new HashMap<>();
+        String[] context_candidates = path_to_context_directory.list();
+        for (String context_candidate: context_candidates) {
+            File ccf = new File(path_to_context_directory, context_candidate);
+            if (!ccf.isDirectory()) continue;
+            String[] category_candidates = ccf.list();
+            
+            Map<String, File> categoryExampleLinesFiles = new HashMap<>();
+            File negativeExampleLines = null;
+            
+            for (String category_candidate: category_candidates) {
+                if (!category_candidate.endsWith(".txt")) continue;
+                File catcf = new File(ccf, category_candidate);
+                if (category_candidate.startsWith("negative")) {
+                    negativeExampleLines = catcf;
+                } else {
+                    categoryExampleLinesFiles.put(category_candidate.substring(0, category_candidate.length() - 4), catcf);
+                }
+            }
+            
+            if (negativeExampleLines != null && categoryExampleLinesFiles.size() > 0) {
+                try {
+                    Context context = new Context(context_candidate, categoryExampleLinesFiles, negativeExampleLines);
+                    contexts.put(context_candidate, context);
+                } catch (IOException e) {
+                    ConcurrentLog.logException(e);
+                }
+            }
+        }
+    }
+    
+    /**
+     * compute the classification of a given text. The result is a map with most probable categorizations for each context.
+     * @param text the text to be classified
+     * @return a map where the key is the navigator name (the bayes context) and the value is the most probable attribute name (the bayes category)
+     */
+    public static Map<String, String> getClassification(String text) {
+        Map<String, String> c = new HashMap<>();
+        for (Context context: contexts.values()) {
+            Classification<String, Category> classification = context.classify(text);
+            String contextname = context.getName();
+            Category category = classification.getCategory();
+            String categoryname = category.getName();
+            c.put(contextname, categoryname);
+        }
+        return c;
+    }
+    
+}
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -153,6 +153,7 @@ import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.Parser.Failure;
@ -242,7 +243,7 @@ public final class Switchboard extends serverSwitch {

    // storage management
    public File htCachePath;
-    public final File dictionariesPath;
+    public final File dictionariesPath, classificationPath;
    public File listsPath;
    public File htDocsPath;
    public File workPath;
@ -374,11 +375,20 @@ public final class Switchboard extends serverSwitch {
        }
        
        this.log.config("Work Path:    " + this.workPath.toString());
+
        this.dictionariesPath =
            getDataPath(
                SwitchboardConstants.DICTIONARY_SOURCE_PATH,
                SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT);
        this.log.config("Dictionaries Path:" + this.dictionariesPath.toString());
+        if (!this.dictionariesPath.exists()) this.dictionariesPath.mkdirs();
+        
+        this.classificationPath =
+                getDataPath(
+                    SwitchboardConstants.CLASSIFICATION_SOURCE_PATH,
+                    SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT);
+            this.log.config("Classification Path:" + this.classificationPath.toString());
+        if (!this.classificationPath.exists()) this.classificationPath.mkdirs();

        CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", false);
        CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true);
@ -397,6 +407,9 @@ public final class Switchboard extends serverSwitch {
                    Tagging t = LibraryProvider.autotagging.getVocabulary(o);
                    if (t != null) t.setFacet(false);
                }
+
+                Thread.currentThread().setName("ProbabilisticClassification.initialize");
+                ProbabilisticClassifier.initialize(Switchboard.this.classificationPath);
            }
        }.start();

--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@ -413,6 +413,9 @@ public final class SwitchboardConstants {

    public static final String DICTIONARY_SOURCE_PATH         = "dictionaries";
    public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES";
+    
+    public static final String CLASSIFICATION_SOURCE_PATH         = "classification";
+    public static final String CLASSIFICATION_SOURCE_PATH_DEFAULT = "DATA/CLASSIFICATION";

    /**
     * <p><code>public static final String <strong>HTDOCS_PATH</strong> = "htDocsPath"</code></p>
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@ -54,6 +54,7 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.document.LibraryProvider;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.Tokenizer;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.data.word.WordReferenceRow;
@ -262,6 +263,9 @@ public final class QueryParams {
                this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
            }
        }
+        for (String context: ProbabilisticClassifier.getContextNames()) {
+            this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+        }
        this.cachedQuery = null;
    }

--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@ -36,6 +36,7 @@ import java.util.LinkedHashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.SortedSet;
 import java.util.TreeMap;
@ -72,6 +73,7 @@ import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
 import net.yacy.document.LargeNumberCache;
 import net.yacy.document.LibraryProvider;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.TextParser;
 import net.yacy.document.Tokenizer;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
@ -882,13 +884,16 @@ public final class SearchEvent {
        }
        
        // get the vocabulary navigation
-        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) {
-            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
+        Set<String> genericFacets = new LinkedHashSet<>();
+        for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName());
+        genericFacets.addAll(ProbabilisticClassifier.getContextNames());
+        for (String v: genericFacets) {
+            fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v + CollectionSchema.VOCABULARY_TERMS_SUFFIX);
            if (fcts != null) {
-                ScoreMap<String> vocNav = this.vocabularyNavigator.get(v.getName());
+                ScoreMap<String> vocNav = this.vocabularyNavigator.get(v);
                if (vocNav == null) {
                    vocNav = new ConcurrentScoreMap<String>();
-                    this.vocabularyNavigator.put(v.getName(), vocNav);
+                    this.vocabularyNavigator.put(v, vocNav);
                }
                vocNav.inc(fcts);
            }
@ -1242,7 +1247,7 @@ public final class SearchEvent {
            
          
            // check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field}
-            // TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL)
+            // TODO: vocabulary is only valid and available in local Solr index (consider to auto-switch to Searchdom.LOCAL)
            if (this.query.metatags != null && !this.query.metatags.isEmpty()) {
                tagloop: for (Tagging.Metatag tag : this.query.metatags) {
                    SolrDocument sdoc = page;
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -81,6 +81,7 @@ import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
+import net.yacy.document.ProbabilisticClassifier;
 import net.yacy.document.SentenceReader;
 import net.yacy.document.Tokenizer;
 import net.yacy.document.content.DCEntry;
@ -1006,6 +1007,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        return doc;
    }
    
+    /**
+     * attach additional information to the document to enable navigation features
+     * @param doc the document to be enriched
+     * @param synonyms a list of synonyms detected for the text content
+     * @param genericFacets a map where the key is the navigator name and the value is the set of attribute names
+     */
    public void enrich(SolrInputDocument doc, List<String> synonyms, Map<String, Set<String>> genericFacets) {
        remove(doc, CollectionSchema.vocabularies_sxt); // delete old values
        for (SolrInputField sif: doc) {
@ -1016,6 +1023,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
            // there are no pre-defined solr fields for navigation because the vocabulary is generic
            // we use dynamically allocated solr fields for this.
            // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
+            
+            // add to genericFacets the probabilistic categories
+            String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+            Map<String, String> classification = ProbabilisticClassifier.getClassification(text);
+            for (Map.Entry<String, String> entry: classification.entrySet()) {
+                Set<String> facetAttrbutes = new HashSet<>();
+                facetAttrbutes.add(entry.getValue());
+                genericFacets.put(entry.getKey(), facetAttrbutes);
+            }
+            
+            // compute the document field values
            List<String> vocabularies = new ArrayList<>();
            for (Map.Entry<String, Set<String>> facet: genericFacets.entrySet()) {
                String facetName = facet.getKey();