From df3314ac1a15f1f78f1e9c3601b85ac28c42915d Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 10 Aug 2015 14:27:44 +0200 Subject: [PATCH] added a new facet type based on a probabilistic classifier using Bayesian filters. This can be used to classify documents at indexing time using a pre-defined Bayesian filter. New terminology: - a context is a class where different categories are possible. The context name is equal to a facet name. - a category is a facet type within a facet navigation. Each context must have several categories: at least one with a custom name (things you want to discover) and one with the exact name "negative". To use this, you must do the following: - for each context, you must create a directory within DATA/CLASSIFICATION with the name of the context (the facet name) - within each context directory, you must create one text file per category, each containing one document per line. One of these files MUST be named 'negative.txt'. Then, each new document is classified to match within one of the given categories for each context. 
--- defaults/yacy.init | 6 + htroot/js/yacysearch.js | 4 +- htroot/yacysearch.java | 8 +- .../language/synonyms/AutotaggingLibrary.java | 7 + .../net/yacy/cora/lod/vocabulary/Tagging.java | 7 +- source/net/yacy/document/Document.java | 7 +- .../document/ProbabilisticClassifier.java | 168 ++++++++++++++++++ source/net/yacy/search/Switchboard.java | 15 +- .../net/yacy/search/SwitchboardConstants.java | 3 + source/net/yacy/search/query/QueryParams.java | 4 + source/net/yacy/search/query/SearchEvent.java | 15 +- .../schema/CollectionConfiguration.java | 18 ++ 12 files changed, 251 insertions(+), 11 deletions(-) create mode 100644 source/net/yacy/document/ProbabilisticClassifier.java diff --git a/defaults/yacy.init b/defaults/yacy.init index a238d2ed4..352f1b8c6 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -258,6 +258,12 @@ surrogates.out = DATA/SURROGATES/out # this directory also contains subdirectories for input sources, the did-you-mean function and other dictionaries = DATA/DICTIONARIES +# a path to the classification directory +# each subdirectory is the name of a context (which becomes a navigator) with '.txt' files +# containing texts to teach a bayesian filter. One of the files must be named 'negative.txt'. +# The text files can be created with the Export functionality using the option "Only Text". 
+classification = DATA/CLASSIFICATION + # storage place for new releases releases = DATA/RELEASE diff --git a/htroot/js/yacysearch.js b/htroot/js/yacysearch.js index c3994e009..413eabf58 100644 --- a/htroot/js/yacysearch.js +++ b/htroot/js/yacysearch.js @@ -52,8 +52,8 @@ function statistics(offset, itemscount, itemsperpage, totalcount, localResourceS resnav += "\">«"; } - numberofpages = Math.floor(Math.min(10, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage))); - if (!numberofpages) numberofpages = 10; + numberofpages = Math.floor(Math.min(9, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage))); + if (!numberofpages) numberofpages = 9; for (i = 0; i < numberofpages; i++) { if (i == thispage) { resnav += "
  • "; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 3706113bb..fee284c8d 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -49,6 +49,7 @@ import net.yacy.cora.federate.FederateSearchManager; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.cora.lod.vocabulary.Tagging.Metatag; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; @@ -443,7 +444,12 @@ public class yacysearch { if (p > 0) { String k = vocabulary.substring(0, p); String v = vocabulary.substring(p + 1); - metatags.add(LibraryProvider.autotagging.metatag(k, v)); + Metatag mt = LibraryProvider.autotagging.metatag(k, v); + if (mt != null) { + metatags.add(mt); + } else { + + } } } diff --git a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java index 54197e6bb..4cf70ce5c 100644 --- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java +++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java @@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.geo.Locations; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.document.ProbabilisticClassifier; /** * Autotagging provides a set of tag/print-name properties which can be used to @@ -167,6 +168,12 @@ public class AutotaggingLibrary { public Tagging.Metatag metatag(String vocName, String term) { Tagging tagging = this.vocabularies.get(vocName); + if (tagging == null) { + if (ProbabilisticClassifier.getContextNames().contains(vocName)) { + tagging = new Tagging(vocName); + } + } + if (tagging == null) return null; return tagging.getMetatagFromTerm(Tagging.decodeMaskname(term)); } diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java 
b/source/net/yacy/cora/lod/vocabulary/Tagging.java index f642ca6f3..31fbc9461 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -90,7 +90,7 @@ public class Tagging { } - private Tagging(String name) { + public Tagging(String name) { this.navigatorName = name; this.synonym2term = new ConcurrentHashMap(); this.term2synonym = new ConcurrentHashMap(); @@ -544,6 +544,11 @@ public class Tagging { return term; } + /** + * The metatag class contains the object value for a Linked Open Data RDF triple. + * The metatag is created in a tagging environment, which already contains the + * subject and the predicate. The metatag is the object of the RDF triple. + */ public class Metatag { private final String object; private Metatag(String object) { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index f72ea5890..ad50e16ef 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -250,12 +250,17 @@ dc_rights /** * add the given words to the set of keywords. 
* These keywords will appear in dc_subject - * @param tags + * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags */ protected void addMetatags(Map> tags) { this.generic_facets.putAll(computeGenericFacets(tags)); } + /** + * compute generic facets + * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags + * @return a map where the key is the navigator name and the value is the set of attributes names + */ public static Map> computeGenericFacets(Map> tags) { Map> gf = new HashMap>(); for (Map.Entry> e: tags.entrySet()) { diff --git a/source/net/yacy/document/ProbabilisticClassifier.java b/source/net/yacy/document/ProbabilisticClassifier.java new file mode 100644 index 000000000..b729a311c --- /dev/null +++ b/source/net/yacy/document/ProbabilisticClassifier.java @@ -0,0 +1,168 @@ +/** + * ProbabilisticClassifier + * Copyright 2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * first published 06.08.2015 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + + +package net.yacy.document; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import net.yacy.cora.bayes.BayesClassifier; +import net.yacy.cora.bayes.Classification; +import net.yacy.cora.util.ConcurrentLog; + +public class ProbabilisticClassifier { + + public final static String NONE_CATEGORY_NAME = "NONE"; + public final static Category NONE_CATEGORY = new Category(NONE_CATEGORY_NAME); + + public static class Category { + + String category_name; + + public Category(String category_name) { + this.category_name = category_name; + } + + public String getName() { + return this.category_name; + } + } + + public static class Context { + + private String context_name; + private BayesClassifier bayes; + + public Context(String context_name, Map categoryExampleLinesFiles, File negativeExampleLines) throws IOException { + this.context_name = context_name; + int requiredSize = 0; + Map> categoryBuffer = new HashMap<>(); + for (Map.Entry category: categoryExampleLinesFiles.entrySet()) { + List list = Files.readAllLines(category.getValue().toPath()); + categoryBuffer.put(category.getKey(), list); + requiredSize += list.size(); + } + List list = Files.readAllLines(negativeExampleLines.toPath()); + categoryBuffer.put(NONE_CATEGORY_NAME, Files.readAllLines(negativeExampleLines.toPath())); + requiredSize += list.size(); + + this.bayes = new BayesClassifier<>(); + this.bayes.setMemoryCapacity(requiredSize); + + for (Map.Entry> category: categoryBuffer.entrySet()) { + Category c = new Category(category.getKey()); + for (String line: category.getValue()) { + List tokens = normalize(line); + bayes.learn(c, tokens); + } + } + bayes.learn(NONE_CATEGORY, categoryBuffer.get(NONE_CATEGORY_NAME)); + } + + private List normalize(String phrase) { + String cleanphrase = phrase.toLowerCase().replaceAll("\\W", " "); + String[] 
rawtokens = cleanphrase.split("\\s"); + List tokens = new ArrayList<>(); + for (String token: rawtokens) if (token.length() > 2) tokens.add(token); + return tokens; + } + + public String getName() { + return this.context_name; + } + + public Classification classify(String phrase) { + List words = normalize(phrase); + return this.bayes.classify(words); + } + + } + + private static Map contexts; + + public static Set getContextNames() { + return contexts.keySet(); + } + + public static Context getContext(String contextName) { + return contexts.get(contextName); + } + + /** + * create a new classifier set. + * @param path_to_context_directory directory containing contexts wich are directories containing .txt files. One of them must be named 'negative.txt' + */ + public static void initialize(File path_to_context_directory) { + contexts = new HashMap<>(); + String[] context_candidates = path_to_context_directory.list(); + for (String context_candidate: context_candidates) { + File ccf = new File(path_to_context_directory, context_candidate); + if (!ccf.isDirectory()) continue; + String[] category_candidates = ccf.list(); + + Map categoryExampleLinesFiles = new HashMap<>(); + File negativeExampleLines = null; + + for (String category_candidate: category_candidates) { + if (!category_candidate.endsWith(".txt")) continue; + File catcf = new File(ccf, category_candidate); + if (category_candidate.startsWith("negative")) { + negativeExampleLines = catcf; + } else { + categoryExampleLinesFiles.put(category_candidate.substring(0, category_candidate.length() - 4), catcf); + } + } + + if (negativeExampleLines != null && categoryExampleLinesFiles.size() > 0) { + try { + Context context = new Context(context_candidate, categoryExampleLinesFiles, negativeExampleLines); + contexts.put(context_candidate, context); + } catch (IOException e) { + ConcurrentLog.logException(e); + } + } + } + } + + /** + * compute the classification of a given text. 
The result is a map with most probable categorizations for each context. + * @param text the text to be classified + * @return a map where the key is the navigator name (the bayes context) and the value is the most probable attribute name (the bayes category) + */ + public static Map getClassification(String text) { + Map c = new HashMap<>(); + for (Context context: contexts.values()) { + Classification classification = context.classify(text); + String contextname = context.getName(); + Category category = classification.getCategory(); + String categoryname = category.getName(); + c.put(contextname, categoryname); + } + return c; + } + +} diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 5a8480935..a61bc4ae1 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -153,6 +153,7 @@ import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.document.Parser.Failure; @@ -242,7 +243,7 @@ public final class Switchboard extends serverSwitch { // storage management public File htCachePath; - public final File dictionariesPath; + public final File dictionariesPath, classificationPath; public File listsPath; public File htDocsPath; public File workPath; @@ -374,11 +375,20 @@ public final class Switchboard extends serverSwitch { } this.log.config("Work Path: " + this.workPath.toString()); + this.dictionariesPath = getDataPath( SwitchboardConstants.DICTIONARY_SOURCE_PATH, SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT); this.log.config("Dictionaries Path:" + this.dictionariesPath.toString()); + if (!this.dictionariesPath.exists()) this.dictionariesPath.mkdirs(); + + this.classificationPath = + getDataPath( + 
SwitchboardConstants.CLASSIFICATION_SOURCE_PATH, + SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT); + this.log.config("Classification Path:" + this.classificationPath.toString()); + if (!this.classificationPath.exists()) this.classificationPath.mkdirs(); CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", false); CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true); @@ -397,6 +407,9 @@ public final class Switchboard extends serverSwitch { Tagging t = LibraryProvider.autotagging.getVocabulary(o); if (t != null) t.setFacet(false); } + + Thread.currentThread().setName("ProbabilisticClassification.initialize"); + ProbabilisticClassifier.initialize(Switchboard.this.classificationPath); } }.start(); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 82295624f..32cc6c916 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -413,6 +413,9 @@ public final class SwitchboardConstants { public static final String DICTIONARY_SOURCE_PATH = "dictionaries"; public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES"; + + public static final String CLASSIFICATION_SOURCE_PATH = "classification"; + public static final String CLASSIFICATION_SOURCE_PATH_DEFAULT = "DATA/CLASSIFICATION"; /** *

    public static final String HTDOCS_PATH = "htDocsPath"

    diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 28316350a..1adafe904 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -54,6 +54,7 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.LibraryProvider; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.Tokenizer; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -262,6 +263,9 @@ public final class QueryParams { this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); } } + for (String context: ProbabilisticClassifier.getContextNames()) { + this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + } this.cachedQuery = null; } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 70bf0b998..cdcec498d 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -36,6 +36,7 @@ import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; @@ -72,6 +73,7 @@ import net.yacy.crawler.retrieval.Response; import net.yacy.data.WorkTables; import net.yacy.document.LargeNumberCache; import net.yacy.document.LibraryProvider; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.TextParser; import net.yacy.document.Tokenizer; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -882,13 +884,16 @@ public final class SearchEvent { } // get the vocabulary navigation - for (Tagging v: LibraryProvider.autotagging.getVocabularies()) { - 
fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + Set genericFacets = new LinkedHashSet<>(); + for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName()); + genericFacets.addAll(ProbabilisticClassifier.getContextNames()); + for (String v: genericFacets) { + fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v + CollectionSchema.VOCABULARY_TERMS_SUFFIX); if (fcts != null) { - ScoreMap vocNav = this.vocabularyNavigator.get(v.getName()); + ScoreMap vocNav = this.vocabularyNavigator.get(v); if (vocNav == null) { vocNav = new ConcurrentScoreMap(); - this.vocabularyNavigator.put(v.getName(), vocNav); + this.vocabularyNavigator.put(v, vocNav); } vocNav.inc(fcts); } @@ -1242,7 +1247,7 @@ public final class SearchEvent { // check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field} - // TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL) + // TODO: vocabulary is only valid and available in local Solr index (consider to auto-switch to Searchdom.LOCAL) if (this.query.metatags != null && !this.query.metatags.isEmpty()) { tagloop: for (Tagging.Metatag tag : this.query.metatags) { SolrDocument sdoc = page; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 7211a363a..07471e14e 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -81,6 +81,7 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.SentenceReader; import net.yacy.document.Tokenizer; import net.yacy.document.content.DCEntry; @@ -1006,6 +1007,12 @@ public class 
CollectionConfiguration extends SchemaConfiguration implements Seri return doc; } + /** + * attach additional information to the document to enable navigation features + * @param doc the document to be enriched + * @param synonyms a list of synonyms detected for the text content + * @param genericFacets a map where the key is the navigator name and the value is the set of attributes names + */ public void enrich(SolrInputDocument doc, List synonyms, Map> genericFacets) { remove(doc, CollectionSchema.vocabularies_sxt); // delete old values for (SolrInputField sif: doc) { @@ -1016,6 +1023,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // there are no pre-defined solr fields for navigation because the vocabulary is generic // we use dynamically allocated solr fields for this. // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names + + // add to genericFacets the probabilistic categories + String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); + Map classification = ProbabilisticClassifier.getClassification(text); + for (Map.Entry entry: classification.entrySet()) { + Set facetAttrbutes = new HashSet<>(); + facetAttrbutes.add(entry.getValue()); + genericFacets.put(entry.getKey(), facetAttrbutes); + } + + // compute the document field values List vocabularies = new ArrayList<>(); for (Map.Entry> facet: genericFacets.entrySet()) { String facetName = facet.getKey();