added bayes filter from Philipp Nolte, originally taken from

https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
and modified inside the loklak.org project. After optimization in loklak
it was inserted into the net.yacy.cora.bayes package. It shall be used
to create custom search navigation filters.

The original copyright notice was copied from the README.md from
https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
The original package domain was
de.daslaboratorium.machinelearning.classifier
pull/9/merge
Michael Peter Christen 10 years ago
parent 1bced1ae60
commit 1ccbf739b1

@ -0,0 +1,154 @@
/*
* The MIT License (MIT)
* ------------------
*
* Copyright (c) 2012-2014 Philipp Nolte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* This software was taken from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
* and inserted into the loklak class hierarchy to be enhanced and extended
* by @0rb1t3r. After optimization in loklak it was inserted into the net.yacy.cora.bayes
* package. It shall be used to create custom search navigation filters.
* The original copyright notice was copied from the README.md
* from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
* The original package domain was de.daslaboratorium.machinelearning.classifier
*/
package net.yacy.cora.bayes;
import java.util.Collection;
import java.util.Comparator;
import java.util.SortedSet;
import java.util.TreeSet;
/**
 * A concrete implementation of the abstract Classifier class. The Bayes
 * classifier implements a naive Bayes approach to classifying a given set of
 * features: classify(feat1,...,featN) = argmax(P(cat)*PROD(P(featI|cat))
 *
 * @author Philipp Nolte
 *
 * @see <a href="http://en.wikipedia.org/wiki/Naive_Bayes_classifier">Naive Bayes classifier</a>
 *
 * @param <T> The feature class.
 * @param <K> The category class.
 */
public class BayesClassifier<T, K> extends Classifier<T, K> {

    /**
     * Calculates the product of all feature probabilities: PROD(P(featI|cat).
     *
     * NOTE(review): a straight product of many small float probabilities can
     * underflow to 0; summing logarithms would be numerically safer, but the
     * plain product is kept for compatibility with the original behavior.
     *
     * @param features The set of features to use.
     * @param category The category to test for.
     * @return The product of all feature probabilities.
     */
    private float featuresProbabilityProduct(Collection<T> features,
            K category) {
        float product = 1.0f;
        for (T feature : features)
            product *= this.featureWeighedAverage(feature, category);
        return product;
    }

    /**
     * Calculates the probability that the features can be classified as the
     * category given: P(cat) * PROD(P(featI|cat)).
     *
     * @param features The set of features to use.
     * @param category The category to test for.
     * @return The probability that the features can be classified as the
     *         category.
     */
    private float categoryProbability(Collection<T> features, K category) {
        return ((float) this.categoryCount(category)
                / (float) this.getCategoriesTotal())
                * featuresProbabilityProduct(features, category);
    }

    /**
     * Retrieves a sorted <code>Set</code> of probabilities that the given set
     * of features is classified as the available categories. The set is sorted
     * ascending by probability, so the most probable category is last().
     *
     * @param features The set of features to use.
     * @return A sorted <code>Set</code> of category-probability-entries.
     */
    private SortedSet<Classification<T, K>> categoryProbabilities(
            Collection<T> features) {

        /*
         * Sort the set according to the possibilities. Because we have to sort
         * by the mapped value and not by the mapped key, we can not use a
         * sorted tree (TreeMap) and we have to use a set-entry approach to
         * achieve the desired functionality. A custom comparator is therefore
         * needed.
         */
        SortedSet<Classification<T, K>> probabilities =
                new TreeSet<Classification<T, K>>(
                        new Comparator<Classification<T, K>>() {

                            @Override
                            public int compare(Classification<T, K> o1,
                                    Classification<T, K> o2) {
                                int toReturn = Float.compare(
                                        o1.getProbability(), o2.getProbability());
                                if (toReturn == 0
                                        && !o1.getCategory().equals(o2.getCategory())) {
                                    /*
                                     * Bugfix: the original code returned a constant -1
                                     * here, which violates the Comparator contract
                                     * (compare(a,b) and compare(b,a) were both negative)
                                     * and leaves the TreeSet behavior undefined. Break
                                     * ties deterministically on the category's string
                                     * form instead; fall back to identity hash codes for
                                     * distinct categories that print identically, so
                                     * both categories are still kept in the set.
                                     */
                                    toReturn = String.valueOf(o1.getCategory())
                                            .compareTo(String.valueOf(o2.getCategory()));
                                    if (toReturn == 0)
                                        toReturn = Integer.compare(
                                                System.identityHashCode(o1.getCategory()),
                                                System.identityHashCode(o2.getCategory()));
                                }
                                return toReturn;
                            }
                        });

        for (K category : this.getCategories())
            probabilities.add(new Classification<T, K>(
                    features, category,
                    this.categoryProbability(features, category)));
        return probabilities;
    }

    /**
     * Classifies the given set of features.
     *
     * @param features The set of features to classify.
     * @return The category the set of features is classified as, or
     *         <code>null</code> if no category is known yet.
     */
    @Override
    public Classification<T, K> classify(Collection<T> features) {
        final SortedSet<Classification<T, K>> probabilities =
                this.categoryProbabilities(features);
        // the last entry of the ascending set is the most probable category
        if (!probabilities.isEmpty()) {
            return probabilities.last();
        }
        return null;
    }

    /**
     * Classifies the given set of features and returns the full details of the
     * classification.
     *
     * @param features The set of features to classify.
     * @return The set of categories the set of features is classified as,
     *         sorted ascending by probability.
     */
    public Collection<Classification<T, K>> classifyDetailed(
            Collection<T> features) {
        return this.categoryProbabilities(features);
    }
}

@ -0,0 +1,127 @@
/*
* The MIT License (MIT)
* ------------------
*
* Copyright (c) 2012-2014 Philipp Nolte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* This software was taken from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
* and inserted into the loklak class hierarchy to be enhanced and extended
* by @0rb1t3r. After optimization in loklak it was inserted into the net.yacy.cora.bayes
* package. It shall be used to create custom search navigation filters.
* The original copyright notice was copied from the README.md
* from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
* The original package domain was de.daslaboratorium.machinelearning.classifier
*/
package net.yacy.cora.bayes;
import java.util.Collection;
/**
 * A simple value holder describing one classification result: the set of
 * features that was classified, the category it was assigned to, and the
 * probability of that assignment.
 *
 * @author Philipp Nolte
 *
 * @param <T> The feature class.
 * @param <K> The category class.
 */
public class Classification<T, K> {

    /** The featureset that was classified. */
    private final Collection<T> featureset;

    /** The category the featureset was assigned to. */
    private final K category;

    /** The probability that the featureset belongs to the category. */
    private final float probability;

    /**
     * Creates a classification with a default probability of 1.
     *
     * @param featureset The featureset.
     * @param category The category.
     */
    public Classification(Collection<T> featureset, K category) {
        this(featureset, category, 1.0f);
    }

    /**
     * Creates a classification from the given values.
     *
     * @param featureset The featureset.
     * @param category The category.
     * @param probability The probability.
     */
    public Classification(Collection<T> featureset, K category, float probability) {
        this.featureset = featureset;
        this.category = category;
        this.probability = probability;
    }

    /**
     * Returns the featureset that was classified.
     *
     * @return The featureset.
     */
    public Collection<T> getFeatureset() {
        return this.featureset;
    }

    /**
     * Returns the probability of this classification.
     *
     * @return The probability.
     */
    public float getProbability() {
        return this.probability;
    }

    /**
     * Returns the category the featureset was classified as.
     *
     * @return The category.
     */
    public K getCategory() {
        return this.category;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public String toString() {
        return new StringBuilder("Classification [category=")
                .append(this.category)
                .append(", probability=").append(this.probability)
                .append(", featureset=").append(this.featureset)
                .append("]")
                .toString();
    }
}

@ -0,0 +1,437 @@
/*
* The MIT License (MIT)
* ------------------
*
* Copyright (c) 2012-2014 Philipp Nolte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/*
* This software was taken from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier
* and inserted into the loklak class hierarchy to be enhanced and extended
* by @0rb1t3r. After optimization in loklak it was inserted into the net.yacy.cora.bayes
* package. It shall be used to create custom search navigation filters.
* The original copyright notice was copied from the README.md
* from https://github.com/ptnplanet/Java-Naive-Bayes-Classifier/blob/master/README.md
* The original package domain was de.daslaboratorium.machinelearning.classifier
*/
package net.yacy.cora.bayes;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Abstract base extended by any concrete classifier. It implements the basic
 * functionality for storing categories or features and can be used to calculate
 * basic probabilities - both category and feature probabilities. The classify
 * function has to be implemented by the concrete classifier class.
 *
 * The classifier keeps a bounded "memory" of the classifications it was
 * trained with; once the memory capacity is exceeded, the oldest
 * classification is forgotten again and its counts are decremented.
 *
 * NOTE(review): although the dictionaries are ConcurrentHashMaps, the
 * get-then-put sequences used to update the counters are not atomic, so
 * fully concurrent training can lose updates - confirm whether callers
 * train from a single thread.
 *
 * @author Philipp Nolte
 *
 * @param <T> A feature class
 * @param <K> A category class
 */
public abstract class Classifier<T, K> {

    /**
     * Initial capacity of category dictionaries.
     */
    private static final int INITIAL_CATEGORY_DICTIONARY_CAPACITY = 16;

    /**
     * Initial capacity of feature dictionaries. It should be quite big, because
     * the features will quickly outnumber the categories.
     */
    private static final int INITIAL_FEATURE_DICTIONARY_CAPACITY = 32;

    /**
     * The initial memory capacity or how many classifications are memorized.
     */
    private int memoryCapacity = 1000;

    /**
     * A dictionary mapping features to their number of occurrences in each
     * known category.
     */
    private Map<K, Map<T, Integer>> featureCountPerCategory;

    /**
     * A dictionary mapping features to their number of occurrences.
     */
    private Map<T, Integer> totalFeatureCount;

    /**
     * A dictionary mapping categories to their number of occurrences.
     */
    private Map<K, Integer> totalCategoryCount;

    /**
     * The classifier's memory. It will forget old classifications as soon as
     * they become too old.
     */
    private Queue<Classification<T, K>> memoryQueue;

    /**
     * Constructs a new classifier without any trained knowledge.
     */
    public Classifier() {
        this.reset();
    }

    /**
     * Resets the <i>learned</i> feature and category counts.
     */
    public void reset() {
        this.featureCountPerCategory =
                new ConcurrentHashMap<K, Map<T, Integer>>(
                        Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
        this.totalFeatureCount =
                new ConcurrentHashMap<T, Integer>(
                        Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY);
        this.totalCategoryCount =
                new ConcurrentHashMap<K, Integer>(
                        Classifier.INITIAL_CATEGORY_DICTIONARY_CAPACITY);
        this.memoryQueue = new LinkedList<Classification<T, K>>();
    }

    /**
     * Returns a <code>Set</code> of features the classifier knows about.
     *
     * @return The <code>Set</code> of features the classifier knows about.
     */
    public Set<T> getFeatures() {
        return this.totalFeatureCount.keySet();
    }

    /**
     * Returns a <code>Set</code> of categories the classifier knows about.
     *
     * @return The <code>Set</code> of categories the classifier knows about.
     */
    public Set<K> getCategories() {
        return this.totalCategoryCount.keySet();
    }

    /**
     * Retrieves the total number of categories the classifier knows about,
     * i.e. the sum of all category occurrence counts.
     *
     * @return The total category count.
     */
    public int getCategoriesTotal() {
        int toReturn = 0;
        for (Integer c : this.totalCategoryCount.values()) {
            toReturn += c;
        }
        return toReturn;
    }

    /**
     * Retrieves the memory's capacity.
     *
     * @return The memory's capacity.
     */
    public int getMemoryCapacity() {
        return this.memoryCapacity;
    }

    /**
     * Sets the memory's capacity. If the new value is less than the old
     * value, the memory will be truncated accordingly.
     *
     * @param memoryCapacity The new memory capacity.
     */
    public void setMemoryCapacity(int memoryCapacity) {
        // Improvement: poll only while the queue actually exceeds the new
        // capacity. The original polled (oldCapacity - newCapacity) times
        // regardless of the queue's real size, wasting calls on null polls.
        // Note that entries dropped here are NOT un-learned (counts stay),
        // which mirrors the original behavior.
        while (this.memoryQueue.size() > memoryCapacity) {
            this.memoryQueue.poll();
        }
        this.memoryCapacity = memoryCapacity;
    }

    /**
     * Increments the count of a given feature in the given category. This is
     * equal to telling the classifier, that this feature has occurred in this
     * category.
     *
     * @param feature The feature, which count to increase.
     * @param category The category the feature occurred in.
     */
    public void incrementFeature(T feature, K category) {
        Map<T, Integer> features =
                this.featureCountPerCategory.get(category);
        if (features == null) {
            // Improvement: keep the freshly created map instead of the
            // original's put-then-get-again round trip.
            features = new ConcurrentHashMap<T, Integer>(
                    Classifier.INITIAL_FEATURE_DICTIONARY_CAPACITY);
            this.featureCountPerCategory.put(category, features);
        }
        // read-modify-write; not atomic under concurrent training (see class doc)
        final Integer count = features.get(feature);
        features.put(feature, count == null ? 1 : count + 1);

        final Integer totalCount = this.totalFeatureCount.get(feature);
        this.totalFeatureCount.put(feature, totalCount == null ? 1 : totalCount + 1);
    }

    /**
     * Increments the count of a given category. This is equal to telling the
     * classifier, that this category has occurred once more.
     *
     * @param category The category, which count to increase.
     */
    public void incrementCategory(K category) {
        final Integer count = this.totalCategoryCount.get(category);
        this.totalCategoryCount.put(category, count == null ? 1 : count + 1);
    }

    /**
     * Decrements the count of a given feature in the given category. This is
     * equal to telling the classifier that this feature was classified once in
     * the category.
     *
     * @param feature The feature to decrement the count for.
     * @param category The category.
     */
    public void decrementFeature(T feature, K category) {
        final Map<T, Integer> features =
                this.featureCountPerCategory.get(category);
        if (features == null) {
            return;
        }
        final Integer count = features.get(feature);
        if (count == null) {
            return;
        }
        if (count.intValue() == 1) {
            // last occurrence of the feature in this category: drop the entry
            // and the whole category map if it became empty
            features.remove(feature);
            if (features.isEmpty()) {
                this.featureCountPerCategory.remove(category);
            }
        } else {
            features.put(feature, count - 1);
        }

        final Integer totalCount = this.totalFeatureCount.get(feature);
        if (totalCount == null) {
            return;
        }
        if (totalCount.intValue() == 1) {
            this.totalFeatureCount.remove(feature);
        } else {
            this.totalFeatureCount.put(feature, totalCount - 1);
        }
    }

    /**
     * Decrements the count of a given category. This is equal to telling the
     * classifier, that this category has occurred once less.
     *
     * @param category The category, which count to decrease.
     */
    public void decrementCategory(K category) {
        final Integer count = this.totalCategoryCount.get(category);
        if (count == null) {
            return;
        }
        if (count.intValue() == 1) {
            this.totalCategoryCount.remove(category);
        } else {
            this.totalCategoryCount.put(category, count - 1);
        }
    }

    /**
     * Retrieves the number of occurrences of the given feature in the given
     * category.
     *
     * @param feature The feature, which count to retrieve.
     * @param category The category, which the feature occurred in.
     * @return The number of occurrences of the feature in the category.
     */
    public int featureCount(T feature, K category) {
        final Map<T, Integer> features =
                this.featureCountPerCategory.get(category);
        if (features == null)
            return 0;
        final Integer count = features.get(feature);
        return (count == null) ? 0 : count.intValue();
    }

    /**
     * Retrieves the number of occurrences of the given category.
     *
     * @param category The category, which count should be retrieved.
     * @return The number of occurrences.
     */
    public int categoryCount(K category) {
        final Integer count = this.totalCategoryCount.get(category);
        return (count == null) ? 0 : count.intValue();
    }

    /**
     * Retrieves the plain probability <code>P(feature|category)</code>: the
     * number of occurrences of the feature in the category divided by the
     * category's occurrence count.
     *
     * (Doc fix: the original carried a bogus {@code @inheritDoc} although no
     * supertype declares this method.)
     *
     * @param feature The feature.
     * @param category The category.
     * @return The probability, or 0 if the category was never seen.
     */
    public float featureProbability(T feature, K category) {
        if (this.categoryCount(category) == 0)
            return 0;
        return (float) this.featureCount(feature, category)
                / (float) this.categoryCount(category);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * overall weight of <code>1.0</code> and an assumed probability of
     * <code>0.5</code>. The probability defaults to the overall feature
     * probability.
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category) {
        return this.featureWeighedAverage(feature, category, null, 1.0f, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * overall weight of <code>1.0</code>, an assumed probability of
     * <code>0.5</code> and the given object to use for probability calculation.
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator) {
        return this.featureWeighedAverage(feature, category,
                calculator, 1.0f, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * the given weight and an assumed probability of <code>0.5</code> and the
     * given object to use for probability calculation.
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @param weight The feature weight.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator, float weight) {
        return this.featureWeighedAverage(feature, category,
                calculator, weight, 0.5f);
    }

    /**
     * Retrieves the weighed average <code>P(feature|category)</code> with
     * the given weight, the given assumed probability and the given object to
     * use for probability calculation.
     *
     * @param feature The feature, which probability to calculate.
     * @param category The category.
     * @param calculator The calculating object.
     * @param weight The feature weight.
     * @param assumedProbability The assumed probability.
     * @return The weighed average probability.
     */
    public float featureWeighedAverage(T feature, K category, Classifier<T, K> calculator, float weight, float assumedProbability) {
        /*
         * use the given calculating object or the default method to calculate
         * the probability that the given feature occurred in the given
         * category.
         */
        final float basicProbability =
                (calculator == null)
                        ? this.featureProbability(feature, category)
                        : calculator.featureProbability(feature, category);

        Integer totals = this.totalFeatureCount.get(feature);
        if (totals == null)
            totals = 0;
        return (weight * assumedProbability + totals * basicProbability)
                / (weight + totals);
    }

    /**
     * Train the classifier by telling it that the given features resulted in
     * the given category.
     *
     * @param category The category the features belong to.
     * @param features The features that resulted in the given category.
     */
    public void learn(K category, Collection<T> features) {
        this.learn(new Classification<T, K>(features, category));
    }

    /**
     * Train the classifier by telling it that the given features resulted in
     * the given category. If the memory capacity is exceeded afterwards, the
     * oldest memorized classification is forgotten (un-learned) again.
     *
     * @param classification The classification to learn.
     */
    public void learn(Classification<T, K> classification) {
        for (T feature : classification.getFeatureset())
            this.incrementFeature(feature, classification.getCategory());
        this.incrementCategory(classification.getCategory());

        this.memoryQueue.offer(classification);
        if (this.memoryQueue.size() > this.memoryCapacity) {
            final Classification<T, K> toForget = this.memoryQueue.remove();
            for (T feature : toForget.getFeatureset())
                this.decrementFeature(feature, toForget.getCategory());
            this.decrementCategory(toForget.getCategory());
        }
    }

    /**
     * The classify method. It will retrieve the most likely category for the
     * features given and depends on the concrete classifier implementation.
     *
     * @param features The features to classify.
     * @return The category most likely.
     */
    public abstract Classification<T, K> classify(Collection<T> features);
}
Loading…
Cancel
Save