bayesian filters. This can be used to classify documents at indexing time using a predefined bayesian filter. New wordings: - a context is a class where different categories are possible. The context name is equal to a facet name. - a category is a facet type within a facet navigation. Each context must have several categories: at least one with a custom name (the things you want to discover) and one with the exact name "negative". To use this, you must do the following: - for each context, create a directory within DATA/CLASSIFICATION with the name of the context (the facet name) - within each context directory, create one text file per category, containing one example document per line. One of these files MUST have the name 'negative.txt'. Then, for each context, every new document is classified into one of the given categories. pull/12/head
parent
dbbad23e12
commit
df3314ac1a
@ -0,0 +1,168 @@
|
||||
/**
|
||||
* ProbabilisticClassifier
|
||||
* Copyright 2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
* first published 06.08.2015 on http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package net.yacy.document;
|
||||
|
||||
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import net.yacy.cora.bayes.BayesClassifier;
import net.yacy.cora.bayes.Classification;
import net.yacy.cora.util.ConcurrentLog;
|
||||
|
||||
public class ProbabilisticClassifier {
|
||||
|
||||
public final static String NONE_CATEGORY_NAME = "NONE";
|
||||
public final static Category NONE_CATEGORY = new Category(NONE_CATEGORY_NAME);
|
||||
|
||||
public static class Category {
|
||||
|
||||
String category_name;
|
||||
|
||||
public Category(String category_name) {
|
||||
this.category_name = category_name;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return this.category_name;
|
||||
}
|
||||
}
|
||||
|
||||
public static class Context {
|
||||
|
||||
private String context_name;
|
||||
private BayesClassifier<String, Category> bayes;
|
||||
|
||||
public Context(String context_name, Map<String, File> categoryExampleLinesFiles, File negativeExampleLines) throws IOException {
|
||||
this.context_name = context_name;
|
||||
int requiredSize = 0;
|
||||
Map<String, List<String>> categoryBuffer = new HashMap<>();
|
||||
for (Map.Entry<String, File> category: categoryExampleLinesFiles.entrySet()) {
|
||||
List<String> list = Files.readAllLines(category.getValue().toPath());
|
||||
categoryBuffer.put(category.getKey(), list);
|
||||
requiredSize += list.size();
|
||||
}
|
||||
List<String> list = Files.readAllLines(negativeExampleLines.toPath());
|
||||
categoryBuffer.put(NONE_CATEGORY_NAME, Files.readAllLines(negativeExampleLines.toPath()));
|
||||
requiredSize += list.size();
|
||||
|
||||
this.bayes = new BayesClassifier<>();
|
||||
this.bayes.setMemoryCapacity(requiredSize);
|
||||
|
||||
for (Map.Entry<String, List<String>> category: categoryBuffer.entrySet()) {
|
||||
Category c = new Category(category.getKey());
|
||||
for (String line: category.getValue()) {
|
||||
List<String> tokens = normalize(line);
|
||||
bayes.learn(c, tokens);
|
||||
}
|
||||
}
|
||||
bayes.learn(NONE_CATEGORY, categoryBuffer.get(NONE_CATEGORY_NAME));
|
||||
}
|
||||
|
||||
private List<String> normalize(String phrase) {
|
||||
String cleanphrase = phrase.toLowerCase().replaceAll("\\W", " ");
|
||||
String[] rawtokens = cleanphrase.split("\\s");
|
||||
List<String> tokens = new ArrayList<>();
|
||||
for (String token: rawtokens) if (token.length() > 2) tokens.add(token);
|
||||
return tokens;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return this.context_name;
|
||||
}
|
||||
|
||||
public Classification<String, Category> classify(String phrase) {
|
||||
List<String> words = normalize(phrase);
|
||||
return this.bayes.classify(words);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static Map<String, Context> contexts;
|
||||
|
||||
public static Set<String> getContextNames() {
|
||||
return contexts.keySet();
|
||||
}
|
||||
|
||||
public static Context getContext(String contextName) {
|
||||
return contexts.get(contextName);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a new classifier set.
|
||||
* @param path_to_context_directory directory containing contexts wich are directories containing .txt files. One of them must be named 'negative.txt'
|
||||
*/
|
||||
public static void initialize(File path_to_context_directory) {
|
||||
contexts = new HashMap<>();
|
||||
String[] context_candidates = path_to_context_directory.list();
|
||||
for (String context_candidate: context_candidates) {
|
||||
File ccf = new File(path_to_context_directory, context_candidate);
|
||||
if (!ccf.isDirectory()) continue;
|
||||
String[] category_candidates = ccf.list();
|
||||
|
||||
Map<String, File> categoryExampleLinesFiles = new HashMap<>();
|
||||
File negativeExampleLines = null;
|
||||
|
||||
for (String category_candidate: category_candidates) {
|
||||
if (!category_candidate.endsWith(".txt")) continue;
|
||||
File catcf = new File(ccf, category_candidate);
|
||||
if (category_candidate.startsWith("negative")) {
|
||||
negativeExampleLines = catcf;
|
||||
} else {
|
||||
categoryExampleLinesFiles.put(category_candidate.substring(0, category_candidate.length() - 4), catcf);
|
||||
}
|
||||
}
|
||||
|
||||
if (negativeExampleLines != null && categoryExampleLinesFiles.size() > 0) {
|
||||
try {
|
||||
Context context = new Context(context_candidate, categoryExampleLinesFiles, negativeExampleLines);
|
||||
contexts.put(context_candidate, context);
|
||||
} catch (IOException e) {
|
||||
ConcurrentLog.logException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* compute the classification of a given text. The result is a map with most probable categorizations for each context.
|
||||
* @param text the text to be classified
|
||||
* @return a map where the key is the navigator name (the bayes context) and the value is the most probable attribute name (the bayes category)
|
||||
*/
|
||||
public static Map<String, String> getClassification(String text) {
|
||||
Map<String, String> c = new HashMap<>();
|
||||
for (Context context: contexts.values()) {
|
||||
Classification<String, Category> classification = context.classify(text);
|
||||
String contextname = context.getName();
|
||||
Category category = classification.getCategory();
|
||||
String categoryname = category.getName();
|
||||
c.put(contextname, categoryname);
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue