/**
 *  ProbabilisticClassifier
 *  Copyright 2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 *  first published 06.08.2015 on http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import net.yacy.cora.bayes.BayesClassifier;
import net.yacy.cora.bayes.Classification;
import net.yacy.cora.util.ConcurrentLog;

/**
 * Holds a set of named bayes contexts, each trained from example text files,
 * and classifies free text against all of them.
 * Call {@link #initialize(File)} to load the contexts before classifying.
 */
public class ProbabilisticClassifier {

    /** Name of the category that is trained from the negative example file. */
    public final static String NONE_CATEGORY_NAME = "NONE";

    /** Shared category instance carrying {@link #NONE_CATEGORY_NAME}. */
    public final static Category NONE_CATEGORY = new Category(NONE_CATEGORY_NAME);

    /**
     * A named category within a context.
     * The class does not override equals/hashCode, so two instances with the
     * same name are distinct objects for any identity- or hash-based lookup.
     */
    public static class Category {

        String category_name;

        /**
         * @param category_name the name of this category
         */
        public Category(String category_name) {
            this.category_name = category_name;
        }

        /**
         * @return the name given at construction time
         */
        public String getName() {
            return this.category_name;
        }
    }

    /**
     * A named bayes classifier that is trained once, at construction time,
     * from one example file per category plus one negative example file.
     */
    public static class Context {

        /** Matches every character that is not a regex word character. */
        private static final Pattern NON_WORD = Pattern.compile("\\W");

        /** Matches a single whitespace character; used to split phrases into tokens. */
        private static final Pattern WHITESPACE = Pattern.compile("\\s");

        private final String context_name;
        private final BayesClassifier<String, Category> bayes;

        /**
         * Read all example files as UTF-8 and train the classifier line by line.
         * @param context_name the name of this context
         * @param categoryExampleLinesFiles maps a category name to the file holding its example lines
         * @param negativeExampleLines file holding example lines for the {@link #NONE_CATEGORY_NAME} category
         * @throws IOException if one of the files cannot be read
         */
        public Context(String context_name, Map<String, File> categoryExampleLinesFiles, File negativeExampleLines) throws IOException {
            this.context_name = context_name;
            int requiredSize = 0;
            Charset charset = StandardCharsets.UTF_8;
            Map<String, List<String>> categoryBuffer = new HashMap<>();
            for (Map.Entry<String, File> category : categoryExampleLinesFiles.entrySet()) {
                List<String> list = Files.readAllLines(category.getValue().toPath(), charset);
                categoryBuffer.put(category.getKey(), list);
                requiredSize += list.size();
            }
            // read the negative examples only once and reuse the list for buffer and size
            List<String> negativeLines = Files.readAllLines(negativeExampleLines.toPath(), charset);
            categoryBuffer.put(NONE_CATEGORY_NAME, negativeLines);
            requiredSize += negativeLines.size();

            this.bayes = new BayesClassifier<>();
            // NOTE(review): requiredSize counts one unit per example line, but the final
            // learn call below adds one more learn invocation -- confirm that the capacity
            // semantics of BayesClassifier tolerate this.
            this.bayes.setMemoryCapacity(requiredSize);

            for (Map.Entry<String, List<String>> category : categoryBuffer.entrySet()) {
                Category c = new Category(category.getKey());
                for (String line : category.getValue()) {
                    List<String> tokens = normalize(line);
                    this.bayes.learn(c, tokens);
                }
            }
            // NOTE(review): this passes the raw, un-normalized lines as features and uses the
            // shared NONE_CATEGORY instance, which is a different object than the "NONE"
            // Category created in the loop above. Kept as-is to preserve the trained model;
            // confirm whether this is intended.
            this.bayes.learn(NONE_CATEGORY, negativeLines);
        }

        /**
         * Turn a phrase into a list of tokens: lower-case it, replace every non-word
         * character with a space, split at whitespace and keep tokens longer than two characters.
         * @param phrase the text to tokenize
         * @return the list of tokens, possibly empty
         */
        private List<String> normalize(String phrase) {
            // Locale.ROOT keeps the lower-casing independent of the default locale of the JVM
            String cleanphrase = NON_WORD.matcher(phrase.toLowerCase(Locale.ROOT)).replaceAll(" ");
            String[] rawtokens = WHITESPACE.split(cleanphrase);
            List<String> tokens = new ArrayList<>();
            for (String token : rawtokens) {
                if (token.length() > 2) {
                    tokens.add(token);
                }
            }
            return tokens;
        }

        /**
         * @return the name of this context
         */
        public String getName() {
            return this.context_name;
        }

        /**
         * Classify a phrase with the classifier of this context.
         * @param phrase the text to classify; it is tokenized the same way as the training lines
         * @return the classification as returned by the underlying bayes classifier
         */
        public Classification<String, Category> classify(String phrase) {
            List<String> words = normalize(phrase);
            return this.bayes.classify(words);
        }

    }

    /** All loaded contexts by name; empty until {@link #initialize(File)} has been called. */
    private static Map<String, Context> contexts = new HashMap<>();

    /**
     * @return the names of all loaded contexts; empty if nothing was loaded
     */
    public static Set<String> getContextNames() {
        return contexts.keySet();
    }

    /**
     * @param contextName the name of a context
     * @return the context with that name or null if no such context was loaded
     */
    public static Context getContext(String contextName) {
        return contexts.get(contextName);
    }

    /**
     * create a new classifier set.
     * Sub-directories without a negative example file or without any category file are skipped;
     * a context whose files cannot be read is logged and skipped.
     * @param path_to_context_directory directory containing contexts which are directories containing .txt files.
     * One of them must be named 'negative.txt'
     */
    public static void initialize(File path_to_context_directory) {
        // build into a local map and publish it at the end, so the lookup methods
        // never work on a partially filled map
        Map<String, Context> loaded = new HashMap<>();
        // File.list() returns null if the path is not a directory or an I/O error occurs
        String[] context_candidates = path_to_context_directory == null ? null : path_to_context_directory.list();
        if (context_candidates != null) {
            for (String context_candidate : context_candidates) {
                File ccf = new File(path_to_context_directory, context_candidate);
                if (!ccf.isDirectory()) continue;
                String[] category_candidates = ccf.list();
                if (category_candidates == null) continue; // directory not readable
                Map<String, File> categoryExampleLinesFiles = new HashMap<>();
                File negativeExampleLines = null;
                for (String category_candidate : category_candidates) {
                    if (!category_candidate.endsWith(".txt")) continue;
                    File catcf = new File(ccf, category_candidate);
                    if (category_candidate.startsWith("negative")) {
                        negativeExampleLines = catcf;
                    } else {
                        // the category name is the file name without the ".txt" extension
                        categoryExampleLinesFiles.put(category_candidate.substring(0, category_candidate.length() - 4), catcf);
                    }
                }
                if (negativeExampleLines != null && categoryExampleLinesFiles.size() > 0) {
                    try {
                        Context context = new Context(context_candidate, categoryExampleLinesFiles, negativeExampleLines);
                        loaded.put(context_candidate, context);
                    } catch (IOException e) {
                        ConcurrentLog.logException(e);
                    }
                }
            }
        }
        contexts = loaded;
    }

    /**
     * compute the classification of a given text. The result is a map with most probable categorizations for each context.
     * Contexts for which the classifier yields no result are left out of the map.
     * @param text the text to be classified
     * @return a map where the key is the navigator name (the bayes context) and the value is the most probable attribute name (the bayes category)
     */
    public static Map<String, String> getClassification(String text) {
        Map<String, String> c = new HashMap<>();
        for (Context context : contexts.values()) {
            Classification<String, Category> classification = context.classify(text);
            if (classification == null) continue;
            Category category = classification.getCategory();
            if (category == null) continue;
            c.put(context.getName(), category.getName());
        }
        return c;
    }

}