diff --git a/defaults/yacy.init b/defaults/yacy.init index a238d2ed4..352f1b8c6 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -258,6 +258,12 @@ surrogates.out = DATA/SURROGATES/out # this directory also contains subdirectories for input sources, the did-you-mean function and other dictionaries = DATA/DICTIONARIES +# a path to the classification directory +# each subdirectory is the name of a context (which becomes a navigator) with '.txt' files +# containing texts to teach a bayesian filter. One of the files must be named 'negative.txt'. +# The text files can be created with the Export functionality using the option "Only Text". +classification = DATA/CLASSIFICATION + # storage place for new releases releases = DATA/RELEASE diff --git a/htroot/js/yacysearch.js b/htroot/js/yacysearch.js index c3994e009..413eabf58 100644 --- a/htroot/js/yacysearch.js +++ b/htroot/js/yacysearch.js @@ -52,8 +52,8 @@ function statistics(offset, itemscount, itemsperpage, totalcount, localResourceS resnav += "\">«"; } - numberofpages = Math.floor(Math.min(10, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage))); - if (!numberofpages) numberofpages = 10; + numberofpages = Math.floor(Math.min(9, 1 + ((totalcount.replace(/\./g,'') - 1) / itemsperpage))); + if (!numberofpages) numberofpages = 9; for (i = 0; i < numberofpages; i++) { if (i == thispage) { resnav += "
  • "; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 3706113bb..fee284c8d 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -49,6 +49,7 @@ import net.yacy.cora.federate.FederateSearchManager; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.cora.lod.vocabulary.Tagging.Metatag; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; @@ -443,7 +444,12 @@ public class yacysearch { if (p > 0) { String k = vocabulary.substring(0, p); String v = vocabulary.substring(p + 1); - metatags.add(LibraryProvider.autotagging.metatag(k, v)); + Metatag mt = LibraryProvider.autotagging.metatag(k, v); + if (mt != null) { + metatags.add(mt); + } else { + + } } } diff --git a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java index 54197e6bb..4cf70ce5c 100644 --- a/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java +++ b/source/net/yacy/cora/language/synonyms/AutotaggingLibrary.java @@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.geo.Locations; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.document.ProbabilisticClassifier; /** * Autotagging provides a set of tag/print-name properties which can be used to @@ -167,6 +168,12 @@ public class AutotaggingLibrary { public Tagging.Metatag metatag(String vocName, String term) { Tagging tagging = this.vocabularies.get(vocName); + if (tagging == null) { + if (ProbabilisticClassifier.getContextNames().contains(vocName)) { + tagging = new Tagging(vocName); + } + } + if (tagging == null) return null; return tagging.getMetatagFromTerm(Tagging.decodeMaskname(term)); } diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java 
b/source/net/yacy/cora/lod/vocabulary/Tagging.java index f642ca6f3..31fbc9461 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -90,7 +90,7 @@ public class Tagging { } - private Tagging(String name) { + public Tagging(String name) { this.navigatorName = name; this.synonym2term = new ConcurrentHashMap(); this.term2synonym = new ConcurrentHashMap(); @@ -544,6 +544,11 @@ public class Tagging { return term; } + /** + * The metatag class contains the object value for a Linked Open Data RDF triple. + * The metatag is created in a tagging environment, which already contains the + * subject and the predicate. The metatag is the object of the RDF triple. + */ public class Metatag { private final String object; private Metatag(String object) { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index f72ea5890..ad50e16ef 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -250,12 +250,17 @@ dc_rights /** * add the given words to the set of keywords. 
* These keywords will appear in dc_subject - * @param tags + * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags */ protected void addMetatags(Map> tags) { this.generic_facets.putAll(computeGenericFacets(tags)); } + /** + * compute generic facets + * @param tags a map where the key is the navigator name and the value is the set of attributes as metatags + * @return a map where the key is the navigator name and the value is the set of attribute names + */ public static Map> computeGenericFacets(Map> tags) { Map> gf = new HashMap>(); for (Map.Entry> e: tags.entrySet()) { diff --git a/source/net/yacy/document/ProbabilisticClassifier.java b/source/net/yacy/document/ProbabilisticClassifier.java new file mode 100644 index 000000000..b729a311c --- /dev/null +++ b/source/net/yacy/document/ProbabilisticClassifier.java @@ -0,0 +1,168 @@ +/** + * ProbabilisticClassifier + * Copyright 2015 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * first published 06.08.2015 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>.
+ */ + + +package net.yacy.document; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import net.yacy.cora.bayes.BayesClassifier; +import net.yacy.cora.bayes.Classification; +import net.yacy.cora.util.ConcurrentLog; + +public class ProbabilisticClassifier { + + public final static String NONE_CATEGORY_NAME = "NONE"; + public final static Category NONE_CATEGORY = new Category(NONE_CATEGORY_NAME); + + public static class Category { + + String category_name; + + public Category(String category_name) { + this.category_name = category_name; + } + + public String getName() { + return this.category_name; + } + } + + public static class Context { + + private String context_name; + private BayesClassifier bayes; + + public Context(String context_name, Map categoryExampleLinesFiles, File negativeExampleLines) throws IOException { + this.context_name = context_name; + int requiredSize = 0; + Map> categoryBuffer = new HashMap<>(); + for (Map.Entry category: categoryExampleLinesFiles.entrySet()) { + List list = Files.readAllLines(category.getValue().toPath()); + categoryBuffer.put(category.getKey(), list); + requiredSize += list.size(); + } + List list = Files.readAllLines(negativeExampleLines.toPath()); + categoryBuffer.put(NONE_CATEGORY_NAME, Files.readAllLines(negativeExampleLines.toPath())); + requiredSize += list.size(); + + this.bayes = new BayesClassifier<>(); + this.bayes.setMemoryCapacity(requiredSize); + + for (Map.Entry> category: categoryBuffer.entrySet()) { + Category c = new Category(category.getKey()); + for (String line: category.getValue()) { + List tokens = normalize(line); + bayes.learn(c, tokens); + } + } + bayes.learn(NONE_CATEGORY, categoryBuffer.get(NONE_CATEGORY_NAME)); + } + + private List normalize(String phrase) { + String cleanphrase = phrase.toLowerCase().replaceAll("\\W", " "); + String[] 
rawtokens = cleanphrase.split("\\s"); + List tokens = new ArrayList<>(); + for (String token: rawtokens) if (token.length() > 2) tokens.add(token); + return tokens; + } + + public String getName() { + return this.context_name; + } + + public Classification classify(String phrase) { + List words = normalize(phrase); + return this.bayes.classify(words); + } + + } + + private static Map contexts; + + public static Set getContextNames() { + return contexts.keySet(); + } + + public static Context getContext(String contextName) { + return contexts.get(contextName); + } + + /** + * create a new classifier set. + * @param path_to_context_directory directory containing contexts which are directories containing .txt files. One of them must be named 'negative.txt' + */ + public static void initialize(File path_to_context_directory) { + contexts = new HashMap<>(); + String[] context_candidates = path_to_context_directory.list(); + for (String context_candidate: context_candidates) { + File ccf = new File(path_to_context_directory, context_candidate); + if (!ccf.isDirectory()) continue; + String[] category_candidates = ccf.list(); + + Map categoryExampleLinesFiles = new HashMap<>(); + File negativeExampleLines = null; + + for (String category_candidate: category_candidates) { + if (!category_candidate.endsWith(".txt")) continue; + File catcf = new File(ccf, category_candidate); + if (category_candidate.startsWith("negative")) { + negativeExampleLines = catcf; + } else { + categoryExampleLinesFiles.put(category_candidate.substring(0, category_candidate.length() - 4), catcf); + } + } + + if (negativeExampleLines != null && categoryExampleLinesFiles.size() > 0) { + try { + Context context = new Context(context_candidate, categoryExampleLinesFiles, negativeExampleLines); + contexts.put(context_candidate, context); + } catch (IOException e) { + ConcurrentLog.logException(e); + } + } + } + } + + /** + * compute the classification of a given text. 
The result is a map with most probable categorizations for each context. + * @param text the text to be classified + * @return a map where the key is the navigator name (the bayes context) and the value is the most probable attribute name (the bayes category) + */ + public static Map getClassification(String text) { + Map c = new HashMap<>(); + for (Context context: contexts.values()) { + Classification classification = context.classify(text); + String contextname = context.getName(); + Category category = classification.getCategory(); + String categoryname = category.getName(); + c.put(contextname, categoryname); + } + return c; + } + +} diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 5a8480935..a61bc4ae1 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -153,6 +153,7 @@ import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.document.Parser.Failure; @@ -242,7 +243,7 @@ public final class Switchboard extends serverSwitch { // storage management public File htCachePath; - public final File dictionariesPath; + public final File dictionariesPath, classificationPath; public File listsPath; public File htDocsPath; public File workPath; @@ -374,11 +375,20 @@ public final class Switchboard extends serverSwitch { } this.log.config("Work Path: " + this.workPath.toString()); + this.dictionariesPath = getDataPath( SwitchboardConstants.DICTIONARY_SOURCE_PATH, SwitchboardConstants.DICTIONARY_SOURCE_PATH_DEFAULT); this.log.config("Dictionaries Path:" + this.dictionariesPath.toString()); + if (!this.dictionariesPath.exists()) this.dictionariesPath.mkdirs(); + + this.classificationPath = + getDataPath( + 
SwitchboardConstants.CLASSIFICATION_SOURCE_PATH, + SwitchboardConstants.CLASSIFICATION_SOURCE_PATH_DEFAULT); + this.log.config("Classification Path:" + this.classificationPath.toString()); + if (!this.classificationPath.exists()) this.classificationPath.mkdirs(); CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_HTTPS = this.getConfigBool("search.ranking.uniqueheuristic.preferhttps", false); CollectionConfiguration.UNIQUE_HEURISTIC_PREFER_WWWPREFIX = this.getConfigBool("search.ranking.uniqueheuristic.preferwwwprefix", true); @@ -397,6 +407,9 @@ public final class Switchboard extends serverSwitch { Tagging t = LibraryProvider.autotagging.getVocabulary(o); if (t != null) t.setFacet(false); } + + Thread.currentThread().setName("ProbabilisticClassification.initialize"); + ProbabilisticClassifier.initialize(Switchboard.this.classificationPath); } }.start(); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 82295624f..32cc6c916 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -413,6 +413,9 @@ public final class SwitchboardConstants { public static final String DICTIONARY_SOURCE_PATH = "dictionaries"; public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES"; + + public static final String CLASSIFICATION_SOURCE_PATH = "classification"; + public static final String CLASSIFICATION_SOURCE_PATH_DEFAULT = "DATA/CLASSIFICATION"; /** *

    public static final String HTDOCS_PATH = "htDocsPath"

    diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 28316350a..1adafe904 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -54,6 +54,7 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.LibraryProvider; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.Tokenizer; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReferenceRow; @@ -262,6 +263,9 @@ public final class QueryParams { this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); } } + for (String context: ProbabilisticClassifier.getContextNames()) { + this.facetfields.add(CollectionSchema.VOCABULARY_PREFIX + context + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + } this.cachedQuery = null; } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 70bf0b998..cdcec498d 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -36,6 +36,7 @@ import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; @@ -72,6 +73,7 @@ import net.yacy.crawler.retrieval.Response; import net.yacy.data.WorkTables; import net.yacy.document.LargeNumberCache; import net.yacy.document.LibraryProvider; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.TextParser; import net.yacy.document.Tokenizer; import net.yacy.kelondro.data.meta.URIMetadataNode; @@ -882,13 +884,16 @@ public final class SearchEvent { } // get the vocabulary navigation - for (Tagging v: LibraryProvider.autotagging.getVocabularies()) { - 
fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v.getName() + CollectionSchema.VOCABULARY_TERMS_SUFFIX); + Set genericFacets = new LinkedHashSet<>(); + for (Tagging v: LibraryProvider.autotagging.getVocabularies()) genericFacets.add(v.getName()); + genericFacets.addAll(ProbabilisticClassifier.getContextNames()); + for (String v: genericFacets) { + fcts = facets.get(CollectionSchema.VOCABULARY_PREFIX + v + CollectionSchema.VOCABULARY_TERMS_SUFFIX); if (fcts != null) { - ScoreMap vocNav = this.vocabularyNavigator.get(v.getName()); + ScoreMap vocNav = this.vocabularyNavigator.get(v); if (vocNav == null) { vocNav = new ConcurrentScoreMap(); - this.vocabularyNavigator.put(v.getName(), vocNav); + this.vocabularyNavigator.put(v, vocNav); } vocNav.inc(fcts); } @@ -1242,7 +1247,7 @@ public final class SearchEvent { // check vocabulary terms (metatags) {only available in Solr index as vocabulary_xxyyzzz_sxt field} - // TODO: vocabulary is only valid and available in local Solr index (considere to auto-switch to Searchdom.LOCAL) + // TODO: vocabulary is only valid and available in local Solr index (consider to auto-switch to Searchdom.LOCAL) if (this.query.metatags != null && !this.query.metatags.isEmpty()) { tagloop: for (Tagging.Metatag tag : this.query.metatags) { SolrDocument sdoc = page; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 7211a363a..07471e14e 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -81,6 +81,7 @@ import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; +import net.yacy.document.ProbabilisticClassifier; import net.yacy.document.SentenceReader; import net.yacy.document.Tokenizer; import net.yacy.document.content.DCEntry; @@ -1006,6 +1007,12 @@ public class 
CollectionConfiguration extends SchemaConfiguration implements Seri return doc; } + /** + * attach additional information to the document to enable navigation features + * @param doc the document to be enriched + * @param synonyms a list of synonyms detected for the text content + * @param genericFacets a map where the key is the navigator name and the value is the set of attribute names + */ public void enrich(SolrInputDocument doc, List synonyms, Map> genericFacets) { remove(doc, CollectionSchema.vocabularies_sxt); // delete old values for (SolrInputField sif: doc) { @@ -1016,6 +1023,17 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // there are no pre-defined solr fields for navigation because the vocabulary is generic // we use dynamically allocated solr fields for this. // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names + + // add to genericFacets the probabilistic categories + String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); + Map classification = ProbabilisticClassifier.getClassification(text); + for (Map.Entry entry: classification.entrySet()) { + Set facetAttrbutes = new HashSet<>(); + facetAttrbutes.add(entry.getValue()); + genericFacets.put(entry.getKey(), facetAttrbutes); + } + + // compute the document field values List vocabularies = new ArrayList<>(); for (Map.Entry> facet: genericFacets.entrySet()) { String facetName = facet.getKey();