From 3190347814991a98b22f7c4fa53a4c7b016199dd Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 2 Oct 2012 00:02:50 +0200 Subject: [PATCH] added a synonyms_t field to solr and a process to read synonym files. This can be used to add an additional stemming method to solr using stemming files that are expressed as synonyms for grammatical alternatives. The synonym/stemming files must have the following form: - each line is a comma-separated list of synonyms - the list of synonyms may be enclosed in {} (like the GSA synonyms file) - the file may contain comments, which are lines starting with a '#' The synonym file(s) must be placed in DATA/DICTIONARIES/synonyms/ and are activated by default whenever a synonym file is in place. Then, for each word that is found in a document, all synonyms are added to a long text field which is stored in synonyms_t. Processes using the synonyms must query with that field as an optional matcher. --- defaults/solr.keys.list | 3 +++ .../yacy/cora/federate/solr/YaCySchema.java | 1 + .../net/yacy/data/ymark/YMarkAutoTagger.java | 2 +- source/net/yacy/document/Condenser.java | 19 ++++++++++++++++++- source/net/yacy/document/LibraryProvider.java | 19 +++++++++++++------ .../yacy/document/parser/torrentParser.java | 2 +- source/net/yacy/search/Switchboard.java | 9 +++++---- .../net/yacy/search/index/DocumentIndex.java | 2 +- source/net/yacy/search/index/Segment.java | 4 ++-- .../yacy/search/index/SolrConfiguration.java | 9 ++++++++- 10 files changed, 53 insertions(+), 17 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index 67f463977..41f6cee75 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -119,6 +119,9 @@ responsetime_i ## all visible text, text text_t +## additional synonyms to the words in the text, text +synonyms_t + ## h1 header h1_txt diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java index 5c22d8194..530434ff7 100644 --- 
a/source/net/yacy/cora/federate/solr/YaCySchema.java +++ b/source/net/yacy/cora/federate/solr/YaCySchema.java @@ -67,6 +67,7 @@ public enum YaCySchema implements Schema { imagescount_i(SolrType.integer, true, true, false, "number of images"), responsetime_i(SolrType.integer, true, true, false, "response time of target server in milliseconds"), text_t(SolrType.text_general, true, true, false, "all visible text"), + synonyms_t(SolrType.text_general, true, true, false, "additional synonyms to the words in the text"), h1_txt(SolrType.text_general, true, true, true, "h1 header"), h2_txt(SolrType.text_general, true, true, true, "h2 header"), h3_txt(SolrType.text_general, true, true, true, "h3 header"), diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index 70f4472d9..78ba98851 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -85,7 +85,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } //get words from document - final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words(); + final Map words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words(); // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 677015a70..67a44da03 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -38,6 +38,7 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.WordCache; import net.yacy.cora.document.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.language.synonyms.SynonymLibrary; import 
net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.document.language.Identificator; import net.yacy.document.parser.html.ImageEntry; @@ -66,7 +67,8 @@ public final class Condenser { //private Properties analysis; private final Map words; // a string (the words) to (indexWord) - relation private final Map> tags = new HashMap>(); // a set of tags, discovered from Autotagging - + private final Set synonyms; // a set of synonyms to the words + public int RESULT_NUMB_WORDS = -1; public int RESULT_DIFF_WORDS = -1; public int RESULT_NUMB_SENTENCES = -1; @@ -79,12 +81,14 @@ public final class Condenser { final boolean indexText, final boolean indexMedia, final WordCache meaningLib, + final SynonymLibrary stemming, final boolean doAutotagging ) { Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the appropriate media flag this.words = new HashMap(); + this.synonyms = new HashSet(); this.RESULT_FLAGS = new Bitfield(4); // construct flag set for document @@ -202,6 +206,14 @@ public final class Condenser { if (!this.tags.isEmpty()) { document.addMetatags(this.tags); } + + // create the synonyms set + if (stemming != null) { + for (String word: this.words.keySet()) { + Set syms = stemming.getSynonyms(word); + if (syms != null) this.synonyms.addAll(syms); + } + } } private void insertTextToWords( @@ -239,6 +251,7 @@ public final class Condenser { this.languageIdentificator = null; // we don't need that here // analysis = new Properties(); this.words = new TreeMap(); + this.synonyms = new HashSet(); createCondensement(text, meaningLib, doAutotagging); } @@ -254,6 +267,10 @@ public final class Condenser { // returns the words as word/indexWord relation map return this.words; } + + public Set synonyms() { + return this.synonyms; + } public String language() { return this.languageIdentificator.getLanguage(); diff 
--git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index 368f186e3..f5b991024 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ b/source/net/yacy/document/LibraryProvider.java @@ -47,6 +47,7 @@ import net.yacy.cora.document.WordCache; import net.yacy.cora.geo.GeonamesLocation; import net.yacy.cora.geo.OpenGeoDBLocation; import net.yacy.cora.geo.OverarchingLocation; +import net.yacy.cora.language.synonyms.SynonymLibrary; import net.yacy.cora.lod.JenaTripleStore; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging.SOTuple; @@ -61,19 +62,19 @@ public class LibraryProvider { public static final String path_to_source_dictionaries = "source"; public static final String path_to_did_you_mean_dictionaries = "didyoumean"; public static final String path_to_autotagging_dictionaries = "autotagging"; + public static final String path_to_synonym_dictionaries = "synonyms"; public static final String disabledExtension = ".disabled"; public static WordCache dymLib = new WordCache(null); public static Autotagging autotagging = null; + public static SynonymLibrary synonyms = null; public static OverarchingLocation geoLoc = new OverarchingLocation(); private static File dictSource = null; private static File dictRoot = null; public static enum Dictionary { - GEODB0( - "geo0", - "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ), + GEODB0( "geo0", "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ), GEODB1( "geo1", "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02624_2011-10-17.sql.gz" ), GEON0( "geon0", "http://download.geonames.org/export/dump/cities1000.zip" ), GEON1( "geon1", "http://download.geonames.org/export/dump/cities5000.zip" ), @@ -121,6 +122,7 @@ public class LibraryProvider { initAutotagging(); activateDeReWo(); initDidYouMean(); + initSynonyms(); integrateOpenGeoDB(); 
integrateGeonames0(-1); integrateGeonames1(-1); @@ -169,8 +171,7 @@ public class LibraryProvider { geoLoc.activateLocation(Dictionary.GEON2.nickname, new GeonamesLocation(geon, dymLib, minPopulation)); return; } - } - + } public static void initDidYouMean() { final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries); if ( !dymDict.exists() ) { @@ -186,7 +187,13 @@ public class LibraryProvider { } autotagging = new Autotagging(autotaggingPath); } - + public static void initSynonyms() { + final File synonymPath = new File(dictRoot, path_to_synonym_dictionaries); + if ( !synonymPath.exists() ) { + synonymPath.mkdirs(); + } + synonyms = new SynonymLibrary(synonymPath); + } public static void activateDeReWo() { // translate input files (once..) final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries); diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index d83b162ac..1b9b2041e 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -118,7 +118,7 @@ public class torrentParser extends AbstractParser implements Parser { byte[] b = FileUtils.read(new File(args[0])); torrentParser parser = new torrentParser(); Document[] d = parser.parse(new DigestURI("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b)); - Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false); + Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false); Map w = c.words(); for (Map.Entry e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText); } catch (IOException e) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2c25a4f01..ed18c6aa1 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2464,9 +2464,10 @@ public 
final class Switchboard extends serverSwitch final Condenser[] condenser = new Condenser[in.documents.length]; for ( int i = 0; i < in.documents.length; i++ ) { condenser[i] = - new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry - .profile() - .indexMedia(), LibraryProvider.dymLib, true); + new Condenser( + in.documents[i], in.queueEntry.profile().indexText(), + in.queueEntry.profile().indexMedia(), + LibraryProvider.dymLib, LibraryProvider.synonyms, true); // update image result list statistics // its good to do this concurrently here, because it needs a DNS lookup @@ -2714,7 +2715,7 @@ public final class Switchboard extends serverSwitch if ( document.indexingDenied() ) { throw new Parser.Failure("indexing is denied", url); } - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true); + final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true); ResultImages.registerImages(url, document, true); Switchboard.this.webStructure.generateCitationReference(url, document); storeDocumentIndex( diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 2b41a7f75..73637c29b 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -164,7 +164,7 @@ public class DocumentIndex extends Segment { int c = 0; for ( final Document document : documents ) { if (document == null) continue; - final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true); + final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true); rows[c++] = super.storeDocument( url, diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 756b3c23b..a11ad8534 100644 --- a/source/net/yacy/search/index/Segment.java +++ 
b/source/net/yacy/search/index/Segment.java @@ -395,7 +395,7 @@ public class Segment { // STORE TO SOLR try { - this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, metadata)); + this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, metadata)); } catch ( final IOException e ) { Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage()); } @@ -517,7 +517,7 @@ public class Segment { } // get the word set Set words = null; - words = new Condenser(document, true, true, null, false).words().keySet(); + words = new Condenser(document, true, true, null, null, false).words().keySet(); // delete all word references int count = 0; diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index ec9290541..87b30f636 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ b/source/net/yacy/search/index/SolrConfiguration.java @@ -338,7 +338,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.'); } - public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) { + public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) { // we use the SolrCell design as index scheme final SolrInputDocument doc = new SolrInputDocument(); final DigestURI digestURI = new DigestURI(yacydoc.dc_source()); @@ -416,6 +416,13 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable final int contentwc = content.split(" ").length; add(doc, YaCySchema.wordcount_i, contentwc); } + if (allAttr || 
contains(YaCySchema.synonyms_t)) { + Set synonyms = condenser.synonyms(); + StringBuilder s = new StringBuilder(synonyms.size() * 8); + for (String o: synonyms) s.append(o).append(' '); + if (s.length() > 0) s.setLength(s.length() - 1); + add(doc, YaCySchema.synonyms_t, s.toString()); + } // path elements of link if (allAttr || contains(YaCySchema.url_paths_sxt)) add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());