From 90f75c8c3db1d322aca97e5c5888d6657d26bc70 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Thu, 2 Jul 2015 00:23:50 +0200
Subject: [PATCH] added enrichment of synonyms and vocabularies for imported
 documents during surrogate reading: the synonym and vocabulary attributes
 from the dump are removed during the import process and replaced by newly
 detected attributes according to the settings of the YaCy peer. This may
 cause all such attributes to be removed if the importing peer has no
 synonyms and/or no vocabularies defined.

---
 source/net/yacy/document/Document.java        | 17 +++---
 source/net/yacy/document/Tokenizer.java       |  4 ++
 .../net/yacy/document/VocabularyScraper.java  |  4 +-
 source/net/yacy/document/WordTokenizer.java   |  2 +-
 source/net/yacy/search/Switchboard.java       | 19 +++++-
 .../schema/CollectionConfiguration.java       | 61 ++++++++++---------
 6 files changed, 67 insertions(+), 40 deletions(-)

diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index cc69970ac..f72ea5890 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -253,24 +253,23 @@ dc_rights
      * @param tags
      */
     protected void addMetatags(Map<String, Set<Tagging.Metatag>> tags) {
-        //String subject = YaCyMetadata.hashURI(this.source.hash());
-        //for (String s: this.keywords) {
-        //    tags.remove(s);
-        //}
+        this.generic_facets.putAll(computeGenericFacets(tags));
+    }
+
+    public static Map<String, Set<String>> computeGenericFacets(Map<String, Set<Tagging.Metatag>> tags) {
+        Map<String, Set<String>> gf = new HashMap<String, Set<String>>();
         for (Map.Entry<String, Set<Tagging.Metatag>> e: tags.entrySet()) {
             Tagging vocabulary = LibraryProvider.autotagging.getVocabulary(e.getKey());
             if (vocabulary == null) continue;
-            //String objectspace = vocabulary.getObjectspace();
-            //StringBuilder sb = new StringBuilder(e.getValue().size() * 20);
             Set<String> objects = new HashSet<String>();
             for (Tagging.Metatag s: e.getValue()) {
                 objects.add(s.getObject());
-                //sb.append(',').append(s.getObject());
             }
-            this.generic_facets.put(vocabulary.getName(), objects);
+            gf.put(vocabulary.getName(), objects);
         }
+        return gf;
     }
-    
+
     public String[] dc_subject() {
         // sort out doubles and empty words
         final TreeSet<String> hs = new TreeSet<String>();
diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java
index ed3b0fd0d..ff2e94bff 100644
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@@ -237,5 +237,9 @@ public class Tokenizer {
         for (String s: this.synonyms) l.add(s);
         return l;
     }
+    
+    public Map<String, Set<Tagging.Metatag>> tags() {
+        return this.tags;
+    }
 
 }
diff --git a/source/net/yacy/document/VocabularyScraper.java b/source/net/yacy/document/VocabularyScraper.java
index 967a4afbd..0ce349e06 100644
--- a/source/net/yacy/document/VocabularyScraper.java
+++ b/source/net/yacy/document/VocabularyScraper.java
@@ -40,8 +40,10 @@ public class VocabularyScraper {
         this.vocMap = new ConcurrentHashMap<>();
     }
     
+    /**
+     * @param init must be a property list of property lists: the key of the top property list is the name of the vocabulary, the name of the embedded property list is the entity class and the value of the embedded property is the entity name
+     */
     public VocabularyScraper(JSONObject init) {
-        // init must be a property list of property lists: the key of the top property list is the name of the vocabulary, the name of the embedded property list is the entity class and the value of the embedded property is the entity name
         this.scraperDefinition = init == null ? new JSONObject() : init;
         this.vocMap = new ConcurrentHashMap<>();
         if (this.scraperDefinition.length() == 0) {
diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java
index 598cd0f64..69d78ae71 100644
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@@ -72,7 +72,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
         final StringBuilder r = (this.buffer == null) ? null : this.buffer;
         this.buffer = nextElement0();
         // put word to words statistics cache
-        if (this.meaningLib != null) WordCache.learn(r);
+        if (this.meaningLib != null && r != null) WordCache.learn(r);
         return r;
     }
 
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 70a067a81..e46a3e9dc 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -154,7 +154,9 @@ import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
+import net.yacy.document.VocabularyScraper;
 import net.yacy.document.Parser.Failure;
+import net.yacy.document.Tokenizer;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
 import net.yacy.document.parser.audioTagParser;
@@ -1996,10 +1998,25 @@ public final class Switchboard extends serverSwitch {
             indexer[t] = new Thread() {
                 @Override
                 public void run() {
+                    VocabularyScraper scraper = new VocabularyScraper();
                     SolrInputDocument surrogate;
                     while ((surrogate = reader.take()) != SurrogateReader.POISON_DOCUMENT ) {
-                        // check if url is in accepted domain
                         assert surrogate != null;
+                        try {
+                            // enrich the surrogate
+                            final DigestURL root = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
+                            final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+                            if (text != null && text.length() > 0) {
+                                // run the tokenizer on the text to get vocabularies and synonyms
+                                final Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
+                                final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
+                                // overwrite the given vocabularies and synonyms with new computed ones
+                                Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
+                            }
+                        } catch (MalformedURLException e) {
+                            ConcurrentLog.logException(e);
+                        }
+                        // write the surrogate into the index
                         Switchboard.this.index.putDocument(surrogate);
                         if (shallTerminate()) break;
                     }
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 362de281e..304d790c7 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -82,6 +82,7 @@ import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.SentenceReader;
+import net.yacy.document.Tokenizer;
 import net.yacy.document.content.DCEntry;
 import net.yacy.document.parser.html.ContentScraper;
 import net.yacy.document.parser.html.ImageEntry;
@@ -301,7 +302,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
 
         String keywords = md.dc_subject();
         Bitfield flags = md.flags();
-        if (flags.get(Condenser.flag_cat_indexof)) {
+        if (flags.get(Tokenizer.flag_cat_indexof)) {
             if (keywords == null || keywords.isEmpty()) keywords = "indexof"; else {
                 if (keywords.indexOf(',') > 0) keywords += ", indexof"; else keywords += " indexof";
             }
@@ -511,10 +512,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             }
             add(doc, CollectionSchema.keywords, keywords);
         }
-        if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
-            List<String> synonyms = condenser.synonyms();
-            add(doc, CollectionSchema.synonyms_sxt, synonyms);
-        }
 
         // unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is)
         add(doc, CollectionSchema.http_unique_b, setUnique || UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL.isHTTPS() : digestURL.isHTTP()); // this must be corrected afterwards during storage!
@@ -993,29 +990,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, document.getVideolinks().size());
         if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, document.getApplinks().size());
 
-        // write generic navigation
-        // there are no pre-defined solr fields for navigation because the vocabulary is generic
-        // we use dynamically allocated solr fields for this.
-        // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
-        List<String> vocabularies = new ArrayList<>();
-        for (Map.Entry<String, Set<String>> facet: document.getGenericFacets().entrySet()) {
-            String facetName = facet.getKey();
-            Set<String> facetValues = facet.getValue();
-            int count = facetValues.size();
-            if (count == 0) continue;
-            int logcount = (int) (Math.log(count) / Math.log(2));
-            Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
-            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
-            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
-            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
-            doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
-            vocabularies.add(facetName);
-        }
-        if ((allAttr || contains(CollectionSchema.vocabularies_sxt)) && vocabularies.size() > 0) {
-            add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
-        }
-        
-        
+        // document post-processing
         if ((allAttr || contains(CollectionSchema.process_sxt)) && processTypes.size() > 0) {
             List<String> p = new ArrayList<String>();
             for (ProcessType t: processTypes) p.add(t.name());
@@ -1024,8 +999,38 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 add(doc, CollectionSchema.harvestkey_s, sourceName);
             }
         }
+        
+        // document enrichments (synonyms, facets)
+        enrich(doc, condenser.synonyms(), document.getGenericFacets());
         return doc;
     }
+    
+    public void enrich(SolrInputDocument doc, List<String> synonyms, Map<String, Set<String>> genericFacets) {
+        if (this.isEmpty() || contains(CollectionSchema.vocabularies_sxt)) {
+            // write generic navigation
+            // there are no pre-defined solr fields for navigation because the vocabulary is generic
+            // we use dynamically allocated solr fields for this.
+            // It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
+            List<String> vocabularies = new ArrayList<>();
+            for (Map.Entry<String, Set<String>> facet: genericFacets.entrySet()) {
+                String facetName = facet.getKey();
+                Set<String> facetValues = facet.getValue();
+                int count = facetValues.size();
+                if (count == 0) continue;
+                int logcount = (int) (Math.log(count) / Math.log(2));
+                Integer[] counts = new Integer[logcount + 1]; for (int i = 0; i <= logcount; i++) counts[i] = i;
+                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_TERMS_SUFFIX, facetValues.toArray(new String[count]));
+                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_COUNT_SUFFIX, facetValues.size());
+                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNT_SUFFIX, logcount);
+                doc.setField(CollectionSchema.VOCABULARY_PREFIX + facetName + CollectionSchema.VOCABULARY_LOGCOUNTS_SUFFIX, counts);
+                vocabularies.add(facetName);
+            }
+            if (vocabularies.size() > 0) add(doc, CollectionSchema.vocabularies_sxt, vocabularies);
+        }
+        if (this.isEmpty() || contains(CollectionSchema.synonyms_sxt)) {
+            if (synonyms.size() > 0) add(doc, CollectionSchema.synonyms_sxt, synonyms);
+        }
+    }
 
     public static boolean postprocessingRunning = false;
     public static String postprocessingActivity = "";
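
For illustration, a minimal sketch of how the enrichment path introduced above can be driven
for a single imported surrogate, mirroring the indexer thread loop added to Switchboard.
The class and method names of the sketch are placeholders, and the import paths for DigestURL
and ASCII are assumed from the surrounding code base; everything else uses only calls that
appear in this patch (Tokenizer, Document.computeGenericFacets, CollectionConfiguration.enrich).

    import java.net.MalformedURLException;
    import java.util.Map;
    import java.util.Set;

    import org.apache.solr.common.SolrInputDocument;

    import net.yacy.cora.document.encoding.ASCII;   // assumed package path
    import net.yacy.cora.document.id.DigestURL;     // assumed package path
    import net.yacy.document.Document;
    import net.yacy.document.LibraryProvider;
    import net.yacy.document.Tokenizer;
    import net.yacy.document.VocabularyScraper;
    import net.yacy.search.schema.CollectionConfiguration;
    import net.yacy.search.schema.CollectionSchema;

    // placeholder class name, not part of the patch
    public class SurrogateEnrichmentSketch {

        /**
         * Re-detect synonyms and vocabulary facets for one surrogate document and
         * overwrite the corresponding Solr fields, as the patched indexer thread in
         * Switchboard does. The CollectionConfiguration is expected to be the peer's
         * default configuration (index.fulltext().getDefaultConfiguration()).
         */
        static void enrichSurrogate(SolrInputDocument surrogate, CollectionConfiguration config) {
            VocabularyScraper scraper = new VocabularyScraper(); // no per-site scraper rules
            try {
                DigestURL root = new DigestURL(
                        (String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()),
                        ASCII.getBytes((String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName())));
                String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
                if (text != null && text.length() > 0) {
                    // tokenize the full text with the peer's own dictionaries and vocabularies
                    Tokenizer tokenizer = new Tokenizer(root, text, LibraryProvider.dymLib, true, scraper);
                    Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
                    // writes synonyms_sxt, vocabularies_sxt and the vocabulary_* fields on the document
                    config.enrich(surrogate, tokenizer.synonyms(), facets);
                }
            } catch (MalformedURLException e) {
                // sku did not contain a valid URL; leave the surrogate unchanged
            }
        }
    }

If the importing peer has no vocabularies and no synonym dictionaries configured, the tokenizer
yields no tags and no synonyms, so enrich() adds nothing, which matches the case described in
the commit message where the imported attributes end up removed.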