From 5e8879beb7865ca31377e0ec87be0450064867de Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 16 Feb 2017 01:43:14 +0100 Subject: [PATCH] Reduce self-generated content for text_t (visible text index field) to avoid repeating the tokenized URL as description; continuation of https://github.com/yacy/yacy_search_server/commit/7e09bff4a1a117d2f2336e004ec67ffb325a7e9d https://github.com/yacy/yacy_search_server/commit/1409cabe8b7bce1fb767f01665d9d7e0a91a81b6 Add some javadoc, and drop the unneeded removal of omitted fields in postprocessing. --- .../schema/CollectionConfiguration.java | 39 ++++++++----------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 7bc87e09e..8dd70ae28 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -247,10 +247,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } /** - * add uri attributes to solr document + * add uri attributes to solr document and assign the document id * @param doc * @param allAttr - * @param digestURL + * @param digestURL used to calc. the document.id and the doc.sku=(in index stored url) * @return the normalized url */ public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL) { @@ -305,13 +305,20 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } return us; } - + + /** + * Convert a URIMetadataNode, which has some private fields to a pure + * SolrInputDocument with all field values from the input. 
+ * This also assigns the document.id with the YaCy url.hash() + * @param md + * @return + */ public SolrInputDocument metadata2solr(final URIMetadataNode md) { SolrInputDocument doc = toSolrInputDocument(md); //urimetadatanode stores some values in private fields, add now to sorldocument boolean allAttr = this.isEmpty(); - addURIAttributes(doc, allAttr, md.url()); + addURIAttributes(doc, allAttr, md.url()); // assign doc.id, doc.sku and url attribute fields String title = md.dc_title(); if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, 1); @@ -873,18 +880,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } String content = document.getTextString(); - String tokens = digestURL.toTokens(); - if (content == null || content.length() == 0) { - content = tokens; - } else { - String[] t = CommonPattern.SPACE.split(tokens); - for (String r: t) { - if (r.length() > 0 && - content.indexOf(" " + r + " ") < 0 && - !content.startsWith(r + " ") && - !content.endsWith(" " + r)) content += " " + r; - } - } // handle image source meta data if (document.getContentDomain() == ContentDomain.IMAGE) { @@ -1298,9 +1293,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final AtomicInteger allcount) { final Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id final Set uniqueURLs = new ConcurrentHashSet(); // will be used in a concurrent environment - final Set omitFields = new HashSet(); - omitFields.add(CollectionSchema.process_sxt.getSolrFieldName()); - omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName()); + final Set localOmitFields = new HashSet(); + localOmitFields.add(CollectionSchema.process_sxt.getSolrFieldName()); + localOmitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName()); final Collection failids = new ConcurrentHashSet(); final AtomicInteger countcheck = new 
AtomicInteger(0); final AtomicInteger proccount = new AtomicInteger(); @@ -1383,7 +1378,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri try { DigestURL url = new DigestURL(u, ASCII.getBytes(i)); byte[] id = url.hash(); - SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields); + SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, localOmitFields); sid.setField(CollectionSchema.id.getSolrFieldName(), i); for (Object tag: proctags) try { @@ -1427,10 +1422,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (byPartialUpdate) { sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null); - } else { + } /*else { // fields are omitted on sid creation sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName()); - } + }*/ // with standard solr fields selected, the sid now contains the fields // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i // and the value for host_extent_i is by default 2147483647