From 5e8879beb7865ca31377e0ec87be0450064867de Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Thu, 16 Feb 2017 01:43:14 +0100
Subject: [PATCH] Reduce self generated content for text_t (visible text index
 field) to avoid repeat of tokenized url as description, continuation of
 https://github.com/yacy/yacy_search_server/commit/7e09bff4a1a117d2f2336e004ec67ffb325a7e9d
 https://github.com/yacy/yacy_search_server/commit/1409cabe8b7bce1fb767f01665d9d7e0a91a81b6
 Add some javadoc, and not needed remove of omitted fields in postprocessing.

---
 .../schema/CollectionConfiguration.java       | 39 ++++++++-----------
 1 file changed, 17 insertions(+), 22 deletions(-)
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 7bc87e09e..8dd70ae28 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -247,10 +247,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
     }
     
     /**
-     * add uri attributes to solr document
+     * add uri attributes to solr document and assign the document id
      * @param doc
      * @param allAttr
-     * @param digestURL
+     * @param digestURL used to calc. the document.id and the doc.sku=(in index stored url)
      * @return the normalized url
      */
     public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURL digestURL) {
@@ -305,13 +305,20 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         }
         return us;
     }
-    
+
+    /**
+     * Convert a URIMetadataNode, which has some private fields to a pure
+     * SolrInputDocument with all field values from the input.
+     * This also assigns the document.id with the YaCy url.hash()
+     * @param md
+     * @return
+     */
     public SolrInputDocument metadata2solr(final URIMetadataNode md) {
 
         SolrInputDocument doc = toSolrInputDocument(md); //urimetadatanode stores some values in private fields, add now to sorldocument
 
         boolean allAttr = this.isEmpty();
-        addURIAttributes(doc, allAttr, md.url());
+        addURIAttributes(doc, allAttr, md.url()); // assign doc.id, doc.sku and url attribute fields
 
         String title = md.dc_title();
         if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, 1);
@@ -873,18 +880,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         }
         
         String content = document.getTextString();
-        String tokens = digestURL.toTokens();
-        if (content == null || content.length() == 0) {
-            content = tokens;
-        } else {
-            String[] t = CommonPattern.SPACE.split(tokens);
-            for (String r: t) {
-                if (r.length() > 0 &&
-                    content.indexOf(" " + r + " ") < 0 &&
-                    !content.startsWith(r + " ") &&
-                    !content.endsWith(" " + r)) content += " " + r;
-            }
-        }
 
         // handle image source meta data
         if (document.getContentDomain() == ContentDomain.IMAGE) {
@@ -1298,9 +1293,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
 			final AtomicInteger allcount) {
 		final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
         final Set<String> uniqueURLs = new ConcurrentHashSet<String>(); // will be used in a concurrent environment
-        final Set<String> omitFields = new HashSet<String>();
-        omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
-        omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
+        final Set<String> localOmitFields = new HashSet<String>();
+        localOmitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
+        localOmitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
         final Collection<String> failids = new ConcurrentHashSet<String>();
         final AtomicInteger countcheck = new AtomicInteger(0);
         final AtomicInteger proccount = new AtomicInteger();
@@ -1383,7 +1378,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                                     try {
                                         DigestURL url = new DigestURL(u, ASCII.getBytes(i));
                                         byte[] id = url.hash();
-                                        SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields);
+                                        SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, localOmitFields);
                                         sid.setField(CollectionSchema.id.getSolrFieldName(), i);
                                         for (Object tag: proctags) try {
                                             
@@ -1427,10 +1422,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                                         if (byPartialUpdate) {
                                             sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update
                                             sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null);
-                                        } else {
+                                        } /*else { // fields are omitted on sid creation
                                             sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
                                             sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
-                                        }
+                                        }*/
                                         // with standard solr fields selected, the sid now contains the fields
                                         // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i
                                         // and the value for host_extent_i is by default 2147483647