fine-tuned the import process of jsonl files

Imported documents had been missing fields that are required to actually
make searches and to browse the index with the host browser.
Michael Peter Christen, 6 months ago
commit b295e38969 (pull/651/head), parent de941c6fee
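For context: a jsonlist / flatjson surrogate file carries one Solr document per line, each encoded as a single JSON object. A minimal hypothetical record of the kind this importer consumes might look like the line below; the field names other than description and referrer_url_s are illustrative, and the assumption that a url key triggers the sku/id re-calculation branch is mine, based on the second hunk:

    {"url": "https://example.org/page.html", "title": "Example page", "description": "a short abstract", "referrer_url_s": "https://example.org/"}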

@@ -148,6 +148,11 @@ public class JsonListImporter extends Thread implements Importer {
            }
            if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
            final SolrInputDocument surrogate = new SolrInputDocument();
+           // set default values which act as constraints for a proper search
+           CollectionSchema.httpstatus_i.add(surrogate, 200);
            // get fields for json object
            jsonreader: for (final String key: json.keySet()) {
                final Object o = json.opt(key);
                if (o == null) continue;
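Two things in this hunk are worth spelling out. First, the pre-existing guard skips lines that are empty or contain only an "index" object; my reading (not stated in the commit) is that this keeps the importer tolerant of Elasticsearch-style bulk files, where every document line is preceded by an action line such as {"index":{"_id":"..."}}. Second, the added httpstatus_i default of 200 is the heart of the fix: search and the host browser treat the HTTP status as a constraint, so surrogate documents without the field could be imported but never found. The filter involved would be of roughly this kind, as a hypothetical Solr filter query: fq=httpstatus_i:200.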
@@ -212,10 +217,19 @@ public class JsonListImporter extends Thread implements Importer {
                    final String id = ASCII.String(durl.hash());
                    surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
                    surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
+                   surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
+                   surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
                    continue jsonreader;
                }
+               if (key.equals("description")) {
+                   // in YaCy descriptions are full-text indexed and also multi-value fields
+                   final List<Object> descriptions = new ArrayList<>();
+                   descriptions.add(o.toString());
+                   CollectionSchema.description_txt.add(surrogate, descriptions);
+                   continue jsonreader;
+               }
                if (key.equals("referrer_url_s")) {
                    // same patch as for urls which require re-calculation of id's; in this case we store the id only!
                    final DigestURL durl = new DigestURL(o.toString());
                    final String id = ASCII.String(durl.hash());
                    surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
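The id.substring(6) above leans on the layout of YaCy's URL hash: a 12-character base64 string whose trailing six characters encode the host, so the substring doubles as the host id. A minimal sketch of the derivation, reusing DigestURL and ASCII exactly as the hunk does (import paths assumed from the YaCy source tree):

    import java.net.MalformedURLException;
    import net.yacy.cora.document.encoding.ASCII; // assumed location
    import net.yacy.cora.document.id.DigestURL;   // assumed location

    // sketch: derive the Solr id and the host id the way the importer does
    static String hostIdOf(final String url) throws MalformedURLException {
        final DigestURL durl = new DigestURL(url);
        final String id = ASCII.String(durl.hash()); // 12-char base64 URL hash (assumed layout)
        return id.substring(6);                      // trailing 6 chars identify the host -> host_id_s
    }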
@@ -236,6 +250,12 @@ public class JsonListImporter extends Thread implements Importer {
                    continue jsonreader;
                }
+               // check if required fields are still missing and compute them
+               if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) {
+                   final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
+                   surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
+               }
                // regular situation, just read content of field
                surrogate.setField(key, o.toString());
            }

@@ -2325,7 +2325,10 @@ public final class Switchboard extends serverSwitch {
                || s.endsWith(".xml.zip")
                || s.endsWith(".warc")
                || s.endsWith(".warc.gz")
                || s.endsWith(".jsonl")
                || s.endsWith(".jsonl.gz")
                || s.endsWith(".jsonlist")
                || s.endsWith(".jsonlist.gz")
                || s.endsWith(".flatjson") ) {
                count++;
            }
@@ -3167,9 +3170,9 @@
        }
        // check mustmatch pattern
-       Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
+       final Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
        if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
-           String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
+           final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
            if (this.log.isInfo()) this.log.info(info);
            // create a new errorURL DB entry
            this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3177,9 +3180,9 @@
        }
        // check mustnotmatch
-       Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
+       final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
        if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
-           String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
+           final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
            if (this.log.isInfo()) this.log.info(info);
            // create a new errorURL DB entry
            this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3192,13 +3195,13 @@
        // check canonical
        if (profile.noindexWhenCanonicalUnequalURL()) {
-           AnchorURL canonical = document.getCanonical();
-           DigestURL source = document.dc_source();
+           final AnchorURL canonical = document.getCanonical();
+           final DigestURL source = document.dc_source();
            if (canonical != null && source != null) {
-               String canonical_norm = canonical.toNormalform(true);
-               String source_norm = source.toNormalform(true);
+               final String canonical_norm = canonical.toNormalform(true);
+               final String source_norm = source.toNormalform(true);
                if (!canonical_norm.equals(source_norm)) {
-                   String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonical_norm + "; source = " + source_norm;
+                   final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonical_norm + "; source = " + source_norm;
                    if (this.log.isInfo()) this.log.info(info);
                    // create a new errorURL DB entry
                    this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3216,9 +3219,9 @@
        }
        // check content pattern must-match
-       Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
+       final Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
        if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
-           String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern();
+           final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern();
            if (this.log.isInfo()) this.log.info(info);
            // create a new errorURL DB entry
            this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@@ -3226,9 +3229,9 @@
        }
        // check content pattern must-not-match
-       Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
+       final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
        if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
-           String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
+           final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
            if (this.log.isInfo()) this.log.info(info);
            // create a new errorURL DB entry
            this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
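The Switchboard hunks from line 3170 on are a pure tightening pass: local Pattern and String variables become final, with no behavioral change. Note that the guards compare patterns by identity (!= CrawlProfile.MATCH_ALL_PATTERN, != CrawlProfile.MATCH_NEVER_PATTERN) rather than by value; assuming those are shared singleton constants handed out by the profile, the identity check skips regex matching entirely whenever a profile imposes no constraint.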
