diff --git a/source/net/yacy/document/importer/JsonListImporter.java b/source/net/yacy/document/importer/JsonListImporter.java index 06082d701..7392f39ae 100644 --- a/source/net/yacy/document/importer/JsonListImporter.java +++ b/source/net/yacy/document/importer/JsonListImporter.java @@ -148,6 +148,11 @@ public class JsonListImporter extends Thread implements Importer { } if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue; final SolrInputDocument surrogate = new SolrInputDocument(); + + // set default values which act as constraints for a proper search + CollectionSchema.httpstatus_i.add(surrogate, 200); + + // get fields for json object jsonreader: for (final String key: json.keySet()) { final Object o = json.opt(key); if (o == null) continue; @@ -212,10 +217,19 @@ public class JsonListImporter extends Thread implements Importer { final String id = ASCII.String(durl.hash()); surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true)); surrogate.setField(CollectionSchema.id.getSolrFieldName(), id); + surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost()); surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6)); continue jsonreader; } + if (key.equals("description")) { + // in YaCy descriptions are full-text indexed and also multi-value fields + final List descriptions = new ArrayList<>(); + descriptions.add(o.toString()); + CollectionSchema.description_txt.add(surrogate, descriptions); + continue jsonreader; + } if (key.equals("referrer_url_s")) { + // same patch as for urls which require re-calculation of id's; in this case we store the id only! final DigestURL durl = new DigestURL(o.toString()); final String id = ASCII.String(durl.hash()); surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id); @@ -236,6 +250,12 @@ public class JsonListImporter extends Thread implements Importer { continue jsonreader; } + // check if required fields are still missing and compute them + if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) { + final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost()); + } + // regular situation, just read content of field surrogate.setField(key, o.toString()); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 0d5e03c00..0c39b51cf 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2325,7 +2325,10 @@ public final class Switchboard extends serverSwitch { || s.endsWith(".xml.zip") || s.endsWith(".warc") || s.endsWith(".warc.gz") + || s.endsWith(".jsonl") + || s.endsWith(".jsonl.gz") || s.endsWith(".jsonlist") + || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson") ) { count++; } @@ -3167,9 +3170,9 @@ public final class Switchboard extends serverSwitch { } // check mustmatch pattern - Pattern mustmatchurl = profile.indexUrlMustMatchPattern(); + final Pattern mustmatchurl = profile.indexUrlMustMatchPattern(); if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern(); + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern(); if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3177,9 +3180,9 @@ public final class Switchboard extends serverSwitch { } // check mustnotmatch - Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern(); + final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern(); if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl; + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl; if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3192,13 +3195,13 @@ public final class Switchboard extends serverSwitch { // check canonical if (profile.noindexWhenCanonicalUnequalURL()) { - AnchorURL canonical = document.getCanonical(); - DigestURL source = document.dc_source(); + final AnchorURL canonical = document.getCanonical(); + final DigestURL source = document.dc_source(); if (canonical != null && source != null) { - String canonical_norm = canonical.toNormalform(true); - String source_norm = source.toNormalform(true); + final String canonical_norm = canonical.toNormalform(true); + final String source_norm = source.toNormalform(true); if (!canonical_norm.equals(source_norm)) { - String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm; + final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm; if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3216,9 +3219,9 @@ public final class Switchboard extends serverSwitch { } // check content pattern must-match - Pattern mustmatchcontent = profile.indexContentMustMatchPattern(); + final Pattern mustmatchcontent = profile.indexContentMustMatchPattern(); if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ; + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ; if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3226,9 +3229,9 @@ public final class Switchboard extends serverSwitch { } // check content pattern must-not-match - Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern(); + final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern(); if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern(); + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern(); if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);