Fine-tuned the import process of jsonl files, which had been missing the fields required

to actually run searches and browse the index with the host
browser.
pull/651/head
Michael Peter Christen 9 months ago
parent de941c6fee
commit b295e38969

@ -148,6 +148,11 @@ public class JsonListImporter extends Thread implements Importer {
} }
if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue; if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
final SolrInputDocument surrogate = new SolrInputDocument(); final SolrInputDocument surrogate = new SolrInputDocument();
// set default values which act as constraints for a proper search
CollectionSchema.httpstatus_i.add(surrogate, 200);
// get fields for json object
jsonreader: for (final String key: json.keySet()) { jsonreader: for (final String key: json.keySet()) {
final Object o = json.opt(key); final Object o = json.opt(key);
if (o == null) continue; if (o == null) continue;
@ -212,10 +217,19 @@ public class JsonListImporter extends Thread implements Importer {
final String id = ASCII.String(durl.hash()); final String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true)); surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
surrogate.setField(CollectionSchema.id.getSolrFieldName(), id); surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6)); surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
continue jsonreader; continue jsonreader;
} }
if (key.equals("description")) {
// in YaCy, descriptions are full-text indexed and stored as a multi-valued field
final List<Object> descriptions = new ArrayList<>();
descriptions.add(o.toString());
CollectionSchema.description_txt.add(surrogate, descriptions);
continue jsonreader;
}
if (key.equals("referrer_url_s")) { if (key.equals("referrer_url_s")) {
// same patch as for urls, which require re-calculation of their ids; in this case we store only the id!
final DigestURL durl = new DigestURL(o.toString()); final DigestURL durl = new DigestURL(o.toString());
final String id = ASCII.String(durl.hash()); final String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id); surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
@ -236,6 +250,12 @@ public class JsonListImporter extends Thread implements Importer {
continue jsonreader; continue jsonreader;
} }
// check if required fields are still missing and compute them
if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) {
final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost());
}
// regular situation, just read content of field // regular situation, just read content of field
surrogate.setField(key, o.toString()); surrogate.setField(key, o.toString());
} }

@ -2325,7 +2325,10 @@ public final class Switchboard extends serverSwitch {
|| s.endsWith(".xml.zip") || s.endsWith(".xml.zip")
|| s.endsWith(".warc") || s.endsWith(".warc")
|| s.endsWith(".warc.gz") || s.endsWith(".warc.gz")
|| s.endsWith(".jsonl")
|| s.endsWith(".jsonl.gz")
|| s.endsWith(".jsonlist") || s.endsWith(".jsonlist")
|| s.endsWith(".jsonlist.gz")
|| s.endsWith(".flatjson") ) { || s.endsWith(".flatjson") ) {
count++; count++;
} }
@ -3167,9 +3170,9 @@ public final class Switchboard extends serverSwitch {
} }
// check mustmatch pattern // check mustmatch pattern
Pattern mustmatchurl = profile.indexUrlMustMatchPattern(); final Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) { if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern(); final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
if (this.log.isInfo()) this.log.info(info); if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@ -3177,9 +3180,9 @@ public final class Switchboard extends serverSwitch {
} }
// check mustnotmatch // check mustnotmatch
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern(); final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) { if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl; final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
if (this.log.isInfo()) this.log.info(info); if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@ -3192,13 +3195,13 @@ public final class Switchboard extends serverSwitch {
// check canonical // check canonical
if (profile.noindexWhenCanonicalUnequalURL()) { if (profile.noindexWhenCanonicalUnequalURL()) {
AnchorURL canonical = document.getCanonical(); final AnchorURL canonical = document.getCanonical();
DigestURL source = document.dc_source(); final DigestURL source = document.dc_source();
if (canonical != null && source != null) { if (canonical != null && source != null) {
String canonical_norm = canonical.toNormalform(true); final String canonical_norm = canonical.toNormalform(true);
String source_norm = source.toNormalform(true); final String source_norm = source.toNormalform(true);
if (!canonical_norm.equals(source_norm)) { if (!canonical_norm.equals(source_norm)) {
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm; final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
if (this.log.isInfo()) this.log.info(info); if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@ -3216,9 +3219,9 @@ public final class Switchboard extends serverSwitch {
} }
// check content pattern must-match // check content pattern must-match
Pattern mustmatchcontent = profile.indexContentMustMatchPattern(); final Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) { if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ; final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
if (this.log.isInfo()) this.log.info(info); if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
@ -3226,9 +3229,9 @@ public final class Switchboard extends serverSwitch {
} }
// check content pattern must-not-match // check content pattern must-not-match
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern(); final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) { if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern(); final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
if (this.log.isInfo()) this.log.info(info); if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);

Loading…
Cancel
Save