YaCy can now read Searchlab export dump files

Import works through the surrogate input process:
- copy the Searchlab export file to DATA/SURROGATE/in
- the file is processed automatically and then moved to DATA/SURROGATE/OUT
pull/533/head
Michael Peter Christen 2 years ago
parent 32e6a5f903
commit 49daa32a88

@ -2165,7 +2165,7 @@ public final class Switchboard extends serverSwitch {
this.log.warn("IO Error processing warc file " + infile); this.log.warn("IO Error processing warc file " + infile);
} }
return moved; return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".flatjson")) { } else if (s.endsWith(".jsonlist") || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson")) {
return this.processSurrogateJson(infile, outfile); return this.processSurrogateJson(infile, outfile);
} }
InputStream is = null; InputStream is = null;
@ -2216,7 +2216,7 @@ public final class Switchboard extends serverSwitch {
final long starttime = System.currentTimeMillis(); final long starttime = System.currentTimeMillis();
boolean moved = false; boolean moved = false;
FileInputStream fis = null; InputStream fis = null;
BufferedReader br = null; BufferedReader br = null;
// start indexer threads which mostly care about tokenization and facet + synonym enrichment // start indexer threads which mostly care about tokenization and facet + synonym enrichment
@ -2255,16 +2255,16 @@ public final class Switchboard extends serverSwitch {
} }
try { try {
fis = new FileInputStream(infile); fis = infile.getName().endsWith(".gz") ? new GZIPInputStream(new FileInputStream(infile)) : new FileInputStream(infile);
final InputStream is = new BufferedInputStream(fis); final InputStream bis = new BufferedInputStream(fis);
br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); br = new BufferedReader(new InputStreamReader(bis, StandardCharsets.UTF_8));
String line; String line;
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
final JSONTokener jt = new JSONTokener(line); final JSONTokener jt = new JSONTokener(line);
final JSONObject json = new JSONObject(jt); final JSONObject json = new JSONObject(jt);
if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue; if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue;
final SolrInputDocument surrogate = new SolrInputDocument(); final SolrInputDocument surrogate = new SolrInputDocument();
for (final String key: json.keySet()) { jsonreader: for (final String key: json.keySet()) {
final Object o = json.opt(key); final Object o = json.opt(key);
if (o == null) continue; if (o == null) continue;
if (o instanceof JSONArray) { if (o instanceof JSONArray) {
@ -2282,7 +2282,9 @@ public final class Switchboard extends serverSwitch {
} }
CollectionSchema.inboundlinks_urlstub_sxt.add(surrogate, urlstub); CollectionSchema.inboundlinks_urlstub_sxt.add(surrogate, urlstub);
CollectionSchema.inboundlinks_protocol_sxt.add(surrogate, protocol); CollectionSchema.inboundlinks_protocol_sxt.add(surrogate, protocol);
} else if (key.equals("outboundlinks_sxt")) { continue jsonreader;
}
if (key.equals("outboundlinks_sxt")) {
// compute outboundlinks_urlstub_sxt and outboundlinks_protocol_sxt // compute outboundlinks_urlstub_sxt and outboundlinks_protocol_sxt
final List<Object> urlstub = new ArrayList<>(); final List<Object> urlstub = new ArrayList<>();
final List<Object> protocol = new ArrayList<>(); final List<Object> protocol = new ArrayList<>();
@ -2293,7 +2295,9 @@ public final class Switchboard extends serverSwitch {
} }
CollectionSchema.outboundlinks_urlstub_sxt.add(surrogate, urlstub); CollectionSchema.outboundlinks_urlstub_sxt.add(surrogate, urlstub);
CollectionSchema.outboundlinks_protocol_sxt.add(surrogate, protocol); CollectionSchema.outboundlinks_protocol_sxt.add(surrogate, protocol);
} else if (key.equals("images_sxt")) { continue jsonreader;
}
if (key.equals("images_sxt")) {
// compute images_urlstub_sxt and images_protocol_sxt // compute images_urlstub_sxt and images_protocol_sxt
final List<Object> urlstub = new ArrayList<>(); final List<Object> urlstub = new ArrayList<>();
final List<Object> protocol = new ArrayList<>(); final List<Object> protocol = new ArrayList<>();
@ -2304,35 +2308,52 @@ public final class Switchboard extends serverSwitch {
} }
CollectionSchema.images_urlstub_sxt.add(surrogate, urlstub); CollectionSchema.images_urlstub_sxt.add(surrogate, urlstub);
CollectionSchema.images_protocol_sxt.add(surrogate, protocol); CollectionSchema.images_protocol_sxt.add(surrogate, protocol);
} else { continue jsonreader;
}
// prepare to read key type
CollectionSchema ctype = null;
try {ctype = CollectionSchema.valueOf(key);} catch (final Exception e) {
this.log.warn("unknown key for CollectionSchema: " + key);
continue jsonreader;
}
final List<Object> list = new ArrayList<>(); final List<Object> list = new ArrayList<>();
for (int i = 0; i < a.length(); i++) list.add(a.get(i)); for (int i = 0; i < a.length(); i++) list.add(a.get(i));
final CollectionSchema schema = CollectionSchema.valueOf(key); ctype.add(surrogate, list);
schema.add(surrogate, list);
}
} else { } else {
CollectionSchema ctype = null; // first handle exceptional keys / maybe patch for other systems + other names
try {ctype = CollectionSchema.valueOf(key);} catch (final IllegalArgumentException e) {}
if (key.equals("url_s") || key.equals("sku")) { if (key.equals("url_s") || key.equals("sku")) {
ctype = CollectionSchema.sku;
// patch yacy grid altered schema (yacy grid does not have IDs any more, but they can be re-computed here) // patch yacy grid altered schema (yacy grid does not have IDs any more, but they can be re-computed here)
final DigestURL durl = new DigestURL(o.toString()); final DigestURL durl = new DigestURL(o.toString());
final String id = ASCII.String(durl.hash()); final String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true)); surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
surrogate.setField(CollectionSchema.id.getSolrFieldName(), id); surrogate.setField(CollectionSchema.id.getSolrFieldName(), id);
surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6)); surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6));
} else if (key.equals("referrer_url_s")) { continue jsonreader;
}
if (key.equals("referrer_url_s")) {
final DigestURL durl = new DigestURL(o.toString()); final DigestURL durl = new DigestURL(o.toString());
final String id = ASCII.String(durl.hash()); final String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id); surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
} else if (ctype != null && ctype.getType() == SolrType.date) { continue jsonreader;
}
// prepare to read key type
CollectionSchema ctype = null;
try {ctype = CollectionSchema.valueOf(key);} catch (final Exception e) {
this.log.warn("unknown key for CollectionSchema: " + key);
continue jsonreader;
}
if (ctype != null && ctype.getType() == SolrType.date) {
// patch date into something that Solr can understand // patch date into something that Solr can understand
final String d = o.toString(); // i.e. Wed Apr 01 02:00:00 CEST 2020 final String d = o.toString(); // i.e. Wed Apr 01 02:00:00 CEST 2020
final Date dd = d == null || d.length() == 0 ? null : AbstractFormatter.parseAny(d); final Date dd = d == null || d.length() == 0 ? null : AbstractFormatter.parseAny(d);
if (dd != null) surrogate.setField(ctype.getSolrFieldName(), ISO8601Formatter.FORMATTER.format(dd)); // solr dateTime is ISO8601 format if (dd != null) surrogate.setField(ctype.getSolrFieldName(), ISO8601Formatter.FORMATTER.format(dd)); // solr dateTime is ISO8601 format
} else { continue jsonreader;
surrogate.setField(key, o.toString());
} }
// regular situation, just read content of field
surrogate.setField(key, o.toString());
} }
} }
@ -2522,7 +2543,9 @@ public final class Switchboard extends serverSwitch {
|| surrogate.endsWith(".warc") || surrogate.endsWith(".warc")
|| surrogate.endsWith(".warc.gz") || surrogate.endsWith(".warc.gz")
|| surrogate.endsWith(".jsonlist") || surrogate.endsWith(".jsonlist")
|| surrogate.endsWith(".flatjson") ) { || surrogate.endsWith(".jsonlist.gz")
|| surrogate.endsWith(".flatjson")
|| surrogate.endsWith(".flatjson.gz") ) {
// read the surrogate file and store entry in index // read the surrogate file and store entry in index
if ( this.processSurrogate(surrogate) ) { if ( this.processSurrogate(surrogate) ) {
return true; return true;

Loading…
Cancel
Save