fix for json importer

pull/405/head
Michael Peter Christen 4 years ago
parent e18d0ef544
commit a857e3d3d5

@ -25,9 +25,14 @@
package net.yacy.cora.date;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import java.util.regex.Pattern;
public abstract class AbstractFormatter implements DateFormatter {
@ -37,7 +42,7 @@ public abstract class AbstractFormatter implements DateFormatter {
static {
UTCCalendar.setTimeZone(UTCtimeZone);
}
// statics
public final static long secondMillis = 1000;
public final static long minuteMillis = 60 * secondMillis;
@ -49,11 +54,58 @@ public abstract class AbstractFormatter implements DateFormatter {
protected long last_time;
protected String last_format;
/**
 * Parses the given text into a {@link Calendar}. The actual parsing rules are
 * supplied by the concrete subclass.
 *
 * @param s the text to parse
 * @param timezoneOffset NOTE(review): presumably the offset of the text's time zone
 *        relative to UTC; the unit is not visible here - confirm against the implementations
 * @return the parsed point in time as a calendar
 * @throws ParseException declared so that implementations can reject text they cannot parse
 */
@Override
public abstract Calendar parse(String s, int timezoneOffset) throws ParseException;
/**
 * Renders the given date as text. The output layout is defined by the concrete subclass.
 *
 * @param date the date to render
 * @return the textual representation of {@code date}
 */
@Override
public abstract String format(final Date date);
/**
 * Renders a date as text without taking an explicit argument.
 * NOTE(review): presumably this formats the current time - confirm against the implementations.
 *
 * @return the textual representation produced by the concrete subclass
 */
@Override
public abstract String format();
private static final HashMap<Pattern, SimpleDateFormat> DATE_FORMAT_REGEXPS = new HashMap<Pattern, SimpleDateFormat>() {
private static final long serialVersionUID = 1321140786174228717L;
{
put(Pattern.compile("^\\d{8}$"), new SimpleDateFormat("yyyyMMdd", Locale.US));
put(Pattern.compile("^\\d{1,2}-\\d{1,2}-\\d{4}$"), new SimpleDateFormat("dd-MM-yyyy", Locale.US));
put(Pattern.compile("^\\d{4}-\\d{1,2}-\\d{1,2}$"), new SimpleDateFormat("yyyy-MM-dd", Locale.US));
put(Pattern.compile("^\\d{1,2}/\\d{1,2}/\\d{4}$"), new SimpleDateFormat("MM/dd/yyyy", Locale.US));
put(Pattern.compile("^\\d{4}/\\d{1,2}/\\d{1,2}$"), new SimpleDateFormat("yyyy/MM/dd", Locale.US));
put(Pattern.compile("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}$"), new SimpleDateFormat("dd MMM yyyy", Locale.US));
put(Pattern.compile("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}$"), new SimpleDateFormat("dd MMMM yyyy", Locale.US));
put(Pattern.compile("^\\d{12}$"), new SimpleDateFormat("yyyyMMddHHmm", Locale.US));
put(Pattern.compile("^\\d{8}\\s\\d{4}$"), new SimpleDateFormat("yyyyMMdd HHmm", Locale.US));
put(Pattern.compile("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("dd-MM-yyyy HH:mm", Locale.US));
put(Pattern.compile("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("yyyy-MM-dd HH:mm", Locale.US));
put(Pattern.compile("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("MM/dd/yyyy HH:mm", Locale.US));
put(Pattern.compile("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("yyyy/MM/dd HH:mm", Locale.US));
put(Pattern.compile("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("dd MMM yyyy HH:mm", Locale.US));
put(Pattern.compile("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("dd MMMM yyyy HH:mm", Locale.US));
put(Pattern.compile("^\\d{14}$"), new SimpleDateFormat("yyyyMMddHHmmss", Locale.US));
put(Pattern.compile("^\\d{8}\\s\\d{6}$"), new SimpleDateFormat("yyyyMMdd HHmmss", Locale.US));
put(Pattern.compile("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("dd-MM-yyyy HH:mm:ss", Locale.US));
put(Pattern.compile("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US));
put(Pattern.compile("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("MM/dd/yyyy HH:mm:ss", Locale.US));
put(Pattern.compile("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US));
put(Pattern.compile("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("dd MMM yyyy HH:mm:ss", Locale.US));
put(Pattern.compile("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("dd MMMM yyyy HH:mm:ss", Locale.US));
put(Pattern.compile("^[a-z]{3}\\s[a-z]{3}\\s\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}\\s[a-z]{4,}\\s\\d{4}$"), new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", Locale.US));
}};
/**
 * Parses a date string of unknown layout by delegating to the static
 * {@link #parseAny(String)} helper.
 *
 * @param s the text to parse
 * @return the result of {@link #parseAny(String)} for {@code s}
 */
@Override
public Date parse(String s) {
    final Date parsed = AbstractFormatter.parseAny(s);
    return parsed;
}
/**
 * Tries to parse a date string whose layout is not known in advance.
 * <p>
 * The lower-cased input is matched against every shape in
 * {@code DATE_FORMAT_REGEXPS}; for each shape that matches, the associated
 * {@link SimpleDateFormat} parses the original (not lower-cased) string.
 * No time zone is set on the formats in this class, so layouts without a zone
 * field are interpreted in the formats' default zone.
 *
 * @param s the text to parse; may be {@code null}
 * @return the parsed date, or {@code null} if {@code s} is {@code null}, matches
 *         no known shape, or matches a shape but cannot be parsed by its format
 */
public static Date parseAny(String s) {
    if (s == null) {
        return null;
    }
    // Lower-case once, and locale-independently: the shapes only know [a-z], and a
    // default-locale conversion (e.g. Turkish "I" -> dotless i) would break matching.
    final String lowered = s.toLowerCase(Locale.ROOT);
    for (final Map.Entry<Pattern, SimpleDateFormat> ps : DATE_FORMAT_REGEXPS.entrySet()) {
        if (!ps.getKey().matcher(lowered).matches()) {
            continue;
        }
        final SimpleDateFormat format = ps.getValue();
        try {
            // The format instances are shared statics and SimpleDateFormat is not
            // thread-safe, so guard each use.
            synchronized (format) {
                return format.parse(s);
            }
        } catch (final ParseException ignored) {
            // The shape matched but the content was not parseable (e.g. an unknown
            // month name); deliberately fall through and try the remaining shapes.
        }
    }
    return null; // Unknown format.
}
}

@ -30,6 +30,7 @@ import java.util.Date;
public interface DateFormatter {
public Date parse(String s);
public Calendar parse(String s, int timezoneOffset) throws ParseException;
public String format(final Date date);
public String format();

@ -97,6 +97,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SyntaxError;
import org.eclipse.jetty.http.DateParser;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@ -108,7 +109,9 @@ import com.google.common.io.Files;
import net.yacy.contentcontrol.ContentControlFilterUpdateThread;
import net.yacy.contentcontrol.SMWListSyncThread;
import net.yacy.cora.date.AbstractFormatter;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.encoding.ASCII;
@ -121,6 +124,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.federate.solr.connector.ShardSelection;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
@ -2199,6 +2203,7 @@ public final class Switchboard extends serverSwitch {
// see https://github.com/yacy/yacy_grid_parser/blob/master/README.md
FileInputStream fis = null;
BufferedReader br = null;
VocabularyScraper scraper = new VocabularyScraper();
try {
fis = new FileInputStream(infile);
InputStream is = new BufferedInputStream(fis);
@ -2256,8 +2261,11 @@ public final class Switchboard extends serverSwitch {
schema.add(surrogate, list);
}
} else {
// patch yacy grid altered schema (yacy grid does not have IDs any more, but they can be re-computed here)
if (key.equals("url_s")) {
CollectionSchema ctype = null;
try {ctype = CollectionSchema.valueOf(key);} catch (IllegalArgumentException e) {}
if (key.equals("url_s") || key.equals("sku")) {
ctype = CollectionSchema.sku;
// patch yacy grid altered schema (yacy grid does not have IDs any more, but they can be re-computed here)
DigestURL durl = new DigestURL(o.toString());
String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
@ -2267,11 +2275,29 @@ public final class Switchboard extends serverSwitch {
DigestURL durl = new DigestURL(o.toString());
String id = ASCII.String(durl.hash());
surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
} else if (ctype != null && ctype.getType() == SolrType.date) {
// patch date into something that Solr can understand
String d = o.toString(); // i.e. Wed Apr 01 02:00:00 CEST 2020
Date dd = d == null || d.length() == 0 ? null : AbstractFormatter.parseAny(d);
if (dd != null) surrogate.setField(ctype.getSolrFieldName(), ISO8601Formatter.FORMATTER.format(dd)); // solr dateTime is ISO8601 format
} else {
surrogate.setField(key, o.toString());
}
}
}
// enrich the surrogate
final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
final DigestURL rootURL = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
if (text != null && text.length() > 0 && id != null ) {
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(rootURL, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
}
Switchboard.this.index.putDocument(surrogate);
}
br.close();

Loading…
Cancel
Save