From a857e3d3d5515dfdc53be83bbdc20e1aa3100d01 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 29 Mar 2021 18:46:42 +0200
Subject: [PATCH] fix for json importer

---
Review revision; differences to the first version of this change:
 * AbstractFormatter.parseAny returns null for null or blank input, lower-cases
   the input once with Locale.ROOT and parses while holding a lock on the
   shared SimpleDateFormat instance (SimpleDateFormat is not thread-safe).
 * The expression for the Date.toString() notation accepts zone names with
   three letters (CET, UTC, GMT), not only four or more letters (CEST).
 * Switchboard builds the root URL for the tokenizer after the null checks on
   id and url instead of before them.
 * NOTE(review): org.eclipse.jetty.http.DateParser is imported below but not
   used by any line this patch adds; candidate for removal.
 * Line layout and generic type arguments were reconstructed from a
   whitespace-collapsed copy; the blob ids on the index lines belong to the
   first version of the change.

 .../net/yacy/cora/date/AbstractFormatter.java | 70 +++++++++++++++++++++-
 source/net/yacy/cora/date/DateFormatter.java  |  1 +
 source/net/yacy/search/Switchboard.java       | 32 +++++++++-
 3 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/source/net/yacy/cora/date/AbstractFormatter.java b/source/net/yacy/cora/date/AbstractFormatter.java
index 932fae059..e2c834720 100644
--- a/source/net/yacy/cora/date/AbstractFormatter.java
+++ b/source/net/yacy/cora/date/AbstractFormatter.java
@@ -25,9 +25,14 @@
 package net.yacy.cora.date;
 
 import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.util.Calendar;
 import java.util.Date;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
 import java.util.TimeZone;
+import java.util.regex.Pattern;
 
 public abstract class AbstractFormatter implements DateFormatter {
 
@@ -37,7 +42,7 @@ public abstract class AbstractFormatter implements DateFormatter {
     static {
         UTCCalendar.setTimeZone(UTCtimeZone);
     }
-    
+
     // statics
     public final static long secondMillis = 1000;
     public final static long minuteMillis = 60 * secondMillis;
@@ -49,11 +54,72 @@ public abstract class AbstractFormatter implements DateFormatter {
 
     protected long last_time;
     protected String last_format;
-    
+
     @Override
     public abstract Calendar parse(String s, int timezoneOffset) throws ParseException;
     @Override
     public abstract String format(final Date date);
     @Override
     public abstract String format();
+
+    // shape of a lower-cased date string -> format that parses it; every expression is anchored with ^ and $
+    private static final HashMap<Pattern, SimpleDateFormat> DATE_FORMAT_REGEXPS = new HashMap<Pattern, SimpleDateFormat>() {
+        private static final long serialVersionUID = 1321140786174228717L;
+        {
+            put(Pattern.compile("^\\d{8}$"), new SimpleDateFormat("yyyyMMdd", Locale.US));
+            put(Pattern.compile("^\\d{1,2}-\\d{1,2}-\\d{4}$"), new SimpleDateFormat("dd-MM-yyyy", Locale.US));
+            put(Pattern.compile("^\\d{4}-\\d{1,2}-\\d{1,2}$"), new SimpleDateFormat("yyyy-MM-dd", Locale.US));
+            put(Pattern.compile("^\\d{1,2}/\\d{1,2}/\\d{4}$"), new SimpleDateFormat("MM/dd/yyyy", Locale.US));
+            put(Pattern.compile("^\\d{4}/\\d{1,2}/\\d{1,2}$"), new SimpleDateFormat("yyyy/MM/dd", Locale.US));
+            put(Pattern.compile("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}$"), new SimpleDateFormat("dd MMM yyyy", Locale.US));
+            put(Pattern.compile("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}$"), new SimpleDateFormat("dd MMMM yyyy", Locale.US));
+            put(Pattern.compile("^\\d{12}$"), new SimpleDateFormat("yyyyMMddHHmm", Locale.US));
+            put(Pattern.compile("^\\d{8}\\s\\d{4}$"), new SimpleDateFormat("yyyyMMdd HHmm", Locale.US));
+            put(Pattern.compile("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("dd-MM-yyyy HH:mm", Locale.US));
+            put(Pattern.compile("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("yyyy-MM-dd HH:mm", Locale.US));
+            put(Pattern.compile("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("MM/dd/yyyy HH:mm", Locale.US));
+            put(Pattern.compile("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("yyyy/MM/dd HH:mm", Locale.US));
+            put(Pattern.compile("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("dd MMM yyyy HH:mm", Locale.US));
+            put(Pattern.compile("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}$"), new SimpleDateFormat("dd MMMM yyyy HH:mm", Locale.US));
+            put(Pattern.compile("^\\d{14}$"), new SimpleDateFormat("yyyyMMddHHmmss", Locale.US));
+            put(Pattern.compile("^\\d{8}\\s\\d{6}$"), new SimpleDateFormat("yyyyMMdd HHmmss", Locale.US));
+            put(Pattern.compile("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("dd-MM-yyyy HH:mm:ss", Locale.US));
+            put(Pattern.compile("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US));
+            put(Pattern.compile("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("MM/dd/yyyy HH:mm:ss", Locale.US));
+            put(Pattern.compile("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US));
+            put(Pattern.compile("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("dd MMM yyyy HH:mm:ss", Locale.US));
+            put(Pattern.compile("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$"), new SimpleDateFormat("dd MMMM yyyy HH:mm:ss", Locale.US));
+            put(Pattern.compile("^[a-z]{3}\\s[a-z]{3}\\s\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}\\s[a-z]{3,}\\s\\d{4}$"), new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", Locale.US));
+    }};
+
+    @Override
+    public Date parse(String s) {
+        return parseAny(s);
+    }
+
+    /**
+     * Parse a date given in one of the notations listed in DATE_FORMAT_REGEXPS.
+     * @param s the date string; may be null
+     * @return the parsed date, or null if s is null, blank or in an unknown notation
+     */
+    public static Date parseAny(String s) {
+        if (s == null) return null;
+        final String t = s.trim();
+        if (t.isEmpty()) return null;
+        final String lc = t.toLowerCase(Locale.ROOT); // the expressions are written in lower case
+        for (Map.Entry<Pattern, SimpleDateFormat> ps: DATE_FORMAT_REGEXPS.entrySet()) {
+            if (!ps.getKey().matcher(lc).matches()) continue;
+            final SimpleDateFormat format = ps.getValue();
+            try {
+                // SimpleDateFormat is not thread-safe and these instances are shared by all callers
+                synchronized (format) {
+                    return format.parse(t);
+                }
+            } catch (ParseException e) {
+                // the shape matched but the value did not parse: try the remaining notations
+            }
+        }
+        return null; // Unknown format.
+    }
+
 }
diff --git a/source/net/yacy/cora/date/DateFormatter.java b/source/net/yacy/cora/date/DateFormatter.java
index f929534d1..261e0fa97 100644
--- a/source/net/yacy/cora/date/DateFormatter.java
+++ b/source/net/yacy/cora/date/DateFormatter.java
@@ -30,6 +30,7 @@ import java.util.Date;
 
 public interface DateFormatter {
 
+    public Date parse(String s);
     public Calendar parse(String s, int timezoneOffset) throws ParseException;
     public String format(final Date date);
     public String format();
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 4038ebb63..37616845e 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -97,6 +97,7 @@ import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.search.SyntaxError;
+import org.eclipse.jetty.http.DateParser;
 import org.json.JSONArray;
 import org.json.JSONException;
 import org.json.JSONObject;
@@ -108,7 +109,9 @@ import com.google.common.io.Files;
 
 import net.yacy.contentcontrol.ContentControlFilterUpdateThread;
 import net.yacy.contentcontrol.SMWListSyncThread;
+import net.yacy.cora.date.AbstractFormatter;
 import net.yacy.cora.date.GenericFormatter;
+import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.WordCache;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.encoding.ASCII;
@@ -121,6 +124,7 @@ import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.solr.FailCategory;
 import net.yacy.cora.federate.solr.Ranking;
+import net.yacy.cora.federate.solr.SolrType;
 import net.yacy.cora.federate.solr.connector.ShardSelection;
 import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
 import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
@@ -2199,6 +2203,7 @@ public final class Switchboard extends serverSwitch {
         // see https://github.com/yacy/yacy_grid_parser/blob/master/README.md
         FileInputStream fis = null;
         BufferedReader br = null;
+        VocabularyScraper scraper = new VocabularyScraper();
         try {
             fis = new FileInputStream(infile);
             InputStream is = new BufferedInputStream(fis);
@@ -2256,8 +2261,11 @@ public final class Switchboard extends serverSwitch {
                                 schema.add(surrogate, list);
                             } }
                         } else {
-                            // patch yacy grid altered schema (yacy grid does not have IDs any more, but they can be re-computed here)
-                            if (key.equals("url_s")) {
+                            CollectionSchema ctype = null;
+                            try {ctype = CollectionSchema.valueOf(key);} catch (IllegalArgumentException e) {}
+                            if (key.equals("url_s") || key.equals("sku")) {
+                                ctype = CollectionSchema.sku;
+                                // patch yacy grid altered schema (yacy grid does not have IDs any more, but they can be re-computed here)
                                 DigestURL durl = new DigestURL(o.toString());
                                 String id = ASCII.String(durl.hash());
                                 surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true));
@@ -2267,11 +2275,31 @@ public final class Switchboard extends serverSwitch {
                                 DigestURL durl = new DigestURL(o.toString());
                                 String id = ASCII.String(durl.hash());
                                 surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id);
+                            } else if (ctype != null && ctype.getType() == SolrType.date) {
+                                // patch date into something that Solr can understand
+                                String d = o.toString(); // i.e. Wed Apr 01 02:00:00 CEST 2020
+                                Date dd = d == null || d.length() == 0 ? null : AbstractFormatter.parseAny(d);
+                                if (dd != null) surrogate.setField(ctype.getSolrFieldName(), ISO8601Formatter.FORMATTER.format(dd)); // solr dateTime is ISO8601 format
                             } else {
                                 surrogate.setField(key, o.toString());
                             }
                         }
                     }
+
+                    // enrich the surrogate
+                    final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
+                    final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
+                    final String sku = (String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName());
+                    if (text != null && text.length() > 0 && id != null && sku != null) {
+                        // build the root URL only after the null checks, so that null is never handed to DigestURL or ASCII.getBytes
+                        final DigestURL rootURL = new DigestURL(sku, ASCII.getBytes(id));
+                        // run the tokenizer on the text to get vocabularies and synonyms
+                        final Tokenizer tokenizer = new Tokenizer(rootURL, text, LibraryProvider.dymLib, true, scraper);
+                        final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
+                        // overwrite the given vocabularies and synonyms with new computed ones
+                        Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
+                    }
+
                     Switchboard.this.index.putDocument(surrogate);
                 }
                 br.close();