From b060ba900d6e3ea173202ab5d2a64f9cb3e01e10 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 Apr 2015 16:20:00 +0200 Subject: [PATCH] added parsing of itemprop attribute in html tags for itemprop='startDate' and itemprop='endDate'. The values of these fields are now written to new solr fields startDates_dts and endDates_dts. --- defaults/solr.collection.schema | 6 +++ .../net/yacy/cora/federate/solr/SolrType.java | 2 +- .../document/parser/html/ContentScraper.java | 40 +++++++++++++++++-- .../schema/CollectionConfiguration.java | 13 ++++-- .../yacy/search/schema/CollectionSchema.java | 3 ++ 5 files changed, 56 insertions(+), 8 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index a347f6107..d48750f6f 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -23,6 +23,12 @@ dates_in_content_dts ## the number of entries in dates_in_content_sxt dates_in_content_count_i + +## content of itemprop attributes with content='startDate' +startDates_dts + +## content of itemprop attributes with content='endDate' +endDates_dts ## mime-type of document, string (mandatory field) content_type diff --git a/source/net/yacy/cora/federate/solr/SolrType.java b/source/net/yacy/cora/federate/solr/SolrType.java index 95dad9e03..06b350dfc 100644 --- a/source/net/yacy/cora/federate/solr/SolrType.java +++ b/source/net/yacy/cora/federate/solr/SolrType.java @@ -26,7 +26,7 @@ public enum SolrType { text_general("t", "txt"), // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms. text_en_splitting_tight(null, null), // can insert dashes in the wrong place and still match location("p", null), // lat,lon - format: specialized field for geospatial search. 
- date("dt", null), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z' + date("dt", "dts"), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z' bool("b", "bs", "boolean"), num_integer("i", "val", "int"), num_long("l", "ls", "long"), diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 5fa26e440..244dad876 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -177,6 +177,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final Map hreflang, navigation; private LinkedHashSet titles; private final List articles; + private final List startDates, endDates; //private String headline; private List[] headlines; private final ClusteredScoreMap bold, italic, underline; @@ -234,6 +235,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { this.script = new SizeLimitedSet(maxLinks); this.titles = new LinkedHashSet(); this.articles = new ArrayList(); + this.startDates = new ArrayList<>(); + this.endDates = new ArrayList<>(); this.headlines = (List[]) Array.newInstance(ArrayList.class, 6); for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList(); this.bold = new ClusteredScoreMap(false); @@ -373,9 +376,34 @@ public class ContentScraper extends AbstractScraper implements Scraper { return null; } } + + private void checkOpts(Tag tag) { + // vocabulary classes + final String classprop = tag.opts.getProperty("class", EMPTY_STRING); + this.vocabularyScraper.check(this.root, classprop, tag.content); + + // itemprop + String itemprop = tag.opts.getProperty("itemprop"); + if (itemprop != null) { + String content = tag.opts.getProperty("content"); + if (content != null) { + if ("startDate".equals(itemprop)) try { + // parse ISO 8601 date + Date startDate = 
ISO8601Formatter.FORMATTER.parse(content); + this.startDates.add(startDate); + } catch (ParseException e) {} + if ("endDate".equals(itemprop)) try { + // parse ISO 8601 date + Date endDate = ISO8601Formatter.FORMATTER.parse(content); + this.endDates.add(endDate); + } catch (ParseException e) {} + } + } + } @Override public void scrapeTag0(Tag tag) { + checkOpts(tag); if (tag.name.equalsIgnoreCase("img")) { final String src = tag.opts.getProperty("src", EMPTY_STRING); try { @@ -514,9 +542,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { @Override public void scrapeTag1(Tag tag) { - final String classprop = tag.opts.getProperty("class", EMPTY_STRING); - //System.out.println("class = " + classprop); - this.vocabularyScraper.check(this.root, classprop, tag.content); + checkOpts(tag); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { String href = tag.opts.getProperty("href", EMPTY_STRING); @@ -747,6 +773,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { public String[] getDd() { return this.dd.toArray(new String[this.dd.size()]); } + + public List getStartDates() { + return this.startDates; + } + + public List getEndDates() { + return this.endDates; + } public DigestURL[] getFlash() { String ext; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index c3a8462a0..1d2b70d3f 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -650,11 +650,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final String[] dt = html.getDt(); add(doc, CollectionSchema.dtcount_i, dt.length); - if (dt.length > 0) add(doc, CollectionSchema.dt_txt, li); - - final String[] dd = 
html.getLi(); add(doc, CollectionSchema.ddcount_i, dd.length); - if (dd.length > 0) add(doc, CollectionSchema.dd_txt, li); + if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd); + + final List startDates = html.getStartDates(); + if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()])); + final List endDates = html.getEndDates(); + if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()])); final List articles = html.getArticles(); add(doc, CollectionSchema.articlecount_i, articles.size()); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index c9cf43a01..c23f5640a 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -37,6 +37,8 @@ public enum CollectionSchema implements SchemaDeclaration { last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"), dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"), dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"), + startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='startDate'"), + endDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with content='endDate'"), content_type(SolrType.string, true, true, true, false, false, "mime-type of document"), http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. 
If the same url which was http then appears as https (or vice versa) then the field is false"), www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"), @@ -272,6 +274,7 @@ public enum CollectionSchema implements SchemaDeclaration { assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name; assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name; assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name; + assert !ext.equals("dts") || (type == SolrType.date && multiValued) : name; assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name; assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name; assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;