added parsing of the itemprop attribute in html tags for

itemprop='startDate' and itemprop='endDate'. The value of the content
attribute of these tags is now written to the new solr fields
startDates_dts and endDates_dts.
pull/1/head
Michael Peter Christen 10 years ago
parent a08a3c5f29
commit b060ba900d

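For illustration, this is the kind of schema.org microdata markup the new parsing is aimed at; the markup and timestamps below are made-up examples, not taken from the commit, and java.time stands in for YaCy's ISO8601Formatter, which parses the same xmlschema-2 dateTime format.

    import java.time.Instant;

    public class ItempropDateExample {
        public static void main(String[] args) {
            // hypothetical schema.org Event markup; the scraper now reads the content
            // attribute of tags whose itemprop is 'startDate' or 'endDate'
            String html =
                    "<div itemscope itemtype=\"http://schema.org/Event\">\n" +
                    "  <meta itemprop=\"startDate\" content=\"2015-07-01T19:30:00Z\"/>\n" +
                    "  <meta itemprop=\"endDate\" content=\"2015-07-01T22:00:00Z\"/>\n" +
                    "</div>";

            // the content attribute carries a date in the xmlschema-2 dateTime
            // format with trailing 'Z', which java.time parses directly
            Instant start = Instant.parse("2015-07-01T19:30:00Z");
            System.out.println(html);
            System.out.println("parsed startDate: " + start);
        }
    }
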
@@ -23,6 +23,12 @@ dates_in_content_dts
## the number of entries in dates_in_content_sxt
dates_in_content_count_i
## content of itemprop attributes with itemprop='startDate'
startDates_dts
## content of itemprop attributes with itemprop='endDate'
endDates_dts
## mime-type of document, string (mandatory field)
content_type
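
Since startDates_dts and endDates_dts are multi-valued date fields, they can be used in ordinary Solr date-range queries. A minimal SolrJ sketch, assuming solr-solrj on the classpath; the 30-day window is an arbitrary example:

    import org.apache.solr.client.solrj.SolrQuery;

    public class StartDateQuerySketch {
        public static void main(String[] args) {
            // range query over the new multi-valued date field: documents whose
            // scraped startDate falls within the next 30 days (Solr date math)
            SolrQuery query = new SolrQuery();
            query.setQuery("startDates_dts:[NOW TO NOW+30DAYS]");
            query.setRows(10);
            // prints the URL-encoded request parameters
            System.out.println(query.toString());
        }
    }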

@@ -26,7 +26,7 @@ public enum SolrType {
text_general("t", "txt"), // tokenizes with StandardTokenizer, removes stop words from case-insensitive "stopwords.txt", down cases, applies synonyms.
text_en_splitting_tight(null, null), // can insert dashes in the wrong place and still match
location("p", null), // lat,lon - format: specialized field for geospatial search.
date("dt", null), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z'
date("dt", "dts"), // date format as in http://www.w3.org/TR/xmlschema-2/#dateTime with trailing 'Z'
bool("b", "bs", "boolean"),
num_integer("i", "val", "int"),
num_long("l", "ls", "long"),

@@ -177,6 +177,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final Map<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles;
private final List<String> articles;
private final List<Date> startDates, endDates;
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic, underline;
@@ -234,6 +235,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.script = new SizeLimitedSet<AnchorURL>(maxLinks);
this.titles = new LinkedHashSet<String>();
this.articles = new ArrayList<String>();
this.startDates = new ArrayList<>();
this.endDates = new ArrayList<>();
this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6);
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>(false);
@@ -373,9 +376,34 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return null;
}
}
private void checkOpts(Tag tag) {
// vocabulary classes
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
this.vocabularyScraper.check(this.root, classprop, tag.content);
// itemprop
String itemprop = tag.opts.getProperty("itemprop");
if (itemprop != null) {
String content = tag.opts.getProperty("content");
if (content != null) {
if ("startDate".equals(itemprop)) try {
// parse ISO 8601 date
Date startDate = ISO8601Formatter.FORMATTER.parse(content);
this.startDates.add(startDate);
} catch (ParseException e) { /* ignore values that are not valid ISO 8601 dates */ }
if ("endDate".equals(itemprop)) try {
// parse ISO 8601 date
Date endDate = ISO8601Formatter.FORMATTER.parse(content);
this.endDates.add(endDate);
} catch (ParseException e) { /* ignore values that are not valid ISO 8601 dates */ }
}
}
}
@Override
public void scrapeTag0(Tag tag) {
checkOpts(tag);
if (tag.name.equalsIgnoreCase("img")) {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
@@ -514,9 +542,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
@Override
public void scrapeTag1(Tag tag) {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
//System.out.println("class = " + classprop);
this.vocabularyScraper.check(this.root, classprop, tag.content);
checkOpts(tag);
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING);
@@ -747,6 +773,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String[] getDd() {
return this.dd.toArray(new String[this.dd.size()]);
}
public List<Date> getStartDates() {
return this.startDates;
}
public List<Date> getEndDates() {
return this.endDates;
}
public DigestURL[] getFlash() {
String ext;

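The extraction pattern of checkOpts() can be shown in isolation. The sketch below is not YaCy code: a java.util.Properties object stands in for tag.opts and java.time replaces ISO8601Formatter, but the itemprop/content logic is the same.

    import java.time.Instant;
    import java.time.format.DateTimeParseException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Properties;

    public class ItempropScrapeSketch {
        private final List<Instant> startDates = new ArrayList<>();
        private final List<Instant> endDates = new ArrayList<>();

        // opts plays the role of tag.opts: the attribute map of one parsed html tag
        void checkOpts(Properties opts) {
            String itemprop = opts.getProperty("itemprop");
            if (itemprop == null) return;
            String content = opts.getProperty("content");
            if (content == null) return;
            try {
                if ("startDate".equals(itemprop)) this.startDates.add(Instant.parse(content));
                if ("endDate".equals(itemprop)) this.endDates.add(Instant.parse(content));
            } catch (DateTimeParseException e) {
                // ignore values that are not valid ISO 8601 dates
            }
        }

        public static void main(String[] args) {
            ItempropScrapeSketch scraper = new ItempropScrapeSketch();
            Properties meta = new Properties();
            meta.setProperty("itemprop", "startDate");
            meta.setProperty("content", "2015-07-01T19:30:00Z");
            scraper.checkOpts(meta);
            System.out.println("startDates: " + scraper.startDates);
        }
    }
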
@@ -650,11 +650,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final String[] dt = html.getDt();
add(doc, CollectionSchema.dtcount_i, dt.length);
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, li);
final String[] dd = html.getLi();
if (dt.length > 0) add(doc, CollectionSchema.dt_txt, dt);
final String[] dd = html.getDd();
add(doc, CollectionSchema.ddcount_i, dd.length);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, li);
if (dd.length > 0) add(doc, CollectionSchema.dd_txt, dd);
final List<Date> startDates = html.getStartDates();
if (startDates.size() > 0) add(doc, CollectionSchema.startDates_dts, startDates.toArray(new Date[startDates.size()]));
final List<Date> endDates = html.getEndDates();
if (endDates.size() > 0) add(doc, CollectionSchema.endDates_dts, endDates.toArray(new Date[endDates.size()]));
final List<String> articles = html.getArticles();
add(doc, CollectionSchema.articlecount_i, articles.size());

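In Solr terms, passing a Date[] to add(doc, CollectionSchema.startDates_dts, ...) ends up as a multi-valued field on the input document. A rough SolrJ equivalent for orientation only; this is a simplification, not the implementation of the add() helper used above.

    import java.util.Date;
    import java.util.List;

    import org.apache.solr.common.SolrInputDocument;

    public class MultiValuedDateFieldSketch {
        // each Date becomes one value of the multi-valued startDates_dts field
        static void addStartDates(SolrInputDocument doc, List<Date> startDates) {
            for (Date d : startDates) {
                doc.addField("startDates_dts", d);
            }
        }

        public static void main(String[] args) {
            SolrInputDocument doc = new SolrInputDocument();
            addStartDates(doc, List.of(new Date()));
            System.out.println(doc);
        }
    }
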
@@ -37,6 +37,8 @@ public enum CollectionSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
dates_in_content_dts(SolrType.date, true, true, true, false, true, "if date expressions can be found in the content, these dates are listed here as date objects in order of the appearances"),
dates_in_content_count_i(SolrType.num_integer, true, true, false, false, false, "the number of entries in dates_in_content_sxt"),
startDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with itemprop='startDate'"),
endDates_dts(SolrType.date, true, true, true, false, true, "content of itemprop attributes with itemprop='endDate'"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
http_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url which was http then appears as https (or vice versa) then the field is false"),
www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),
@@ -272,6 +274,7 @@ public enum CollectionSchema implements SchemaDeclaration {
assert !ext.equals("s") || (type == SolrType.string && !multiValued) : name;
assert !ext.equals("sxt") || (type == SolrType.string && multiValued) : name;
assert !ext.equals("dt") || (type == SolrType.date && !multiValued) : name;
assert !ext.equals("dts") || (type == SolrType.date && multiValued) : name;
assert !ext.equals("t") || (type == SolrType.text_general && !multiValued) : name;
assert !ext.equals("coordinate") || (type == SolrType.coordinate && !multiValued) : name;
assert !ext.equals("txt") || (type == SolrType.text_general && multiValued) : name;
