added schema.org breadcrumb counter to parser and solr schema

13 years ago · c3e5f667a7
parent a06930662c
commit c3e5f667a7
4 changed files with 18 additions and 2 deletions
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@ -371,6 +371,9 @@ host_organization_s
 #h5_i
 #h6_i

+## breadcrumbs, see http://schema.org/WebPage; this counts how many itemprop="breadcrumb" properties appear in div tags within a page
+#schema_org_breadcrumb_i
+
 ## names of cms attributes; if several are recognized then they are listed in decreasing order of the number of matching criteria
 #ext_cms_txt

--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@ -164,7 +164,9 @@ public enum YaCySchema implements Schema {
    h4_i(SolrType.integer, true, true, false, "number of h4 header lines"),
    h5_i(SolrType.integer, true, true, false, "number of h5 header lines"),
    h6_i(SolrType.integer, true, true, false, "number of h6 header lines"),
-
+    
+    schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"),    
+    
    // special values; can only be used if '_val' type is defined in schema file; this is not standard
    bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
    italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -138,6 +138,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    private double lon, lat;
    private MultiProtocolURI canonical;
    private final int maxLinks;
+    private int breadcrumbs;


    /**
@ -186,6 +187,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        this.lat = 0.0d;
        this.evaluationScores.match(Element.url, root.toNormalform(false, false));
        this.canonical = null;
+        this.breadcrumbs = 0;
    }

    @Override
@ -356,6 +358,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        } else if (tagname.equalsIgnoreCase("div")) {
            final String id = tagopts.getProperty("id", EMPTY_STRING);
            this.evaluationScores.match(Element.divid, id);
+            final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING);
+            if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
+                breadcrumbs++;
+            }
        } else if (tagname.equalsIgnoreCase("meta")) {
            String name = tagopts.getProperty("name", EMPTY_STRING);
            final String content = tagopts.getProperty("content", EMPTY_STRING);
@ -652,6 +658,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        return false;
    }

+    public int breadcrumbCount() { // getter for the counter incremented for each scraped div tag whose itemtype is http://data-vocabulary.org/Breadcrumb
+        return this.breadcrumbs;
+    }
+    
    public String getText() {
        try {
            return this.content.toString();
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@ -447,8 +447,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
            hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h4_txt, hs); add(doc, YaCySchema.h4_i, hs.length);
            hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h5_txt, hs); add(doc, YaCySchema.h5_i, hs.length);
            hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h6_txt, hs); add(doc, YaCySchema.h6_i, hs.length);
-
+       
            add(doc, YaCySchema.htags_i, h);
+            add(doc, YaCySchema.schema_org_breadcrumb_i, html.breadcrumbCount());

            // noindex and nofollow attributes
            // from HTML (meta-tag in HTML header: robots)