From c3e5f667a75b4865da16efc5c1d2060c89074195 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Tue, 9 Oct 2012 13:02:43 +0200
Subject: [PATCH] added schema.org breadcrumb counter to parser and solr schema

---
 defaults/solr.keys.list                                |  3 +++
 source/net/yacy/cora/federate/solr/YaCySchema.java     |  4 +++-
 .../net/yacy/document/parser/html/ContentScraper.java  | 10 ++++++++++
 source/net/yacy/search/index/SolrConfiguration.java    |  3 ++-
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index 81f0b322b..8cb4852ba 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -371,6 +371,9 @@ host_organization_s
 #h5_i
 #h6_i
 
+## breadcrumbs, see http://schema.org/WebPage; this counts how many itemprop="breadcrumb" properties in div tags appear within a page
+#schema_org_breadcrumb_i
+
 ## names of cms attributes; if several are recognized then they are listen in decreasing order of number of matching criterias
 #ext_cms_txt
 
diff --git a/source/net/yacy/cora/federate/solr/YaCySchema.java b/source/net/yacy/cora/federate/solr/YaCySchema.java
index 7a749464b..8efde1cd3 100644
--- a/source/net/yacy/cora/federate/solr/YaCySchema.java
+++ b/source/net/yacy/cora/federate/solr/YaCySchema.java
@@ -164,7 +164,9 @@ public enum YaCySchema implements Schema {
     h4_i(SolrType.integer, true, true, false, "number of h4 header lines"),
     h5_i(SolrType.integer, true, true, false, "number of h5 header lines"),
     h6_i(SolrType.integer, true, true, false, "number of h6 header lines"),
-
+    
+    schema_org_breadcrumb_i(SolrType.integer, true, true, false, "number of itemprop=\"breadcrumb\" appearances in div tags"),    
+    
     // special values; can only be used if '_val' type is defined in schema file; this is not standard
     bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
     italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 2990de966..62a18b8cb 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -138,6 +138,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private double lon, lat;
     private MultiProtocolURI canonical;
     private final int maxLinks;
+    private int breadcrumbs;
 
 
     /**
@@ -186,6 +187,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         this.lat = 0.0d;
         this.evaluationScores.match(Element.url, root.toNormalform(false, false));
         this.canonical = null;
+        this.breadcrumbs = 0;
     }
 
     @Override
@@ -356,6 +358,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         } else if (tagname.equalsIgnoreCase("div")) {
             final String id = tagopts.getProperty("id", EMPTY_STRING);
             this.evaluationScores.match(Element.divid, id);
+            final String itemtype = tagopts.getProperty("itemtype", EMPTY_STRING);
+            if (itemtype.equals("http://data-vocabulary.org/Breadcrumb")) {
+                breadcrumbs++;
+            }
         } else if (tagname.equalsIgnoreCase("meta")) {
             String name = tagopts.getProperty("name", EMPTY_STRING);
             final String content = tagopts.getProperty("content", EMPTY_STRING);
@@ -652,6 +658,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         return false;
     }
 
+    public int breadcrumbCount() { // returns the breadcrumb counter; it is incremented once per scraped div tag whose itemtype equals "http://data-vocabulary.org/Breadcrumb"
+        return this.breadcrumbs; // initialised to 0; NOTE(review): the schema field description speaks of itemprop="breadcrumb" while the scraper matches itemtype -- confirm which is intended
+    }
+    
     public String getText() {
         try {
             return this.content.toString();
diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java
index 7af31ea3a..eb06f3838 100644
--- a/source/net/yacy/search/index/SolrConfiguration.java
+++ b/source/net/yacy/search/index/SolrConfiguration.java
@@ -447,8 +447,9 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
             hs = html.getHeadlines(4); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h4_txt, hs); add(doc, YaCySchema.h4_i, hs.length);
             hs = html.getHeadlines(5); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h5_txt, hs); add(doc, YaCySchema.h5_i, hs.length);
             hs = html.getHeadlines(6); h = h | (hs.length > 0 ? f : 0); f = f * 2; add(doc, YaCySchema.h6_txt, hs); add(doc, YaCySchema.h6_i, hs.length);
-
+       
             add(doc, YaCySchema.htags_i, h);
+            add(doc, YaCySchema.schema_org_breadcrumb_i, html.breadcrumbCount());
 
             // noindex and nofollow attributes
             // from HTML (meta-tag in HTML header: robots)